Skip to content

Commit

Permalink
feat: Fixed random audio errors with soundcard and pydub
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolasperez19 committed Nov 11, 2024
1 parent a6d5606 commit b1c2525
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 49 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Iterator, Optional, Tuple
from passive_sound_localization.models.configs.localization import LocalizationConfig
# from models.configs.localization import LocalizationConfig # Only needed to run with `realtime_audio.py`
# from passive_sound_localization.models.configs.localization import LocalizationConfig
from models.configs.localization import LocalizationConfig # Only needed to run with `realtime_audio.py`
from dataclasses import dataclass
import numpy as np
import logging
Expand Down
Original file line number Diff line number Diff line change
@@ -1,30 +1,30 @@
from passive_sound_localization.models.configs.localization import (
LocalizationConfig,
)
from passive_sound_localization.models.configs.logging import (
LoggingConfig,
)
from passive_sound_localization.models.configs.feature_flags import (
FeatureFlagsConfig,
)

from passive_sound_localization.models.configs.openai_websocket import (
OpenAIWebsocketConfig,
)

from passive_sound_localization.models.configs.realtime_streamer import RealtimeAudioStreamerConfig

# from models.configs.localization import (
# from passive_sound_localization.models.configs.localization import (
# LocalizationConfig,
# ) # Need import paths like this to test audio streaming with `realtime_audio.py`
# from models.configs.logging import (
# )
# from passive_sound_localization.models.configs.logging import (
# LoggingConfig,
# ) # Need import paths like this to test audio streaming with `realtime_audio.py`
# from models.configs.feature_flags import (
# )
# from passive_sound_localization.models.configs.feature_flags import (
# FeatureFlagsConfig,
# ) # Need import paths like this to test audio streaming with `realtime_audio.py`
# from models.configs.realtime_streamer import RealtimeAudioStreamerConfig # Need import paths like this to test audio streaming with `realtime_audio.py`
# from models.configs.openai_websocket import OpenAIWebsocketConfig # Need import paths like this to test audio streaming with `realtime_audio.py`
# )

# from passive_sound_localization.models.configs.openai_websocket import (
# OpenAIWebsocketConfig,
# )

# from passive_sound_localization.models.configs.realtime_streamer import RealtimeAudioStreamerConfig

from models.configs.localization import (
LocalizationConfig,
) # Need import paths like this to test audio streaming with `realtime_audio.py`
from models.configs.logging import (
LoggingConfig,
) # Need import paths like this to test audio streaming with `realtime_audio.py`
from models.configs.feature_flags import (
FeatureFlagsConfig,
) # Need import paths like this to test audio streaming with `realtime_audio.py`
from models.configs.realtime_streamer import RealtimeAudioStreamerConfig # Need import paths like this to test audio streaming with `realtime_audio.py`
from models.configs.openai_websocket import OpenAIWebsocketConfig # Need import paths like this to test audio streaming with `realtime_audio.py`


from dataclasses import dataclass, field
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
@dataclass(frozen=True)
class LocalizationConfig:
speed_of_sound: float = 343.0 # Speed of sound in m/s
sample_rate: int = 24000 # Sample rate of the audio in Hz
sample_rate: int = 44100 # Sample rate of the audio in Hz
fft_size: int = 1024 # Size of FFT to use

mic_positions: List[List[float]] = field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@

@dataclass(frozen=True)
class RealtimeAudioStreamerConfig:
    """Configuration for the realtime multi-microphone audio streamer."""

    # Capture sample rate in Hz (matches LocalizationConfig.sample_rate).
    sample_rate: int = 44100
    # Channels captured per device (mono).
    channels: int = 1
    # Frames read from each microphone per chunk.
    chunk: int = 1024
    # Indices into soundcard's microphone list; hardware/host specific —
    # keep the alternates below as documented configurations.
    # device_indices: List[int] = field(default_factory=lambda: [2, 3, 4, 5]) # Lab configuration
    device_indices: List[int] = field(default_factory=lambda: [1, 2, 3, 4]) # Nico's laptop (configuration with soundcard)
    # device_indices: List[int] = field(default_factory=lambda: [4, 6, 8, 10]) # Nico's laptop (configuration 1)
    # device_indices: List[int] = field(default_factory=lambda: [2, 4, 6, 8]) # Nico's laptop (configuration 2)
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
def send_audio_continuously(client, single_channel_generator):
    """Forward every audio chunk from the generator to the websocket client.

    Chunks that arrive as None (dropped or unreadable reads) are skipped
    so the client never receives an invalid payload.
    """
    print("Threading...")
    for chunk in single_channel_generator:
        if chunk is not None:
            client.send_audio(chunk)


def receive_text_messages(client, logger):
logger.info("OpanAI: Listening to audio stream")
logger.info("OpenAI: Listening to audio stream")
while True:
try:
command = client.receive_text_response()
Expand All @@ -44,13 +46,16 @@ def receive_text_messages(client, logger):
# commands.append(command)
except Exception as e:
print(f"Error receiving response: {e}")
break # Exit loop if server disconnects

def realtime_localization(multi_channel_stream, localizer, logger):
logger.info("Localization: Listening to audio stream")
try:
did_get = True
for audio_data in multi_channel_stream:
# Skip calculating if there's any issues with the audio data
if audio_data is None:
continue

# Stream audio data and pass it to the localizer
localization_stream = localizer.localize_stream(
[audio_data[k] for k in audio_data.keys()]
Expand Down Expand Up @@ -92,11 +97,8 @@ def command_executor(publisher, logger):

def publish_results(localization_results):
    """Placeholder publisher: dump localization results to stdout.

    NOTE(review): presumably replaced by a ROS publisher in production —
    confirm against the caller wiring.
    """
    print(localization_results)



def main():
print("Hello world")
audio_streamer_config = RealtimeAudioStreamerConfig()
localizer_config = LocalizationConfig()
websocket_config = OpenAIWebsocketConfig()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import soundcard as sc
import numpy as np
import threading
from pydub import AudioSegment
from io import BytesIO


# from passive_sound_localization.models.configs.realtime_streamer import (
# RealtimeAudioStreamerConfig,
Expand All @@ -27,7 +30,7 @@ def __init__(self, config: RealtimeAudioStreamerConfig):
print(self.device_indices)

def __enter__(self):
microphones: List[sc._Microphone] = sc.all_microphones()
microphones = sc.all_microphones()
self.streams = {
device_index: np.zeros((self.chunk, self.channels), dtype=np.float32)
for device_index in self.device_indices
Expand All @@ -42,7 +45,7 @@ def __enter__(self):

return self

def record_audio(self, microphones: List[sc._Microphone]):
def record_audio(self, microphones):
while self.streaming:
for device_index in self.device_indices:
self.streams[device_index] = microphones[device_index].record(
Expand All @@ -58,18 +61,22 @@ def __exit__(self, *args):
def get_stream(self, device_index: int) -> Optional[bytes]:
    """Retrieve the audio stream for a specific device index.

    NaN samples (e.g. from a glitched soundcard read) are replaced with
    zeros via np.nan_to_num so downstream consumers never receive
    invalid float bytes. Returns None for an unknown device index.
    """
    if device_index in self.device_indices:
        # nan_to_num is the intended behavior; a stale pre-fix return
        # statement previously shadowed it.
        return np.nan_to_num(self.streams[device_index]).tobytes()
    else:
        print(f"Device index {device_index} not found.")
        return None

def multi_channel_gen(self) -> Generator[Dict[int, bytes], None, None]:
def multi_channel_gen(self) -> Generator[Optional[Dict[int, bytes]], None, None]:
try:
while self.streaming:
audio_arrays = []
for device_index in self.device_indices:
audio_arrays.append(self.get_stream(device_index))

# Return none if any audio is None or empty bytes
if any(audio == b"" or audio is None for audio in audio_arrays):
yield None

yield {
device_index: audio
for device_index, audio in zip(self.device_indices, audio_arrays)
Expand All @@ -80,10 +87,31 @@ def multi_channel_gen(self) -> Generator[Dict[int, bytes], None, None]:

def merge_streams(self, streams: List[np.ndarray]) -> np.ndarray:
    """Average the per-device streams element-wise into a single stream."""
    combined = np.sum(streams, axis=0)
    return combined / len(streams)

def resample_stream(self, stream: bytes, target_sample_rate: int = 24000, sample_width: int = 2) -> bytes:
    """Resample a raw audio chunk to PCM at target_sample_rate via pydub.

    Args:
        stream: encoded audio bytes readable by pydub/ffmpeg.
        target_sample_rate: output frame rate in Hz (default 24 kHz,
            the rate the OpenAI realtime websocket expects).
        sample_width: output bytes per sample (2 -> pcm16).

    Returns:
        The resampled raw PCM bytes, or b"" on any failure — callers
        treat empty bytes as a dropped chunk.
    """
    # NOTE(review): a stray diff-residue line (an old single_channel_gen
    # signature) previously corrupted this body; removed.
    try:
        audio = AudioSegment.from_file(BytesIO(stream))
        # Resample to 24kHz mono pcm16
        return (
            audio.set_frame_rate(target_sample_rate)
            .set_channels(self.channels)
            .set_sample_width(sample_width)
            .raw_data
        )
    except Exception as e:
        print(f"Error in resample_stream: {e}")
        return b""


def single_channel_gen(self) -> Generator[Optional[bytes], None, None]:
    """Yield resampled mono audio chunks from the first configured device.

    Yields exactly one None per unreadable or unresamplable chunk so the
    consumer can skip it; previously a missing `continue` after the first
    `yield None` let the loop fall through and yield a second None for
    the same chunk.
    """
    try:
        while self.streaming:
            stream = self.get_stream(self.device_indices[0])
            # Skip chunks the streamer could not read.
            if stream is None or stream == b"":
                yield None
                continue

            resampled_stream = self.resample_stream(stream)
            # resample_stream signals failure with empty bytes.
            yield resampled_stream if resampled_stream != b"" else None
    except Exception as e:
        print(f"Error in single_channel_gen: {e}")
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import logging
from typing import Optional

from passive_sound_localization.models.configs.openai_websocket import OpenAIWebsocketConfig
# from models.configs.openai_websocket import OpenAIWebsocketConfig # Only needed to run with `realtime_audio.py`
# from passive_sound_localization.models.configs.openai_websocket import OpenAIWebsocketConfig
from models.configs.openai_websocket import OpenAIWebsocketConfig # Only needed to run with `realtime_audio.py`

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -111,24 +111,24 @@ def send_audio(self, audio_chunk: bytes) -> None:
"audio": audio_b64
}))

def receive_text_response(self, timeout:float=0.3) -> str:
def receive_text_response(self, timeout:float=5.0) -> str:
try:
# Tries to receive the next message (in a blocking manner) from the OpenAI websocket
# If the message doesn't arrive in 300ms, then it raises a TimeoutError
message = json.loads(self.ws.recv(timeout=timeout))
except TimeoutError:
return OpenAITimeoutError(timeout=timeout)
raise OpenAITimeoutError(timeout=timeout)

# Print message just to see what we're receiving
print(message)
# print(message)

# Checks to see any general errors
if message["type"] == "error":
return OpenAIWebsocketError(error_code=message["error"]["code"], error_message=["error"]["message"])
raise OpenAIWebsocketError(error_code=message["error"]["code"], error_message=["error"]["message"])

# Checks to see whether OpenAI is specifically rate limiting our responses
if message["type"] == "response.done" and message["response"]["status_details"]["error"]["code"] == "rate_limit_exceeded":
return OpenAIRateLimitError()
raise OpenAIRateLimitError()

# Checks to see if an actual text response was sent, and returns the text
if message["type"] == "response.text.done":
Expand Down
Loading

0 comments on commit b1c2525

Please sign in to comment.