feat: Option to compress audio files by ~90%

Add option to reduce bandwidth (and potentially latency) by converting voice recordings (wav) into a compressed audio format (webm or mp3). Default behaviour is unchanged. > File uploads are currently limited to 25 MB and the following input file > types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm. > > - https://platform.openai.com/docs/guides/speech-to-text
2025-06-04 11:45:00 +00:00 · 2024-09-22 18:19:22 +10:00 · 2024-09-22 18:19:22 +10:00 · 1cc30a22f9
commit 1cc30a22f9
parent f3ad683d70
6 changed files with 32 additions and 4 deletions
--- a/aider/voice.py
+++ b/aider/voice.py
@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
    sf = None

 from prompt_toolkit.shortcuts import prompt
+from pydub import AudioSegment

 from .dump import dump  # noqa: F401

@ -27,7 +28,7 @@ class Voice:

    threshold = 0.15

-    def __init__(self):
+    def __init__(self, audio_format="wav"):
        if sf is None:
            raise SoundDeviceError
        try:
@ -37,6 +38,9 @@ class Voice:
            self.sd = sd
        except (OSError, ModuleNotFoundError):
            raise SoundDeviceError
+        if audio_format not in ["wav", "mp3", "webm"]:
+            raise ValueError(f"Unsupported audio format: {audio_format}")
+        self.audio_format = audio_format

    def callback(self, indata, frames, time, status):
        """This is called (from a separate thread) for each audio block."""
@ -80,7 +84,7 @@ class Voice:
    def raw_record_and_transcribe(self, history, language):
        self.q = queue.Queue()

-        filename = tempfile.mktemp(suffix=".wav")
+        temp_wav = tempfile.mktemp(suffix=".wav")

        try:
            sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
@ -99,10 +103,18 @@ class Voice:
        except self.sd.PortAudioError as err:
            raise SoundDeviceError(f"Error accessing audio input device: {err}")

-        with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
+        with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
            while not self.q.empty():
                file.write(self.q.get())

+        if self.audio_format != "wav":
+            filename = tempfile.mktemp(suffix=f".{self.audio_format}")
+            audio = AudioSegment.from_wav(temp_wav)
+            audio.export(filename, format=self.audio_format)
+            os.remove(temp_wav)
+        else:
+            filename = temp_wav
+
        with open(filename, "rb") as fh:
            try:
                transcript = litellm.transcription(
@ -112,6 +124,9 @@ class Voice:
                print(f"Unable to transcribe {filename}: {err}")
                return

+        if self.audio_format != "wav":
+            os.remove(filename)
+
        text = transcript.text
        return text