feat: Option to compress audio files by ~90%

Add option to reduce bandwidth (and potentially latency) by converting
voice recordings (wav) into a compressed audio format (webm or mp3).

Default behaviour is unchanged.

> File uploads are currently limited to 25 MB and the following input file
> types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
Author: Mike Bailey
Date: 2024-09-22 18:19:22 +10:00
Parent: f3ad683d70
Commit: 1cc30a22f9
6 changed files with 32 additions and 4 deletions
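
Under the hood the patch uses pydub, which delegates mp3/webm encoding to ffmpeg. A minimal sketch of the conversion step being added (the file names here are illustrative, not from the patch):

    import os

    from pydub import AudioSegment

    # Re-encode the raw wav capture as webm (or "mp3"); pydub shells out
    # to ffmpeg, so ffmpeg must be installed for these formats.
    audio = AudioSegment.from_wav("recording.wav")
    audio.export("recording.webm", format="webm")
    os.remove("recording.wav")  # the intermediate wav is no longer needed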

@@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root):
         help="Use VI editing mode in the terminal (default: False)",
         default=False,
     )
+    group.add_argument(
+        "--voice-format",
+        metavar="VOICE_FORMAT",
+        default="wav",
+        choices=["wav", "mp3", "webm"],
+        help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg",
+    )
     group.add_argument(
         "--voice-language",
         metavar="VOICE_LANGUAGE",

@@ -997,7 +997,7 @@ class Commands:
                 self.io.tool_error("To use /voice you must provide an OpenAI API key.")
                 return
             try:
-                self.voice = voice.Voice()
+                self.voice = voice.Voice(audio_format=self.args.voice_format)
             except voice.SoundDeviceError:
                 self.io.tool_error(
                     "Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"

@@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
     sf = None
 from prompt_toolkit.shortcuts import prompt
+from pydub import AudioSegment
 from .dump import dump  # noqa: F401
@@ -27,7 +28,7 @@ class Voice:
     threshold = 0.15
-    def __init__(self):
+    def __init__(self, audio_format="wav"):
         if sf is None:
             raise SoundDeviceError
         try:
@@ -37,6 +38,9 @@ class Voice:
             self.sd = sd
         except (OSError, ModuleNotFoundError):
             raise SoundDeviceError
+        if audio_format not in ["wav", "mp3", "webm"]:
+            raise ValueError(f"Unsupported audio format: {audio_format}")
+        self.audio_format = audio_format
     def callback(self, indata, frames, time, status):
         """This is called (from a separate thread) for each audio block."""
@@ -80,7 +84,7 @@
     def raw_record_and_transcribe(self, history, language):
         self.q = queue.Queue()
-        filename = tempfile.mktemp(suffix=".wav")
+        temp_wav = tempfile.mktemp(suffix=".wav")
         try:
             sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
@@ -99,10 +103,18 @@
         except self.sd.PortAudioError as err:
             raise SoundDeviceError(f"Error accessing audio input device: {err}")
-        with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
+        with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
             while not self.q.empty():
                 file.write(self.q.get())
+        if self.audio_format != "wav":
+            filename = tempfile.mktemp(suffix=f".{self.audio_format}")
+            audio = AudioSegment.from_wav(temp_wav)
+            audio.export(filename, format=self.audio_format)
+            os.remove(temp_wav)
+        else:
+            filename = temp_wav
         with open(filename, "rb") as fh:
             try:
                 transcript = litellm.transcription(
@@ -112,6 +124,9 @@
                 print(f"Unable to transcribe {filename}: {err}")
                 return
+        if self.audio_format != "wav":
+            os.remove(filename)
         text = transcript.text
         return text
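
Note the temp-file lifecycle this introduces: audio is always captured to a temporary wav first; when a compressed format is selected, the wav is converted via pydub, the intermediate wav is deleted right after conversion, and the compressed file is deleted once transcription completes. For the default wav format the previous behaviour is preserved.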

@@ -242,6 +242,9 @@
 ## Use VI editing mode in the terminal (default: False)
 #vim: false
+## Specify the audio format for voice recording (default: wav). webm and mp3 require ffmpeg
+#voice-format: wav
 ## Specify the language for voice using ISO 639-1 code (default: auto)
 #voice-language: en
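
The same option can be set from the config file by uncommenting the new key, e.g. `voice-format: mp3`; the sample ships with it commented out, so the wav default still applies.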

@@ -144,6 +144,8 @@ pydantic==2.9.2
     # openai
 pydantic-core==2.23.4
     # via pydantic
+pydub==0.25.1
+    # via -r requirements/requirements.in
 pyflakes==3.2.0
     # via flake8
 pygments==2.18.0

@@ -2,6 +2,7 @@
 # pip-compile requirements.in --upgrade
 #
+pydub
 configargparse
 GitPython
 jsonschema