From 1cc30a22f9df7be3208f951f5c113431809cc957 Mon Sep 17 00:00:00 2001
From: Mike Bailey <mike@failmode.com>
Date: Sun, 22 Sep 2024 18:19:22 +1000
Subject: [PATCH] feat: Option to compress audio files by ~90%

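Add a --voice-format option to reduce bandwidth (and potentially latency)
by converting voice recordings (wav) into a compressed audio format
(webm or mp3) before they are sent for transcription. Conversion is done
with pydub; the webm and mp3 formats require ffmpeg.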

Default behaviour is unchanged: the format defaults to wav, in which case
no conversion takes place.

> File uploads are currently limited to 25 MB and the following input file
> types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
---
 aider/args.py                              |  7 +++++++
 aider/commands.py                          |  2 +-
 aider/voice.py                             | 21 ++++++++++++++++++---
 aider/website/assets/sample.aider.conf.yml |  3 +++
 requirements.txt                           |  2 ++
 requirements/requirements.in               |  1 +
 6 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/aider/args.py b/aider/args.py
index b67aeb43a..4b9c745ae 100644
--- a/aider/args.py
+++ b/aider/args.py
@@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root):
         help="Use VI editing mode in the terminal (default: False)",
         default=False,
     )
+    group.add_argument(
+        "--voice-format",
+        metavar="VOICE_FORMAT",
+        default="wav",
+        choices=["wav", "mp3", "webm"],
+        help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg",
+    )
     group.add_argument(
         "--voice-language",
         metavar="VOICE_LANGUAGE",
diff --git a/aider/commands.py b/aider/commands.py
index e6035ad10..40daf2719 100644
--- a/aider/commands.py
+++ b/aider/commands.py
@@ -997,7 +997,7 @@ class Commands:
                 self.io.tool_error("To use /voice you must provide an OpenAI API key.")
                 return
             try:
-                self.voice = voice.Voice()
+                self.voice = voice.Voice(audio_format=self.args.voice_format)
             except voice.SoundDeviceError:
                 self.io.tool_error(
                     "Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"
diff --git a/aider/voice.py b/aider/voice.py
index 047a0174d..1334a0a7e 100644
--- a/aider/voice.py
+++ b/aider/voice.py
@@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
     sf = None
 
 from prompt_toolkit.shortcuts import prompt
+from pydub import AudioSegment
 
 from .dump import dump  # noqa: F401
 
@@ -27,7 +28,7 @@ class Voice:
 
     threshold = 0.15
 
-    def __init__(self):
+    def __init__(self, audio_format="wav"):
         if sf is None:
             raise SoundDeviceError
         try:
@@ -37,6 +38,9 @@ class Voice:
             self.sd = sd
         except (OSError, ModuleNotFoundError):
             raise SoundDeviceError
+        if audio_format not in ["wav", "mp3", "webm"]:
+            raise ValueError(f"Unsupported audio format: {audio_format}")
+        self.audio_format = audio_format
 
     def callback(self, indata, frames, time, status):
         """This is called (from a separate thread) for each audio block."""
@@ -80,7 +84,7 @@ class Voice:
     def raw_record_and_transcribe(self, history, language):
         self.q = queue.Queue()
 
-        filename = tempfile.mktemp(suffix=".wav")
+        temp_wav = tempfile.mktemp(suffix=".wav")
 
         try:
             sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
@@ -99,10 +103,18 @@ class Voice:
         except self.sd.PortAudioError as err:
             raise SoundDeviceError(f"Error accessing audio input device: {err}")
 
-        with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
+        with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
             while not self.q.empty():
                 file.write(self.q.get())
 
+        if self.audio_format != "wav":
+            filename = tempfile.mktemp(suffix=f".{self.audio_format}")
+            audio = AudioSegment.from_wav(temp_wav)
+            audio.export(filename, format=self.audio_format)
+            os.remove(temp_wav)
+        else:
+            filename = temp_wav
+
         with open(filename, "rb") as fh:
             try:
                 transcript = litellm.transcription(
@@ -112,6 +124,9 @@ class Voice:
                 print(f"Unable to transcribe {filename}: {err}")
                 return
 
+        if self.audio_format != "wav":
+            os.remove(filename)
+
         text = transcript.text
         return text
 
diff --git a/aider/website/assets/sample.aider.conf.yml b/aider/website/assets/sample.aider.conf.yml
index 9b9b46077..921285de7 100644
--- a/aider/website/assets/sample.aider.conf.yml
+++ b/aider/website/assets/sample.aider.conf.yml
@@ -242,6 +242,9 @@
 ## Use VI editing mode in the terminal (default: False)
 #vim: false
 
+## Audio format for voice recording (default: wav). webm and mp3 require ffmpeg
+#voice-format: wav
+
 ## Specify the language for voice using ISO 639-1 code (default: auto)
 #voice-language: en
 
diff --git a/requirements.txt b/requirements.txt
index ab8871925..4bd231778 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -144,6 +144,8 @@ pydantic==2.9.2
     #   openai
 pydantic-core==2.23.4
     # via pydantic
+pydub==0.25.1
+    # via -r requirements/requirements.in
 pyflakes==3.2.0
     # via flake8
 pygments==2.18.0
diff --git a/requirements/requirements.in b/requirements/requirements.in
index 031adb5e2..aeda8cde1 100644
--- a/requirements/requirements.in
+++ b/requirements/requirements.in
@@ -2,6 +2,7 @@
 #    pip-compile requirements.in --upgrade
 #
 
+pydub
 configargparse
 GitPython
 jsonschema