From 1cc30a22f9df7be3208f951f5c113431809cc957 Mon Sep 17 00:00:00 2001 From: Mike Bailey Date: Sun, 22 Sep 2024 18:19:22 +1000 Subject: [PATCH] feat: Option to compress audio files by ~90% Add option to reduce bandwidth (and potentially latency) by converting voice recordings (wav) into a compressed audio format (webm or mp3). Default behaviour is unchanged. > File uploads are currently limited to 25 MB and the following input file > types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm. > > - https://platform.openai.com/docs/guides/speech-to-text --- aider/args.py | 7 +++++++ aider/commands.py | 2 +- aider/voice.py | 21 ++++++++++++++++++--- aider/website/assets/sample.aider.conf.yml | 3 +++ requirements.txt | 2 ++ requirements/requirements.in | 1 + 6 files changed, 32 insertions(+), 4 deletions(-) diff --git a/aider/args.py b/aider/args.py index b67aeb43a..4b9c745ae 100644 --- a/aider/args.py +++ b/aider/args.py @@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root): help="Use VI editing mode in the terminal (default: False)", default=False, ) + group.add_argument( + "--voice-format", + metavar="VOICE_FORMAT", + default="wav", + choices=["wav", "mp3", "webm"], + help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg", + ) group.add_argument( "--voice-language", metavar="VOICE_LANGUAGE", diff --git a/aider/commands.py b/aider/commands.py index e6035ad10..40daf2719 100644 --- a/aider/commands.py +++ b/aider/commands.py @@ -997,7 +997,7 @@ class Commands: self.io.tool_error("To use /voice you must provide an OpenAI API key.") return try: - self.voice = voice.Voice() + self.voice = voice.Voice(audio_format=self.args.voice_format) except voice.SoundDeviceError: self.io.tool_error( "Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?" diff --git a/aider/voice.py b/aider/voice.py index 047a0174d..1334a0a7e 100644 --- a/aider/voice.py +++ b/aider/voice.py @@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError): sf = None from prompt_toolkit.shortcuts import prompt +from pydub import AudioSegment from .dump import dump # noqa: F401 @@ -27,7 +28,7 @@ class Voice: threshold = 0.15 - def __init__(self): + def __init__(self, audio_format="wav"): if sf is None: raise SoundDeviceError try: @@ -37,6 +38,9 @@ class Voice: self.sd = sd except (OSError, ModuleNotFoundError): raise SoundDeviceError + if audio_format not in ["wav", "mp3", "webm"]: + raise ValueError(f"Unsupported audio format: {audio_format}") + self.audio_format = audio_format def callback(self, indata, frames, time, status): """This is called (from a separate thread) for each audio block.""" @@ -80,7 +84,7 @@ class Voice: def raw_record_and_transcribe(self, history, language): self.q = queue.Queue() - filename = tempfile.mktemp(suffix=".wav") + temp_wav = tempfile.mktemp(suffix=".wav") try: sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"]) @@ -99,10 +103,18 @@ class Voice: except self.sd.PortAudioError as err: raise SoundDeviceError(f"Error accessing audio input device: {err}") - with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file: + with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file: while not self.q.empty(): file.write(self.q.get()) + if self.audio_format != "wav": + filename = tempfile.mktemp(suffix=f".{self.audio_format}") + audio = AudioSegment.from_wav(temp_wav) + audio.export(filename, format=self.audio_format) + os.remove(temp_wav) + else: + filename = temp_wav + with open(filename, "rb") as fh: try: transcript = litellm.transcription( @@ -112,6 +124,9 @@ class Voice: print(f"Unable to transcribe {filename}: {err}") return + if self.audio_format != "wav": + os.remove(filename) + text = transcript.text return text diff --git a/aider/website/assets/sample.aider.conf.yml b/aider/website/assets/sample.aider.conf.yml index 9b9b46077..921285de7 100644 --- a/aider/website/assets/sample.aider.conf.yml +++ b/aider/website/assets/sample.aider.conf.yml @@ -242,6 +242,9 @@ ## Use VI editing mode in the terminal (default: False) #vim: false +## Specify the audio format for voice recording (default: wav). webm and mp3 require ffmpeg +#voice-format: wav + ## Specify the language for voice using ISO 639-1 code (default: auto) #voice-language: en diff --git a/requirements.txt b/requirements.txt index ab8871925..4bd231778 100644 --- a/requirements.txt +++ b/requirements.txt @@ -144,6 +144,8 @@ pydantic==2.9.2 # openai pydantic-core==2.23.4 # via pydantic +pydub==0.25.1 + # via -r requirements/requirements.in pyflakes==3.2.0 # via flake8 pygments==2.18.0 diff --git a/requirements/requirements.in b/requirements/requirements.in index 031adb5e2..aeda8cde1 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -2,6 +2,7 @@ # pip-compile requirements.in --upgrade # +pydub configargparse GitPython jsonschema