mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-01 10:14:59 +00:00
feat: Option to compress audio files by ~90%
Add an option to reduce bandwidth (and potentially latency) by converting voice recordings (wav) into a compressed audio format (webm or mp3). Default behaviour is unchanged.

> File uploads are currently limited to 25 MB and the following input file
> types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
This commit is contained in:
parent
f3ad683d70
commit
1cc30a22f9
6 changed files with 32 additions and 4 deletions
|
@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root):
|
||||||
help="Use VI editing mode in the terminal (default: False)",
|
help="Use VI editing mode in the terminal (default: False)",
|
||||||
default=False,
|
default=False,
|
||||||
)
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--voice-format",
|
||||||
|
metavar="VOICE_FORMAT",
|
||||||
|
default="wav",
|
||||||
|
choices=["wav", "mp3", "webm"],
|
||||||
|
help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg",
|
||||||
|
)
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
"--voice-language",
|
"--voice-language",
|
||||||
metavar="VOICE_LANGUAGE",
|
metavar="VOICE_LANGUAGE",
|
||||||
|
|
|
@ -997,7 +997,7 @@ class Commands:
|
||||||
self.io.tool_error("To use /voice you must provide an OpenAI API key.")
|
self.io.tool_error("To use /voice you must provide an OpenAI API key.")
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
self.voice = voice.Voice()
|
self.voice = voice.Voice(audio_format=self.args.voice_format)
|
||||||
except voice.SoundDeviceError:
|
except voice.SoundDeviceError:
|
||||||
self.io.tool_error(
|
self.io.tool_error(
|
||||||
"Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"
|
"Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"
|
||||||
|
|
|
@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
|
||||||
sf = None
|
sf = None
|
||||||
|
|
||||||
from prompt_toolkit.shortcuts import prompt
|
from prompt_toolkit.shortcuts import prompt
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
from .dump import dump # noqa: F401
|
from .dump import dump # noqa: F401
|
||||||
|
|
||||||
|
@ -27,7 +28,7 @@ class Voice:
|
||||||
|
|
||||||
threshold = 0.15
|
threshold = 0.15
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, audio_format="wav"):
|
||||||
if sf is None:
|
if sf is None:
|
||||||
raise SoundDeviceError
|
raise SoundDeviceError
|
||||||
try:
|
try:
|
||||||
|
@ -37,6 +38,9 @@ class Voice:
|
||||||
self.sd = sd
|
self.sd = sd
|
||||||
except (OSError, ModuleNotFoundError):
|
except (OSError, ModuleNotFoundError):
|
||||||
raise SoundDeviceError
|
raise SoundDeviceError
|
||||||
|
if audio_format not in ["wav", "mp3", "webm"]:
|
||||||
|
raise ValueError(f"Unsupported audio format: {audio_format}")
|
||||||
|
self.audio_format = audio_format
|
||||||
|
|
||||||
def callback(self, indata, frames, time, status):
|
def callback(self, indata, frames, time, status):
|
||||||
"""This is called (from a separate thread) for each audio block."""
|
"""This is called (from a separate thread) for each audio block."""
|
||||||
|
@ -80,7 +84,7 @@ class Voice:
|
||||||
def raw_record_and_transcribe(self, history, language):
|
def raw_record_and_transcribe(self, history, language):
|
||||||
self.q = queue.Queue()
|
self.q = queue.Queue()
|
||||||
|
|
||||||
filename = tempfile.mktemp(suffix=".wav")
|
temp_wav = tempfile.mktemp(suffix=".wav")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
|
sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
|
||||||
|
@ -99,10 +103,18 @@ class Voice:
|
||||||
except self.sd.PortAudioError as err:
|
except self.sd.PortAudioError as err:
|
||||||
raise SoundDeviceError(f"Error accessing audio input device: {err}")
|
raise SoundDeviceError(f"Error accessing audio input device: {err}")
|
||||||
|
|
||||||
with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
|
with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
|
||||||
while not self.q.empty():
|
while not self.q.empty():
|
||||||
file.write(self.q.get())
|
file.write(self.q.get())
|
||||||
|
|
||||||
|
if self.audio_format != "wav":
|
||||||
|
filename = tempfile.mktemp(suffix=f".{self.audio_format}")
|
||||||
|
audio = AudioSegment.from_wav(temp_wav)
|
||||||
|
audio.export(filename, format=self.audio_format)
|
||||||
|
os.remove(temp_wav)
|
||||||
|
else:
|
||||||
|
filename = temp_wav
|
||||||
|
|
||||||
with open(filename, "rb") as fh:
|
with open(filename, "rb") as fh:
|
||||||
try:
|
try:
|
||||||
transcript = litellm.transcription(
|
transcript = litellm.transcription(
|
||||||
|
@ -112,6 +124,9 @@ class Voice:
|
||||||
print(f"Unable to transcribe {filename}: {err}")
|
print(f"Unable to transcribe {filename}: {err}")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if self.audio_format != "wav":
|
||||||
|
os.remove(filename)
|
||||||
|
|
||||||
text = transcript.text
|
text = transcript.text
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
|
@ -242,6 +242,9 @@
|
||||||
## Use VI editing mode in the terminal (default: False)
|
## Use VI editing mode in the terminal (default: False)
|
||||||
#vim: false
|
#vim: false
|
||||||
|
|
||||||
|
## Specify the audio format for voice recording (default: wav). webm and mp3 require ffmpeg
|
||||||
|
#voice-format: wav
|
||||||
|
|
||||||
## Specify the language for voice using ISO 639-1 code (default: auto)
|
## Specify the language for voice using ISO 639-1 code (default: auto)
|
||||||
#voice-language: en
|
#voice-language: en
|
||||||
|
|
||||||
|
|
|
@ -144,6 +144,8 @@ pydantic==2.9.2
|
||||||
# openai
|
# openai
|
||||||
pydantic-core==2.23.4
|
pydantic-core==2.23.4
|
||||||
# via pydantic
|
# via pydantic
|
||||||
|
pydub==0.25.1
|
||||||
|
# via -r requirements/requirements.in
|
||||||
pyflakes==3.2.0
|
pyflakes==3.2.0
|
||||||
# via flake8
|
# via flake8
|
||||||
pygments==2.18.0
|
pygments==2.18.0
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
# pip-compile requirements.in --upgrade
|
# pip-compile requirements.in --upgrade
|
||||||
#
|
#
|
||||||
|
|
||||||
|
pydub
|
||||||
configargparse
|
configargparse
|
||||||
GitPython
|
GitPython
|
||||||
jsonschema
|
jsonschema
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue