mirror of
https://github.com/Aider-AI/aider.git
synced 2025-05-31 01:35:00 +00:00
feat: Option to compress audio files by ~90%
Add an option to reduce bandwidth (and potentially latency) by converting voice recordings (wav) into a compressed audio format (webm or mp3). Default behaviour is unchanged.

> File uploads are currently limited to 25 MB and the following input file
> types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
This commit is contained in:
parent
f3ad683d70
commit
1cc30a22f9
6 changed files with 32 additions and 4 deletions
|
@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root):
|
|||
help="Use VI editing mode in the terminal (default: False)",
|
||||
default=False,
|
||||
)
|
||||
group.add_argument(
|
||||
"--voice-format",
|
||||
metavar="VOICE_FORMAT",
|
||||
default="wav",
|
||||
choices=["wav", "mp3", "webm"],
|
||||
help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg",
|
||||
)
|
||||
group.add_argument(
|
||||
"--voice-language",
|
||||
metavar="VOICE_LANGUAGE",
|
||||
|
|
|
@ -997,7 +997,7 @@ class Commands:
|
|||
self.io.tool_error("To use /voice you must provide an OpenAI API key.")
|
||||
return
|
||||
try:
|
||||
self.voice = voice.Voice()
|
||||
self.voice = voice.Voice(audio_format=self.args.voice_format)
|
||||
except voice.SoundDeviceError:
|
||||
self.io.tool_error(
|
||||
"Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"
|
||||
|
|
|
@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
|
|||
sf = None
|
||||
|
||||
from prompt_toolkit.shortcuts import prompt
|
||||
from pydub import AudioSegment
|
||||
|
||||
from .dump import dump # noqa: F401
|
||||
|
||||
|
@ -27,7 +28,7 @@ class Voice:
|
|||
|
||||
threshold = 0.15
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, audio_format="wav"):
|
||||
if sf is None:
|
||||
raise SoundDeviceError
|
||||
try:
|
||||
|
@ -37,6 +38,9 @@ class Voice:
|
|||
self.sd = sd
|
||||
except (OSError, ModuleNotFoundError):
|
||||
raise SoundDeviceError
|
||||
if audio_format not in ["wav", "mp3", "webm"]:
|
||||
raise ValueError(f"Unsupported audio format: {audio_format}")
|
||||
self.audio_format = audio_format
|
||||
|
||||
def callback(self, indata, frames, time, status):
|
||||
"""This is called (from a separate thread) for each audio block."""
|
||||
|
@ -80,7 +84,7 @@ class Voice:
|
|||
def raw_record_and_transcribe(self, history, language):
|
||||
self.q = queue.Queue()
|
||||
|
||||
filename = tempfile.mktemp(suffix=".wav")
|
||||
temp_wav = tempfile.mktemp(suffix=".wav")
|
||||
|
||||
try:
|
||||
sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
|
||||
|
@ -99,10 +103,18 @@ class Voice:
|
|||
except self.sd.PortAudioError as err:
|
||||
raise SoundDeviceError(f"Error accessing audio input device: {err}")
|
||||
|
||||
with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
|
||||
with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
|
||||
while not self.q.empty():
|
||||
file.write(self.q.get())
|
||||
|
||||
if self.audio_format != "wav":
|
||||
filename = tempfile.mktemp(suffix=f".{self.audio_format}")
|
||||
audio = AudioSegment.from_wav(temp_wav)
|
||||
audio.export(filename, format=self.audio_format)
|
||||
os.remove(temp_wav)
|
||||
else:
|
||||
filename = temp_wav
|
||||
|
||||
with open(filename, "rb") as fh:
|
||||
try:
|
||||
transcript = litellm.transcription(
|
||||
|
@ -112,6 +124,9 @@ class Voice:
|
|||
print(f"Unable to transcribe {filename}: {err}")
|
||||
return
|
||||
|
||||
if self.audio_format != "wav":
|
||||
os.remove(filename)
|
||||
|
||||
text = transcript.text
|
||||
return text
|
||||
|
||||
|
|
|
@ -242,6 +242,9 @@
|
|||
## Use VI editing mode in the terminal (default: False)
|
||||
#vim: false
|
||||
|
||||
## Specify the audio format for voice recording (default: wav). webm and mp3 require ffmpeg
|
||||
#voice-format: wav
|
||||
|
||||
## Specify the language for voice using ISO 639-1 code (default: auto)
|
||||
#voice-language: en
|
||||
|
||||
|
|
|
@ -144,6 +144,8 @@ pydantic==2.9.2
|
|||
# openai
|
||||
pydantic-core==2.23.4
|
||||
# via pydantic
|
||||
pydub==0.25.1
|
||||
# via -r requirements/requirements.in
|
||||
pyflakes==3.2.0
|
||||
# via flake8
|
||||
pygments==2.18.0
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# pip-compile requirements.in --upgrade
|
||||
#
|
||||
|
||||
pydub
|
||||
configargparse
|
||||
GitPython
|
||||
jsonschema
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue