mirror of
https://github.com/Aider-AI/aider.git
synced 2025-06-04 11:45:00 +00:00
feat: Option to compress audio files by ~90%
Add an option to reduce bandwidth (and potentially latency) by converting voice recordings (wav) into a compressed audio format (webm or mp3). The default behaviour is unchanged.

> File uploads are currently limited to 25 MB and the following input file types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
This commit is contained in:
parent
f3ad683d70
commit
1cc30a22f9
6 changed files with 32 additions and 4 deletions
|
@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
|
|||
sf = None
|
||||
|
||||
from prompt_toolkit.shortcuts import prompt
|
||||
from pydub import AudioSegment
|
||||
|
||||
from .dump import dump # noqa: F401
|
||||
|
||||
|
@ -27,7 +28,7 @@ class Voice:
|
|||
|
||||
threshold = 0.15
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, audio_format="wav"):
|
||||
if sf is None:
|
||||
raise SoundDeviceError
|
||||
try:
|
||||
|
@ -37,6 +38,9 @@ class Voice:
|
|||
self.sd = sd
|
||||
except (OSError, ModuleNotFoundError):
|
||||
raise SoundDeviceError
|
||||
if audio_format not in ["wav", "mp3", "webm"]:
|
||||
raise ValueError(f"Unsupported audio format: {audio_format}")
|
||||
self.audio_format = audio_format
|
||||
|
||||
def callback(self, indata, frames, time, status):
|
||||
"""This is called (from a separate thread) for each audio block."""
|
||||
|
@ -80,7 +84,7 @@ class Voice:
|
|||
def raw_record_and_transcribe(self, history, language):
|
||||
self.q = queue.Queue()
|
||||
|
||||
filename = tempfile.mktemp(suffix=".wav")
|
||||
temp_wav = tempfile.mktemp(suffix=".wav")
|
||||
|
||||
try:
|
||||
sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
|
||||
|
@ -99,10 +103,18 @@ class Voice:
|
|||
except self.sd.PortAudioError as err:
|
||||
raise SoundDeviceError(f"Error accessing audio input device: {err}")
|
||||
|
||||
with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
|
||||
with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
|
||||
while not self.q.empty():
|
||||
file.write(self.q.get())
|
||||
|
||||
if self.audio_format != "wav":
|
||||
filename = tempfile.mktemp(suffix=f".{self.audio_format}")
|
||||
audio = AudioSegment.from_wav(temp_wav)
|
||||
audio.export(filename, format=self.audio_format)
|
||||
os.remove(temp_wav)
|
||||
else:
|
||||
filename = temp_wav
|
||||
|
||||
with open(filename, "rb") as fh:
|
||||
try:
|
||||
transcript = litellm.transcription(
|
||||
|
@ -112,6 +124,9 @@ class Voice:
|
|||
print(f"Unable to transcribe {filename}: {err}")
|
||||
return
|
||||
|
||||
if self.audio_format != "wav":
|
||||
os.remove(filename)
|
||||
|
||||
text = transcript.text
|
||||
return text
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue