feat: Option to compress audio files by ~90%

Add option to reduce bandwidth (and potentially latency) by converting
voice recordings (wav) into a compressed audio format (webm or mp3).

Default behaviour is unchanged.

> File uploads are currently limited to 25 MB and the following input file
> types are supported: mp3, mp4, mpeg, mpga, m4a, wav, and webm.
>
> - https://platform.openai.com/docs/guides/speech-to-text
Author: Mike Bailey
Date: 2024-09-22 18:19:22 +10:00
Parent: f3ad683d70
Commit: 1cc30a22f9
6 changed files with 32 additions and 4 deletions
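
Under the hood the patch uses pydub, which delegates mp3/webm encoding to ffmpeg. A minimal sketch of the conversion step being added (the file names here are illustrative, not from the patch):

    import os

    from pydub import AudioSegment

    # Re-encode the raw wav capture as webm (or "mp3"); pydub shells out
    # to ffmpeg, so ffmpeg must be installed for these formats.
    audio = AudioSegment.from_wav("recording.wav")
    audio.export("recording.webm", format="webm")
    os.remove("recording.wav")  # the intermediate wav is no longer needed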

@@ -485,6 +485,13 @@ def get_parser(default_config_files, git_root):
         help="Use VI editing mode in the terminal (default: False)",
         default=False,
     )
+    group.add_argument(
+        "--voice-format",
+        metavar="VOICE_FORMAT",
+        default="wav",
+        choices=["wav", "mp3", "webm"],
+        help="Audio format for voice recording (default: wav). webm and mp3 require ffmpeg",
+    )
     group.add_argument(
         "--voice-language",
         metavar="VOICE_LANGUAGE",

@@ -997,7 +997,7 @@ class Commands:
                 self.io.tool_error("To use /voice you must provide an OpenAI API key.")
                 return
             try:
-                self.voice = voice.Voice()
+                self.voice = voice.Voice(audio_format=self.args.voice_format)
             except voice.SoundDeviceError:
                 self.io.tool_error(
                     "Unable to import `sounddevice` and/or `soundfile`, is portaudio installed?"

@@ -12,6 +12,7 @@ except (OSError, ModuleNotFoundError):
     sf = None
 from prompt_toolkit.shortcuts import prompt
+from pydub import AudioSegment
 from .dump import dump  # noqa: F401
@@ -27,7 +28,7 @@ class Voice:
     threshold = 0.15
-    def __init__(self):
+    def __init__(self, audio_format="wav"):
         if sf is None:
             raise SoundDeviceError
         try:
@@ -37,6 +38,9 @@ class Voice:
             self.sd = sd
         except (OSError, ModuleNotFoundError):
             raise SoundDeviceError
+        if audio_format not in ["wav", "mp3", "webm"]:
+            raise ValueError(f"Unsupported audio format: {audio_format}")
+        self.audio_format = audio_format
     def callback(self, indata, frames, time, status):
         """This is called (from a separate thread) for each audio block."""
@@ -80,7 +84,7 @@
     def raw_record_and_transcribe(self, history, language):
         self.q = queue.Queue()
-        filename = tempfile.mktemp(suffix=".wav")
+        temp_wav = tempfile.mktemp(suffix=".wav")
         try:
             sample_rate = int(self.sd.query_devices(None, "input")["default_samplerate"])
@@ -99,10 +103,18 @@
         except self.sd.PortAudioError as err:
             raise SoundDeviceError(f"Error accessing audio input device: {err}")
-        with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
+        with sf.SoundFile(temp_wav, mode="x", samplerate=sample_rate, channels=1) as file:
             while not self.q.empty():
                 file.write(self.q.get())
+        if self.audio_format != "wav":
+            filename = tempfile.mktemp(suffix=f".{self.audio_format}")
+            audio = AudioSegment.from_wav(temp_wav)
+            audio.export(filename, format=self.audio_format)
+            os.remove(temp_wav)
+        else:
+            filename = temp_wav
         with open(filename, "rb") as fh:
             try:
                 transcript = litellm.transcription(
@@ -112,6 +124,9 @@
                 print(f"Unable to transcribe {filename}: {err}")
                 return
+        if self.audio_format != "wav":
+            os.remove(filename)
         text = transcript.text
         return text
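
Note the temp-file lifecycle this introduces: audio is always captured to a temporary wav first; when a compressed format is selected, the wav is converted via pydub, the intermediate wav is deleted right after conversion, and the compressed file is deleted once transcription completes. For the default wav format the previous behaviour is preserved.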

@@ -242,6 +242,9 @@
 ## Use VI editing mode in the terminal (default: False)
 #vim: false
+## Specify the audio format for voice recording (default: wav). webm and mp3 require ffmpeg
+#voice-format: wav
 ## Specify the language for voice using ISO 639-1 code (default: auto)
 #voice-language: en
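
The same option can be set from the config file by uncommenting the new key, e.g. `voice-format: mp3`; the sample ships with it commented out, so the wav default still applies.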

@@ -144,6 +144,8 @@ pydantic==2.9.2
     # openai
 pydantic-core==2.23.4
     # via pydantic
+pydub==0.25.1
+    # via -r requirements/requirements.in
 pyflakes==3.2.0
     # via flake8
 pygments==2.18.0

@@ -2,6 +2,7 @@
 # pip-compile requirements.in --upgrade
 #
+pydub
 configargparse
 GitPython
 jsonschema