From 053751cb4374dcc3a18dddc1b91908a66d80e7a6 Mon Sep 17 00:00:00 2001
From: Paul Gauthier
Date: Thu, 10 Aug 2023 22:17:31 -0300
Subject: [PATCH] show loudness bar

---
 aider/commands.py |  6 +++--
 aider/voice.py    | 65 +++++++++++++++++++++++++++++++----------------
 2 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/aider/commands.py b/aider/commands.py
index 62bdc8c03..622d1fbdf 100644
--- a/aider/commands.py
+++ b/aider/commands.py
@@ -436,11 +436,13 @@ class Commands:
     def cmd_voice(self, args):
         "Record and transcribe voice input"
 
-        if not voice.is_audio_available():
+        v = voice.Voice()
+
+        if not v.is_audio_available():
             self.io.tool_error("Unable to import `sounddevice`, is portaudio installed?")
             return
 
-        text = voice.record_and_transcribe()
+        text = v.record_and_transcribe()
         self.io.add_to_file_history(text)
         print()
         self.io.user_input(text, log_only=False)
diff --git a/aider/voice.py b/aider/voice.py
index e175dfeab..5f6086cf5 100644
--- a/aider/voice.py
+++ b/aider/voice.py
@@ -1,8 +1,11 @@
 import os
 import queue
 import tempfile
+import time
 
+import numpy as np
 import openai
+from prompt_toolkit.shortcuts import prompt
 
 try:
     import sounddevice as sd
@@ -14,41 +17,59 @@ import soundfile as sf
 from .dump import dump  # noqa: F401
 
 
-def is_audio_available():
-    return sd is not None
+class Voice:
+    max_rms = 0
+    min_rms = 1e5
+    pct = 0
 
+    def is_audio_available(self):
+        return sd is not None
 
-def record_and_transcribe():
-    q = queue.Queue()
-
-    import numpy as np
-
-    def callback(indata, frames, time, status):
+    def callback(self, indata, frames, time, status):
         """This is called (from a separate thread) for each audio block."""
-        q.put(indata.copy())
+        self.q.put(indata.copy())
         rms = np.sqrt(np.mean(indata**2))
-        dump(rms)
+        self.max_rms = max(self.max_rms, rms)
+        self.min_rms = min(self.min_rms, rms)
 
-    filename = tempfile.mktemp(suffix=".wav")
+        rng = self.max_rms - self.min_rms
+        if rng > 0.001:
+            self.pct = (rms - self.min_rms) / rng
 
-    sample_rate = 16000  # 16kHz
+    def get_prompt(self):
+        if np.isnan(self.pct):
+            bar = ""
+        else:
+            bar = "█" * int(self.pct * 10)
 
-    with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
-        with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback):
-            input("Recording... Press ENTER when done speaking...")
+        dur = time.time() - self.start_time
+        return f"Recording, press ENTER when done... {dur:.1f}sec {bar}"
 
-        while not q.empty():
-            file.write(q.get())
+    def record_and_transcribe(self):
+        self.q = queue.Queue()
 
-    with open(filename, "rb") as fh:
-        transcript = openai.Audio.transcribe("whisper-1", fh)
+        filename = tempfile.mktemp(suffix=".wav")
 
-    text = transcript["text"]
-    return text
+        sample_rate = 16000  # 16kHz
+
+        self.start_time = time.time()
+
+        with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
+            with sd.InputStream(samplerate=sample_rate, channels=1, callback=self.callback):
+                prompt(self.get_prompt, refresh_interval=0.1)
+
+            while not self.q.empty():
+                file.write(self.q.get())
+
+        with open(filename, "rb") as fh:
+            transcript = openai.Audio.transcribe("whisper-1", fh)
+
+        text = transcript["text"]
+        return text
 
 
 if __name__ == "__main__":
     api_key = os.getenv("OPENAI_API_KEY")
     if not api_key:
         raise ValueError("Please set the OPENAI_API_KEY environment variable.")
-    print(record_and_transcribe())
+    print(Voice().record_and_transcribe())