From 278f748c1c58361f51229d0e83de6364c76839fa Mon Sep 17 00:00:00 2001 From: "Paul Gauthier (aider)" Date: Fri, 14 Mar 2025 18:47:06 -0700 Subject: [PATCH] feat: Add OpenAI TTS audio generation and playback for recordings --- aider/website/_includes/recording.js | 57 +++++--- .../dont-drop-original-read-files.md | 1 + scripts/recording_audio.py | 135 ++++++++++++++++++ 3 files changed, 173 insertions(+), 20 deletions(-) diff --git a/aider/website/_includes/recording.js b/aider/website/_includes/recording.js index 85d044e97..d6ee290f9 100644 --- a/aider/website/_includes/recording.js +++ b/aider/website/_includes/recording.js @@ -42,7 +42,7 @@ document.addEventListener('DOMContentLoaded', function() { // Also trigger toast and speech showToast(message); - speakText(message); + speakText(message, timeInSeconds); // Highlight this timestamp highlightTimestamp(timeInSeconds); @@ -70,7 +70,7 @@ document.addEventListener('DOMContentLoaded', function() { // Also trigger toast and speech showToast(message); - speakText(message); + speakText(message, timeInSeconds); // Highlight this timestamp highlightTimestamp(timeInSeconds); @@ -180,23 +180,40 @@ document.addEventListener('DOMContentLoaded', function() { }, 3000); } - // Function to speak text using the Web Speech API - function speakText(text) { - // Check if speech synthesis is supported - if ('speechSynthesis' in window) { - // Create a new speech synthesis utterance - const utterance = new SpeechSynthesisUtterance(text); - - // Optional: Configure voice properties - utterance.rate = 1.0; // Speech rate (0.1 to 10) - utterance.pitch = 1.0; // Speech pitch (0 to 2) - utterance.volume = 1.0; // Speech volume (0 to 1) - - // Speak the text - window.speechSynthesis.speak(utterance); - } else { - console.warn('Speech synthesis not supported in this browser'); - } + // Function to play pre-generated TTS audio files + function speakText(text, timeInSeconds) { + // Format time for filename (MM-SS) + const minutes = 
Math.floor(timeInSeconds / 60); + const seconds = timeInSeconds % 60; + const formattedTime = `${minutes.toString().padStart(2, '0')}-${seconds.toString().padStart(2, '0')}`; + + // Get recording_id from the page or use default from the URL + const recordingId = typeof recording_id !== 'undefined' ? recording_id : + window.location.pathname.split('/').pop().replace('.html', ''); + + // Construct audio file path + const audioPath = `/assets/audio/${recordingId}/${formattedTime}.mp3`; + + // Create and play audio + const audio = new Audio(audioPath); + + // Error handling with fallback to browser TTS + audio.onerror = () => { + console.warn(`Failed to load audio: ${audioPath}`); + // Fallback to browser TTS + if ('speechSynthesis' in window) { + const utterance = new SpeechSynthesisUtterance(text); + utterance.rate = 1.0; + utterance.pitch = 1.0; + utterance.volume = 1.0; + window.speechSynthesis.speak(utterance); + } + }; + + // Play the audio + audio.play().catch(e => { + console.warn(`Error playing audio: ${e.message}`); + }); } // Function to highlight the active timestamp in the transcript @@ -243,7 +260,7 @@ document.addEventListener('DOMContentLoaded', function() { console.log(`marker! 
${index} - ${time} - ${label}`); // Speak the marker label and show toast - speakText(label); + speakText(label, time); showToast(label); // Highlight the corresponding timestamp in the transcript diff --git a/aider/website/docs/recordings/dont-drop-original-read-files.md b/aider/website/docs/recordings/dont-drop-original-read-files.md index c107b7f06..82351687b 100644 --- a/aider/website/docs/recordings/dont-drop-original-read-files.md +++ b/aider/website/docs/recordings/dont-drop-original-read-files.md @@ -7,6 +7,7 @@ layout: minimal # Don't /drop read-only files added at launch diff --git a/scripts/recording_audio.py b/scripts/recording_audio.py index e69de29bb..1481273fc 100644 --- a/scripts/recording_audio.py +++ b/scripts/recording_audio.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Generate TTS audio files for recording commentary using OpenAI's API. +Usage: python scripts/recording_audio.py path/to/recording.md +""" + +import os +import re +import sys +import argparse +import requests +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Configuration +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OUTPUT_DIR = "aider/website/assets/audio" +VOICE = "onyx" # Options: alloy, echo, fable, onyx, nova, shimmer + +def extract_recording_id(markdown_file): + """Extract recording ID from the markdown file path.""" + return Path(markdown_file).stem + +def extract_commentary(markdown_file): + """Extract commentary markers from markdown file.""" + with open(markdown_file, 'r') as f: + content = f.read() + + # Find Commentary section + commentary_match = re.search(r'## Commentary\s+(.*?)(?=##|\Z)', content, re.DOTALL) + if not commentary_match: + print(f"No Commentary section found in {markdown_file}") + return [] + + commentary = commentary_match.group(1).strip() + + # Extract timestamp-message pairs + markers = [] + for line in commentary.split('\n'): + line = line.strip() + if 
line.startswith('- '): + line = line[2:] # Remove the list marker + match = re.match(r'(\d+):(\d+)\s+(.*)', line) + if match: + minutes, seconds, message = match.groups() + time_in_seconds = int(minutes) * 60 + int(seconds) + markers.append((time_in_seconds, message)) + + return markers + +def generate_audio_openai(text, output_file): + """Generate audio using OpenAI TTS API.""" + if not OPENAI_API_KEY: + print("Error: OPENAI_API_KEY environment variable not set") + return False + + url = "https://api.openai.com/v1/audio/speech" + headers = { + "Authorization": f"Bearer {OPENAI_API_KEY}", + "Content-Type": "application/json" + } + data = { + "model": "tts-1", + "input": text, + "voice": VOICE + } + + try: + response = requests.post(url, headers=headers, json=data) + + if response.status_code == 200: + with open(output_file, 'wb') as f: + f.write(response.content) + return True + else: + print(f"Error: {response.status_code}, {response.text}") + return False + except Exception as e: + print(f"Exception during API call: {e}") + return False + +def main(): + # Declare before any use of VOICE in this scope; a later "global" after + # reads of the name is a SyntaxError in CPython. + global VOICE + + parser = argparse.ArgumentParser(description='Generate TTS audio for recording commentary.') + parser.add_argument('markdown_file', help='Path to the recording markdown file') + parser.add_argument('--voice', default=VOICE, help=f'OpenAI voice to use (default: {VOICE})') + parser.add_argument('--output-dir', default=OUTPUT_DIR, help=f'Output directory (default: {OUTPUT_DIR})') + parser.add_argument('--dry-run', action='store_true', help='Print what would be done without generating audio') + + args = parser.parse_args() + + # Update globals with any command line overrides + VOICE = args.voice + + recording_id = extract_recording_id(args.markdown_file) + print(f"Processing recording: {recording_id}") + + # Create output directory + output_dir = os.path.join(args.output_dir, recording_id) + if not args.dry_run: + os.makedirs(output_dir, exist_ok=True) + + # Extract commentary markers + markers = 
extract_commentary(args.markdown_file) + + if not markers: + print("No commentary markers found!") + return + + print(f"Found {len(markers)} commentary markers") + + # Generate audio for each marker + for time_sec, message in markers: + minutes = time_sec // 60 + seconds = time_sec % 60 + timestamp = f"{minutes:02d}-{seconds:02d}" + filename = f"{timestamp}.mp3" + output_file = os.path.join(output_dir, filename) + + print(f"Marker at {minutes}:{seconds:02d} - {message}") + if args.dry_run: + print(f" Would generate: {output_file}") + else: + print(f" Generating: {output_file}") + success = generate_audio_openai(message, output_file) + if success: + print(f" ✓ Generated audio file") + else: + print(f" ✗ Failed to generate audio") + +if __name__ == "__main__": + main()