feat: Add OpenAI TTS audio generation and playback for recordings

2025-05-28 00:05:01 +00:00 · 2025-03-14 18:47:06 -07:00 · 2025-03-14 18:47:06 -07:00 · 278f748c1c
commit 278f748c1c
parent 874df40303
3 changed files with 173 additions and 20 deletions
--- a/aider/website/_includes/recording.js
+++ b/aider/website/_includes/recording.js
@ -42,7 +42,7 @@ document.addEventListener('DOMContentLoaded', function() {
                // Also trigger toast and speech
                showToast(message);
-                speakText(message);
+                speakText(message, timeInSeconds);
                // Highlight this timestamp
                highlightTimestamp(timeInSeconds);
@ -70,7 +70,7 @@ document.addEventListener('DOMContentLoaded', function() {
                  // Also trigger toast and speech
                  showToast(message);
-                  speakText(message);
+                  speakText(message, timeInSeconds);
                  // Highlight this timestamp
                  highlightTimestamp(timeInSeconds);
@ -180,23 +180,40 @@ document.addEventListener('DOMContentLoaded', function() {
    }, 3000);
  }
-  // Function to speak text using the Web Speech API
+  // Function to play pre-generated TTS audio files
-  function speakText(text) {
+  function speakText(text, timeInSeconds) {
-    // Check if speech synthesis is supported
+    // Format time for filename (MM-SS)
-    if ('speechSynthesis' in window) {
+    const minutes = Math.floor(timeInSeconds / 60);
-      // Create a new speech synthesis utterance
+    const seconds = timeInSeconds % 60;
-      const utterance = new SpeechSynthesisUtterance(text);
+    const formattedTime = `${minutes.toString().padStart(2, '0')}-${seconds.toString().padStart(2, '0')}`;
-      
+    
-      // Optional: Configure voice properties
+    // Get recording_id from the page or use default from the URL
-      utterance.rate = 1.0; // Speech rate (0.1 to 10)
+    const recordingId = typeof recording_id !== 'undefined' ? recording_id : 
-      utterance.pitch = 1.0; // Speech pitch (0 to 2)
+                       window.location.pathname.split('/').pop().replace('.html', '');
-      utterance.volume = 1.0; // Speech volume (0 to 1)
+                       
-      
+    // Construct audio file path
-      // Speak the text
+    const audioPath = `/assets/audio/${recordingId}/${formattedTime}.mp3`;
-      window.speechSynthesis.speak(utterance);
+    
-    } else {
+    // Create and play audio
-      console.warn('Speech synthesis not supported in this browser');
+    const audio = new Audio(audioPath);
-    }
+    
    // Error handling with fallback to browser TTS
    audio.onerror = () => {
      console.warn(`Failed to load audio: ${audioPath}`);
      // Fallback to browser TTS
      if ('speechSynthesis' in window) {
        const utterance = new SpeechSynthesisUtterance(text);
        utterance.rate = 1.0;
        utterance.pitch = 1.0;
        utterance.volume = 1.0;
        window.speechSynthesis.speak(utterance);
      }
    };
    // Play the audio
    audio.play().catch(e => {
      console.warn(`Error playing audio: ${e.message}`);
    });
  }
  // Function to highlight the active timestamp in the transcript
@ -243,7 +260,7 @@ document.addEventListener('DOMContentLoaded', function() {
        console.log(`marker! ${index} - ${time} - ${label}`);
        // Speak the marker label and show toast
-        speakText(label);
+        speakText(label, time);
        showToast(label);
        // Highlight the corresponding timestamp in the transcript
--- a/aider/website/docs/recordings/dont-drop-original-read-files.md
+++ b/aider/website/docs/recordings/dont-drop-original-read-files.md
@ -7,6 +7,7 @@ layout: minimal
 # Don't /drop read-only files added at launch
 <script>
 const recording_id = "dont-drop-original-read-files";
 const recording_url = "https://gist.githubusercontent.com/paul-gauthier/c2e7b2751925fb7bb47036cdd37ec40d/raw/08e62ab539e2b5d4b52c15c31d9a0d241377c17c/707583.cast";
 </script>
--- a/scripts/recording_audio.py
+++ b/scripts/recording_audio.py
@ -0,0 +1,135 @@
 #!/usr/bin/env python3
 """
 Generate TTS audio files for recording commentary using OpenAI's API.
 Usage: python scripts/recording_audio.py path/to/recording.md
 """
 import os
 import re
 import sys
 import argparse
 import requests
 from pathlib import Path
 from dotenv import load_dotenv
 # Read a local .env file first so the key lookup below can see its values.
 load_dotenv()
 # Settings used by the functions in this script.
 VOICE = "onyx"  # Options: alloy, echo, fable, onyx, nova, shimmer
 OUTPUT_DIR = "aider/website/assets/audio"
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 def extract_recording_id(markdown_file):
    """Return the recording ID for *markdown_file*.

    The ID is the final path component with its last suffix removed, so
    ``docs/recordings/foo.md`` yields ``foo``.
    """
    source_path = Path(markdown_file)
    return source_path.stem
 def extract_commentary(markdown_file):
    """Extract commentary markers from a recording markdown file.

    Finds the ``## Commentary`` section and reads its ``- M:SS message``
    list items. The section ends at the next line that starts with ``##``
    or at the end of the file.

    Args:
        markdown_file: Path to the markdown file to read.

    Returns:
        A list of ``(time_in_seconds, message)`` tuples in file order, or an
        empty list when the file has no Commentary section.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(markdown_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # The terminator is anchored to the start of a line: an unanchored "##"
    # would end the section at a "##" appearing inside a commentary message
    # and silently drop every marker after it.
    commentary_match = re.search(
        r'## Commentary\s+(.*?)(?=^##|\Z)', content, re.DOTALL | re.MULTILINE
    )
    if not commentary_match:
        print(f"No Commentary section found in {markdown_file}")
        return []
    markers = []
    for line in commentary_match.group(1).strip().split('\n'):
        line = line.strip()
        if not line.startswith('- '):
            continue
        # Drop the "- " list marker, then split "M:SS message".
        match = re.match(r'(\d+):(\d+)\s+(.*)', line[2:])
        if match:
            minutes, seconds, message = match.groups()
            markers.append((int(minutes) * 60 + int(seconds), message))
    return markers
 def generate_audio_openai(text, output_file, timeout=60):
    """Generate audio using OpenAI TTS API.

    Posts *text* to the speech endpoint using the module-level ``VOICE`` and
    ``OPENAI_API_KEY`` and writes the response body to *output_file*.

    Args:
        text: The text to convert to speech.
        output_file: Path of the file the returned audio bytes are written to.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        True when the API answered with status 200 and the file was written;
        False when the key is missing, the API returned another status, or
        an exception occurred (the reason is printed).
    """
    if not OPENAI_API_KEY:
        print("Error: OPENAI_API_KEY environment variable not set")
        return False
    url = "https://api.openai.com/v1/audio/speech"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "tts-1",
        "input": text,
        "voice": VOICE
    }
    try:
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the whole run indefinitely.
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            return False
        with open(output_file, 'wb') as f:
            f.write(response.content)
        return True
    except Exception as e:
        # Deliberately best-effort: report the failure (including timeouts
        # and file-write errors) and let the caller move on.
        print(f"Exception during API call: {e}")
        return False
 def main():
    """Command-line entry point: generate one audio file per commentary marker.

    Parses the command line, extracts the commentary markers from the given
    markdown file and writes ``MM-SS.mp3`` files into
    ``<output-dir>/<recording_id>/``. With ``--dry-run`` it only prints what
    would be generated and creates nothing.
    """
    # The global declaration has to come before any use of VOICE in this
    # function; declaring it after the argparse defaults below read VOICE is
    # a SyntaxError ("name 'VOICE' is used prior to global declaration").
    global VOICE
    parser = argparse.ArgumentParser(description='Generate TTS audio for recording commentary.')
    parser.add_argument('markdown_file', help='Path to the recording markdown file')
    parser.add_argument('--voice', default=VOICE, help=f'OpenAI voice to use (default: {VOICE})')
    parser.add_argument('--output-dir', default=OUTPUT_DIR, help=f'Output directory (default: {OUTPUT_DIR})')
    parser.add_argument('--dry-run', action='store_true', help='Print what would be done without generating audio')
    args = parser.parse_args()
    # Update globals with any command line overrides
    VOICE = args.voice
    recording_id = extract_recording_id(args.markdown_file)
    print(f"Processing recording: {recording_id}")
    # Create output directory (skipped for a dry run so nothing is written)
    output_dir = os.path.join(args.output_dir, recording_id)
    if not args.dry_run:
        os.makedirs(output_dir, exist_ok=True)
    # Extract commentary markers
    markers = extract_commentary(args.markdown_file)
    if not markers:
        print("No commentary markers found!")
        return
    print(f"Found {len(markers)} commentary markers")
    # Generate audio for each marker; files are named MM-SS.mp3 after the
    # marker's timestamp.
    for time_sec, message in markers:
        minutes = time_sec // 60
        seconds = time_sec % 60
        timestamp = f"{minutes:02d}-{seconds:02d}"
        filename = f"{timestamp}.mp3"
        output_file = os.path.join(output_dir, filename)
        print(f"Marker at {minutes}:{seconds:02d} - {message}")
        if args.dry_run:
            print(f"  Would generate: {output_file}")
            continue
        print(f"  Generating: {output_file}")
        if generate_audio_openai(message, output_file):
            print("  ✓ Generated audio file")
        else:
            print("  ✗ Failed to generate audio")
 if __name__ == "__main__":
    main()