Submission: Speech transcription with the Web Audio API in the browser

This commit is contained in:
Matt Flax 2025-05-10 20:41:23 +10:00
parent 3daf7d4df3
commit e2dc7f6332
3 changed files with 203 additions and 0 deletions

View file

@ -9,6 +9,7 @@ import streamlit as st
from aider import urls
from aider.coders import Coder
from aider.dump import dump # noqa: F401
from aider.gui_speech_to_text import SpeechToText
from aider.io import InputOutput
from aider.main import main as cli_main
from aider.scrape import Scraper, has_playwright
@ -153,6 +154,7 @@ class GUI:
# self.do_recommended_actions()
self.do_add_to_chat()
self.do_speech_to_text()
self.do_recent_msgs()
self.do_clear_chat_history()
# st.container(height=150, border=False)
@ -211,6 +213,14 @@ class GUI:
with st.popover("Add a web page to the chat"):
self.do_web()
def do_speech_to_text(self):
    """Show the speech-to-text widget, creating the component lazily on first use."""
    stt = getattr(self, "speech_to_text", None)
    if stt is None:
        # Cache on the instance so reruns reuse one component object.
        stt = SpeechToText()
        self.speech_to_text = stt
    stt.render()
def do_add_image(self):
with st.popover("Add image"):
st.markdown("Hello World 👋")

156
aider/gui_speech_to_text.js Normal file
View file

@ -0,0 +1,156 @@
(function() {
    // Unique suffix so element IDs from multiple instances never collide.
    const compId = 'st-speech-to-text-' + Math.random().toString(36).substring(2, 9);

    // The component HTML (rendered by the Python side) provides this container.
    const container = document.getElementById('speech-to-text-container');
    if (!container) {
        console.error('Could not find speech-to-text-container');
        return;
    }

    // Arrange the LED indicator and the button on one row.
    container.style.display = 'flex';
    container.style.alignItems = 'center';
    container.style.padding = '5px';
    container.style.justifyContent = 'space-between';

    // LED: gray when idle, green while listening.
    const led = document.createElement('div');
    led.id = 'led-' + compId;
    led.style.width = '12px';
    led.style.height = '12px';
    led.style.borderRadius = '50%';
    led.style.backgroundColor = 'gray';
    led.style.marginRight = '10px';

    // Button that starts a single recognition session.
    const button = document.createElement('button');
    button.id = 'button-' + compId;
    button.textContent = 'Voice Input';
    button.style.padding = '4px 8px';

    container.appendChild(led);
    container.appendChild(button);

    // Bail out gracefully on browsers without the Web Speech API
    // (only the prefixed webkitSpeechRecognition exists in Chromium).
    if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
        button.disabled = true;
        button.textContent = 'Not supported';
        return;
    }

    // Append `text` to Streamlit's chat input in the parent document.
    // Returns true on success, false when the textarea cannot be found
    // or updating it throws.
    function populateChatInput(text) {
        const parentDoc = window.parent.document;
        let chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]');
        if (!chatInput) {
            console.error("Could not find chat input textarea");
            return false;
        }

        try {
            // Streamlit's chat input is a controlled React component, so a
            // plain DOM value assignment is not enough on its own: React has
            // to observe an onChange. Look up its internal props object.
            const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$'));
            if (reactProps && chatInput[reactProps] && chatInput[reactProps].onChange) {
                // Append with a space so consecutive dictations stay separated.
                chatInput.value = chatInput.value + ' ' + text;

                // Minimal synthetic event shape that React's onChange accepts.
                const syntheticEvent = {
                    target: chatInput,
                    currentTarget: chatInput,
                    preventDefault: () => {},
                    stopPropagation: () => {},
                    persist: () => {},
                    isDefaultPrevented: () => false,
                    isPropagationStopped: () => false,
                    bubbles: true,
                    cancelable: true,
                    nativeEvent: new Event('input', { bubbles: true })
                };
                chatInput[reactProps].onChange(syntheticEvent);
            } else {
                // Fallback: plain DOM events and hope something is listening.
                chatInput.focus();

                // BUGFIX: use the same ' ' separator as the React path above
                // (previously this branch concatenated without a space, so
                // consecutive dictations ran together into one word).
                chatInput.value = chatInput.value + ' ' + text;

                // Fire several event types to maximize the chance of detection.
                ['input', 'change', 'blur', 'keydown', 'keyup'].forEach(eventType => {
                    const event = new Event(eventType, { bubbles: true, cancelable: true });
                    chatInput.dispatchEvent(event);
                });

                // Nudge any MutationObservers or polling Streamlit might use.
                setTimeout(() => {
                    chatInput.dispatchEvent(new Event('change', { bubbles: true }));
                    chatInput.dispatchEvent(new Event('blur', { bubbles: true }));
                }, 100);
            }
            return true;
        } catch (err) {
            console.error("Error setting chat input value:", err);
            return false;
        }
    }

    // One-shot, final-results-only English recognition.
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    const recognition = new SpeechRecognition();
    recognition.continuous = false;
    recognition.interimResults = false;
    recognition.lang = 'en-US';

    button.addEventListener('click', function() {
        led.style.backgroundColor = 'green';
        recognition.start();
    });

    recognition.onresult = function(event) {
        const transcript = event.results[0][0].transcript;
        led.style.backgroundColor = 'gray';

        const success = populateChatInput(transcript);
        if (!success) {
            console.error('populateChatInput failed');
            // Surface the transcript inside the component itself so the user
            // can still copy it manually when the chat input is unreachable.
            const debugContainer = document.createElement('div');
            debugContainer.style.padding = '10px';
            debugContainer.style.marginTop = '10px';
            debugContainer.style.backgroundColor = '#ffeeee';
            debugContainer.style.border = '1px solid #ffcccc';
            debugContainer.innerHTML = `
                <div><strong>Debug: Transcribed text</strong></div>
                <div><textarea style="width: 100%; height: 80px;">${transcript}</textarea></div>
            `;
            container.appendChild(debugContainer);
        }
    };

    recognition.onerror = function(event) {
        console.error('Speech recognition error', event.error);
        led.style.backgroundColor = 'gray';
    };

    recognition.onend = function() {
        led.style.backgroundColor = 'gray';
    };
})();

View file

@ -0,0 +1,37 @@
import base64
import os
import streamlit as st
import streamlit.components.v1 as components
class SpeechToText:
    """Browser-based speech-to-text widget for the GUI.

    The actual capture and transcription happens in gui_speech_to_text.js,
    which is injected into a Streamlit HTML component as a base64 data URL
    and writes the transcript into the chat input of the parent page.
    """

    # Class-level cache of the JS data URL: the file is read and base64
    # encoded once per process instead of on every Streamlit rerun.
    _js_data_url = None

    def render(self):
        """Render the speech-to-text component with LED indicator."""
        self._js_dir = os.path.dirname(__file__)
        js_path = os.path.join(self._js_dir, "gui_speech_to_text.js")

        if type(self)._js_data_url is None:
            if not os.path.exists(js_path):
                st.error(f"JavaScript file not found: {js_path}")
                return
            # Read explicitly as UTF-8: the platform default encoding is not
            # guaranteed to decode the JS source correctly everywhere.
            with open(js_path, "r", encoding="utf-8") as f:
                js_content = f.read()
            js_b64 = base64.b64encode(js_content.encode("utf-8")).decode("utf-8")
            type(self)._js_data_url = f"data:text/javascript;base64,{js_b64}"

        # Minimal host HTML: the script locates this container and builds the
        # LED + button UI inside it.
        components.html(
            f"""
            <div id="speech-to-text-container"></div>
            <!-- Load JS file via data URL since direct src paths don't work in Streamlit iframe -->
            <script src="{type(self)._js_data_url}"></script>
            """,
            height=50,
        )