commit 24d33d5b94
Matt Flax, 2025-05-14 05:14:33 +00:00 (committed by GitHub)
3 changed files with 243 additions and 0 deletions

aider/gui.py

@@ -9,6 +9,7 @@ import streamlit as st
 from aider import urls
 from aider.coders import Coder
 from aider.dump import dump  # noqa: F401
+from aider.gui_speech_to_text import SpeechToText
 from aider.io import InputOutput
 from aider.main import main as cli_main
 from aider.scrape import Scraper, has_playwright
@@ -153,6 +154,7 @@ class GUI:
         # self.do_recommended_actions()
         self.do_add_to_chat()
+        self.do_speech_to_text()
         self.do_recent_msgs()
         self.do_clear_chat_history()
         # st.container(height=150, border=False)
@@ -211,6 +213,14 @@ class GUI:
         with st.popover("Add a web page to the chat"):
             self.do_web()
 
+    def do_speech_to_text(self):
+        # Initialize the speech-to-text component if not already done
+        if not hasattr(self, "speech_to_text"):
+            self.speech_to_text = SpeechToText()
+
+        # Render the speech-to-text component
+        self.speech_to_text.render()
+
     def do_add_image(self):
         with st.popover("Add image"):
             st.markdown("Hello World 👋")
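
A note on the hasattr guard above: Streamlit re-runs the script on every interaction, so if the GUI instance is rebuilt per run, the guard re-creates the component each time anyway. A minimal sketch of a variant that caches the instance in st.session_state instead; this is a hypothetical alternative, not part of this commit:

import streamlit as st

from aider.gui_speech_to_text import SpeechToText

def do_speech_to_text(self):
    # Hypothetical variant: cache the component across Streamlit reruns
    # in session_state rather than on the GUI instance
    if "speech_to_text" not in st.session_state:
        st.session_state.speech_to_text = SpeechToText()
    st.session_state.speech_to_text.render()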

aider/gui_speech_to_text.js Normal file

@@ -0,0 +1,196 @@
(function() {
    // Generate a unique ID for this component instance
    const compId = 'st-speech-to-text-' + Math.random().toString(36).substring(2, 9);

    // Find the container element
    const container = document.getElementById('speech-to-text-container');
    if (!container) {
        console.error('Could not find speech-to-text-container');
        return;
    }

    // Style the container
    container.style.display = 'flex';
    container.style.alignItems = 'center';
    container.style.padding = '5px';
    container.style.justifyContent = 'space-between';
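
    // LED color scheme used by the handlers below: gray = idle,
    // lime = voice detected, red = waiting for voice (auto-transcribe mode)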
    // Create LED indicator
    const led = document.createElement('div');
    led.id = 'led-' + compId;
    led.style.width = '12px';
    led.style.height = '12px';
    led.style.borderRadius = '50%';
    led.style.backgroundColor = 'gray';
    led.style.marginRight = '10px';

    // Create button
    const button = document.createElement('button');
    button.id = 'button-' + compId;
    button.textContent = 'Voice Input';
    button.style.padding = '4px 8px';

    // Create stop button (initially hidden)
    const stopButton = document.createElement('button');
    stopButton.id = 'stop-button-' + compId;
    stopButton.textContent = 'Stop';
    stopButton.style.padding = '4px 8px';
    stopButton.style.marginLeft = '5px';
    stopButton.style.display = 'none';

    // Create checkbox and label container
    const checkContainer = document.createElement('div');
    checkContainer.style.display = 'flex';
    checkContainer.style.alignItems = 'center';
    checkContainer.style.marginLeft = '10px';

    // Create auto-transcribe checkbox
    const autoTranscribe = document.createElement('input');
    autoTranscribe.id = 'auto-transcribe-' + compId;
    autoTranscribe.type = 'checkbox';
    autoTranscribe.style.marginRight = '5px';

    // Create label for checkbox
    const label = document.createElement('label');
    label.htmlFor = autoTranscribe.id;
    label.textContent = 'Auto Transcribe';
    label.style.fontSize = '14px';
    label.style.color = 'white';

    // Assemble components
    checkContainer.appendChild(autoTranscribe);
    checkContainer.appendChild(label);

    // Add elements to container
    container.appendChild(led);
    container.appendChild(button);
    container.appendChild(stopButton);
    container.appendChild(checkContainer);

    // Check if browser supports the Web Speech API
    if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
        button.disabled = true;
        button.textContent = 'Not supported';
        return;
    }
    // Function to populate the chat input. Streamlit renders its chat box as a
    // React-controlled textarea, so setting .value alone is not enough: we also
    // look up the element's internal __reactProps$ key and invoke its onChange
    // handler with a synthetic event so React registers the new value.
    function populateChatInput(text) {
        const parentDoc = window.parent.document;
        const chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]');
        if (!chatInput) {
            console.error('Could not find chat input textarea');
            return false;
        }

        const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$'));
        if (!reactProps) {
            console.error('Could not find React props on chat input');
            return false;
        }

        // Append to the existing value
        chatInput.value = chatInput.value + ' ' + text;

        // Call React's onChange handler with a synthetic event
        const syntheticEvent = {
            target: chatInput,
            currentTarget: chatInput,
            preventDefault: () => {},
            nativeEvent: new Event('input', { bubbles: true }),
        };
        chatInput[reactProps].onChange(syntheticEvent);
        return true;
    }
    // Initialize speech recognition
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    const recognition = new SpeechRecognition();
    let isListening = false;
    recognition.continuous = false;
    recognition.interimResults = false;

    // Use browser's language or fall back to 'en-US'
    recognition.lang = navigator.language || 'en-US';
    console.log('Speech recognition language:', recognition.lang);

    // Setup button click handler
    button.addEventListener('click', function() {
        if (isListening) return;
        isListening = true;

        // Set initial LED color based on auto-transcribe mode
        if (autoTranscribe.checked) {
            led.style.backgroundColor = 'red'; // Red when waiting for voice
            stopButton.style.display = 'inline-block';
            recognition.continuous = true;
        } else {
            led.style.backgroundColor = 'lime';
        }
        recognition.start();
    });

    // Setup stop button click handler
    stopButton.addEventListener('click', function() {
        if (isListening) {
            recognition.stop();
            stopButton.style.display = 'none';
            isListening = false;
        }
    });
    // Handle speech detection
    recognition.onspeechstart = function() {
        console.log('Speech detected');
        if (autoTranscribe.checked) {
            led.style.backgroundColor = 'lime'; // Lime green when voice is detected
        }
    };

    // Handle speech end
    recognition.onspeechend = function() {
        console.log('Speech ended');
        if (autoTranscribe.checked && isListening) {
            led.style.backgroundColor = 'red'; // Red when waiting for voice
        }
    };

    // Combined event handler function for speech recognition events
    function handleSpeechEvent(eventType, event) {
        if (eventType === 'result') {
            // Get the latest transcript
            const resultIndex = event.resultIndex;
            const transcript = event.results[resultIndex][0].transcript;

            // Try to populate the chat input directly
            const success = populateChatInput(transcript);
            if (!success)
                console.error('populateChatInput failed');

            // If not in auto-transcribe mode, reset the LED. In auto-transcribe
            // mode, keep the LED as is (lime while speaking); the onspeechend
            // handler sets it back to red.
            if (!autoTranscribe.checked) {
                led.style.backgroundColor = 'gray';
            }
        } else if (eventType === 'error') {
            console.error('Speech recognition error', event.error);
            isListening = false;
            stopButton.style.display = 'none';
            led.style.backgroundColor = 'gray';
        } else if (eventType === 'end') {
            // If auto-transcribe is enabled and we're still supposed to be
            // listening, restart recognition
            if (autoTranscribe.checked && isListening) {
                setTimeout(() => recognition.start(), 100);
            } else {
                isListening = false;
                stopButton.style.display = 'none';
                led.style.backgroundColor = 'gray';
            }
        }
    }

    // Set up event handlers using the combined function
    recognition.onresult = function(event) { handleSpeechEvent('result', event); };
    recognition.onerror = function(event) { handleSpeechEvent('error', event); };
    recognition.onend = function() { handleSpeechEvent('end'); };
})();

aider/gui_speech_to_text.py Normal file

@@ -0,0 +1,37 @@
import base64
import os

import streamlit as st
import streamlit.components.v1 as components


class SpeechToText:
    """Class to handle speech-to-text functionality in the GUI"""

    def render(self):
        """Render the speech-to-text component with LED indicator"""
        self._js_dir = os.path.dirname(__file__)

        # Create JS file path
        js_path = os.path.join(self._js_dir, "gui_speech_to_text.js")
        if not os.path.exists(js_path):
            st.error(f"JavaScript file not found: {js_path}")
            return

        # Read the JS file for the data URL
        with open(js_path, "r") as f:
            js_content = f.read()

        # Create a data URL for the JS file
        js_b64 = base64.b64encode(js_content.encode("utf-8")).decode("utf-8")
        js_data_url = f"data:text/javascript;base64,{js_b64}"

        # Create a simple HTML component with a container for the JS to populate
        components.html(
            f"""
            <div id="speech-to-text-container"></div>
            <!-- Load JS via data URL since direct src paths don't work in the Streamlit iframe -->
            <script src="{js_data_url}"></script>
            """,
            height=50,
        )
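
For a quick manual check, the component can be rendered on a bare Streamlit page. This is a hypothetical demo script, not part of the commit; it assumes aider is importable and relies on st.chat_input rendering the textarea (data-testid="stChatInputTextArea") that the JS queries:

# speech_demo.py (hypothetical) -- run with: streamlit run speech_demo.py
import streamlit as st

from aider.gui_speech_to_text import SpeechToText

st.title("Speech-to-text demo")
SpeechToText().render()

# st.chat_input renders the textarea the injected JS writes into
prompt = st.chat_input("Transcribed text lands here")
if prompt:
    st.write(f"Received: {prompt}")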