diff --git a/aider/gui.py b/aider/gui.py
index 6c5b012dc..bbb2984c8 100755
--- a/aider/gui.py
+++ b/aider/gui.py
@@ -9,6 +9,7 @@ import streamlit as st
 from aider import urls
 from aider.coders import Coder
 from aider.dump import dump  # noqa: F401
+from aider.gui_speech_to_text import SpeechToText
 from aider.io import InputOutput
 from aider.main import main as cli_main
 from aider.scrape import Scraper, has_playwright
@@ -153,6 +154,7 @@ class GUI:
             # self.do_recommended_actions()
 
             self.do_add_to_chat()
+            self.do_speech_to_text()
             self.do_recent_msgs()
             self.do_clear_chat_history()
             # st.container(height=150, border=False)
@@ -211,6 +213,14 @@ class GUI:
         with st.popover("Add a web page to the chat"):
             self.do_web()
 
+    def do_speech_to_text(self):
+        # Initialize the speech-to-text component if not already done
+        if not hasattr(self, "speech_to_text"):
+            self.speech_to_text = SpeechToText()
+
+        # Render the speech-to-text component
+        self.speech_to_text.render()
+
     def do_add_image(self):
         with st.popover("Add image"):
             st.markdown("Hello World 👋")
diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js
new file mode 100644
index 000000000..a8a6fb3e5
--- /dev/null
+++ b/aider/gui_speech_to_text.js
@@ -0,0 +1,196 @@
+(function() {
+    // Generate a unique ID for this component instance
+    const compId = 'st-speech-to-text-' + Math.random().toString(36).substring(2, 9);
+
+    // Find the container element
+    const container = document.getElementById('speech-to-text-container');
+    if (!container) {
+        console.error('Could not find speech-to-text-container');
+        return;
+    }
+
+    // Style the container
+    container.style.display = 'flex';
+    container.style.alignItems = 'center';
+    container.style.padding = '5px';
+    container.style.justifyContent = 'space-between';
+
+    // Create LED indicator
+    const led = document.createElement('div');
+    led.id = 'led-' + compId;
+    led.style.width = '12px';
+    led.style.height = '12px';
+    led.style.borderRadius = '50%';
+    led.style.backgroundColor = 'gray';
+    led.style.marginRight = '10px';
+
+    // Create button
+    const button = document.createElement('button');
+    button.id = 'button-' + compId;
+    button.textContent = 'Voice Input';
+    button.style.padding = '4px 8px';
+
+    // Create stop button (initially hidden)
+    const stopButton = document.createElement('button');
+    stopButton.id = 'stop-button-' + compId;
+    stopButton.textContent = 'Stop';
+    stopButton.style.padding = '4px 8px';
+    stopButton.style.marginLeft = '5px';
+    stopButton.style.display = 'none';
+
+    // Create checkbox and label container
+    const checkContainer = document.createElement('div');
+    checkContainer.style.display = 'flex';
+    checkContainer.style.alignItems = 'center';
+    checkContainer.style.marginLeft = '10px';
+
+    // Create auto-transcribe checkbox
+    const autoTranscribe = document.createElement('input');
+    autoTranscribe.id = 'auto-transcribe-' + compId;
+    autoTranscribe.type = 'checkbox';
+    autoTranscribe.style.marginRight = '5px';
+
+    // Create label for checkbox
+    const label = document.createElement('label');
+    label.htmlFor = autoTranscribe.id;
+    label.textContent = 'Auto Transcribe';
+    label.style.fontSize = '14px';
+    label.style.color = 'white';
+
+    // Assemble components
+    checkContainer.appendChild(autoTranscribe);
+    checkContainer.appendChild(label);
+
+    // Add elements to container
+    container.appendChild(led);
+    container.appendChild(button);
+    container.appendChild(stopButton);
+    container.appendChild(checkContainer);
+
+    // Check if browser supports the Web Speech API
+    if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) {
+        button.disabled = true;
+        button.textContent = 'Not supported';
+        return;
+    }
+
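+    // Note: Streamlit's chat input is a React-controlled textarea, so setting
+    // .value directly is not enough; the helper below also invokes React's own
+    // onChange handler (found via the internal __reactProps$ key) so the new
+    // value actually registers. This relies on a React internal and may break
+    // in future Streamlit versions.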
+    // Function to populate the chat input
+    function populateChatInput(text) {
+        const parentDoc = window.parent.document;
+        const chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]');
+        if (!chatInput) {
+            console.error("Could not find chat input textarea");
+            return false;
+        }
+
+        // Find React's internal props key on the textarea element
+        const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$'));
+        if (!reactProps) {
+            console.error("Could not find React props on chat input");
+            return false;
+        }
+
+        // Append to the existing value
+        chatInput.value = chatInput.value + ' ' + text;
+
+        // Call React's onChange handler with a synthetic event
+        const syntheticEvent = { target: chatInput, currentTarget: chatInput,
+            preventDefault: () => {}, nativeEvent: new Event('input', { bubbles: true })};
+        chatInput[reactProps].onChange(syntheticEvent);
+        return true;
+    }
+
+    // Initialize speech recognition
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const recognition = new SpeechRecognition();
+    let isListening = false;
+
+    recognition.continuous = false;
+    recognition.interimResults = false;
+    // Use browser's language or fall back to 'en-US'
+    recognition.lang = navigator.language || 'en-US';
+    console.log('Speech recognition language:', recognition.lang);
+
+    // Setup button click handler
+    button.addEventListener('click', function() {
+        if (isListening) return;
+
+        isListening = true;
+
+        // Set initial LED color based on auto-transcribe mode
+        if (autoTranscribe.checked) {
+            led.style.backgroundColor = 'red'; // Red when waiting for voice
+            stopButton.style.display = 'inline-block';
+            recognition.continuous = true;
+        } else {
+            led.style.backgroundColor = 'lime';
+            recognition.continuous = false; // Reset in case auto-transcribe was used earlier
+        }
+
+        recognition.start();
+    });
+
+    // Setup stop button click handler
+    stopButton.addEventListener('click', function() {
+        if (isListening) {
+            recognition.stop();
+            stopButton.style.display = 'none';
+            isListening = false;
+        }
+    });
+
+    // Handle speech detection
+    recognition.onspeechstart = function() {
+        console.log('Speech detected');
+        if (autoTranscribe.checked) {
+            led.style.backgroundColor = 'lime'; // Lime green when voice is detected
+        }
+    };
+
+    // Handle speech end
+    recognition.onspeechend = function() {
+        console.log('Speech ended');
+        if (autoTranscribe.checked && isListening) {
+            led.style.backgroundColor = 'red'; // Red when waiting for voice
+        }
+    };
+
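+    // Note: browsers may end a recognition session on their own (for example
+    // after prolonged silence), even with continuous mode enabled; the 'end'
+    // branch below restarts recognition while auto-transcribe is active so
+    // listening appears uninterrupted.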
+    // Combined event handler function for speech recognition events
+    function handleSpeechEvent(eventType, event) {
+        if (eventType === 'result') {
+            // Get the latest transcript
+            const resultIndex = event.resultIndex;
+            const transcript = event.results[resultIndex][0].transcript;
+
+            // Try to populate the chat input directly
+            const success = populateChatInput(transcript);
+            if (!success)
+                console.error('populateChatInput failed');
+
+            // If not in auto-transcribe mode, reset the LED
+            if (!autoTranscribe.checked) {
+                led.style.backgroundColor = 'gray';
+            }
+            // In auto-transcribe mode, we'll keep the LED color as is (lime while speaking)
+            // The LED will be set back to red in the speechend event
+        }
+        else if (eventType === 'error') {
+            console.error('Speech recognition error', event.error);
+            isListening = false;
+            stopButton.style.display = 'none';
+            led.style.backgroundColor = 'gray';
+        }
+        else if (eventType === 'end') {
+            // If auto transcribe is enabled and we're still supposed to be listening,
+            // restart recognition
+            if (autoTranscribe.checked && isListening) {
+                setTimeout(() => recognition.start(), 100);
+            } else {
+                isListening = false;
+                stopButton.style.display = 'none';
+                led.style.backgroundColor = 'gray';
+            }
+        }
+    }
+
+    // Set up event handlers using the combined function
+    recognition.onresult = function(event) { handleSpeechEvent('result', event); };
+    recognition.onerror = function(event) { handleSpeechEvent('error', event); };
+    recognition.onend = function() { handleSpeechEvent('end'); };
+})();
diff --git a/aider/gui_speech_to_text.py b/aider/gui_speech_to_text.py
new file mode 100644
index 000000000..b1f77642a
--- /dev/null
+++ b/aider/gui_speech_to_text.py
@@ -0,0 +1,37 @@
+import base64
+import os
+
+import streamlit as st
+import streamlit.components.v1 as components
+
+
+class SpeechToText:
+    """Class to handle speech-to-text functionality in the GUI"""
+
+    def render(self):
+        """Render the speech-to-text component with LED indicator"""
+        self._js_dir = os.path.dirname(__file__)
+
+        # Create JS file path
+        js_path = os.path.join(self._js_dir, "gui_speech_to_text.js")
+        if not os.path.exists(js_path):
+            st.error(f"JavaScript file not found: {js_path}")
+            return
+
+        # Read the JS file for data URL
+        with open(js_path, "r") as f:
+            js_content = f.read()
+
+        # Create data URL for the JS file
+        js_b64 = base64.b64encode(js_content.encode("utf-8")).decode("utf-8")
+        js_data_url = f"data:text/javascript;base64,{js_b64}"
+
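+        # Note: components.html() renders inside a sandboxed iframe, which
+        # cannot load a script from aider's package directory by file path,
+        # so the JS is inlined as a base64 data: URL instead.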
+        # Create simple HTML component with a container for the JS to populate
+        components.html(
+            f"""
+            <div id="speech-to-text-container"></div>
+            <script src="{js_data_url}"></script>
+            """,
+            height=50,
+        )