From e2dc7f6332f01b507e57e4d37c579c6a2a068f0c Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sat, 10 May 2025 20:41:23 +1000 Subject: [PATCH 01/10] Submission : Speech transcription with the web audio API in the browser --- aider/gui.py | 10 +++ aider/gui_speech_to_text.js | 156 ++++++++++++++++++++++++++++++++++++ aider/gui_speech_to_text.py | 37 +++++++++ 3 files changed, 203 insertions(+) create mode 100644 aider/gui_speech_to_text.js create mode 100644 aider/gui_speech_to_text.py diff --git a/aider/gui.py b/aider/gui.py index 6c5b012dc..bbb2984c8 100755 --- a/aider/gui.py +++ b/aider/gui.py @@ -9,6 +9,7 @@ import streamlit as st from aider import urls from aider.coders import Coder from aider.dump import dump # noqa: F401 +from aider.gui_speech_to_text import SpeechToText from aider.io import InputOutput from aider.main import main as cli_main from aider.scrape import Scraper, has_playwright @@ -153,6 +154,7 @@ class GUI: # self.do_recommended_actions() self.do_add_to_chat() + self.do_speech_to_text() self.do_recent_msgs() self.do_clear_chat_history() # st.container(height=150, border=False) @@ -211,6 +213,14 @@ class GUI: with st.popover("Add a web page to the chat"): self.do_web() + def do_speech_to_text(self): + # Initialize the speech-to-text component if not already done + if not hasattr(self, "speech_to_text"): + self.speech_to_text = SpeechToText() + + # Render the speech-to-text component + self.speech_to_text.render() + def do_add_image(self): with st.popover("Add image"): st.markdown("Hello World 👋") diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js new file mode 100644 index 000000000..32b2331e4 --- /dev/null +++ b/aider/gui_speech_to_text.js @@ -0,0 +1,156 @@ +(function() { + // Generate a unique ID for this component instance + const compId = 'st-speech-to-text-' + Math.random().toString(36).substring(2, 9); + + // Find the container element + const container = document.getElementById('speech-to-text-container'); + if (!container) { + console.error('Could not find speech-to-text-container'); + return; + } + + // Style the container + container.style.display = 'flex'; + container.style.alignItems = 'center'; + container.style.padding = '5px'; + container.style.justifyContent = 'space-between'; + + // Create LED indicator + const led = document.createElement('div'); + led.id = 'led-' + compId; + led.style.width = '12px'; + led.style.height = '12px'; + led.style.borderRadius = '50%'; + led.style.backgroundColor = 'gray'; + led.style.marginRight = '10px'; + + // Create button + const button = document.createElement('button'); + button.id = 'button-' + compId; + button.textContent = 'Voice Input'; + button.style.padding = '4px 8px'; + + // Add elements to container + container.appendChild(led); + container.appendChild(button); + + // Check if browser supports the Web Speech API + if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) { + button.disabled = true; + button.textContent = 'Not supported'; + return; + } + + // Function to populate the chat input + function populateChatInput(text) { + const parentDoc = window.parent.document; + let chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]'); + + if (!chatInput) { + console.error("Could not find chat input textarea"); + return false; + } + + try { + // Try to access React internals + const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$')); + + if (reactProps && chatInput[reactProps] && chatInput[reactProps].onChange) { + // If we can access React props, use the onChange handler directly + + // Append to the existing value + chatInput.value = chatInput.value + ' ' + text; + + // Create a synthetic event that React's onChange will accept + const syntheticEvent = { + target: chatInput, + currentTarget: chatInput, + preventDefault: () => {}, + stopPropagation: () => {}, + persist: () => {}, + isDefaultPrevented: () => false, + isPropagationStopped: () => false, + bubbles: true, + cancelable: true, + nativeEvent: new Event('input', { bubbles: true }) + }; + + // Call React's onChange handler + chatInput[reactProps].onChange(syntheticEvent); + } else { + // Fallback to standard DOM approach with multiple event types + + // Focus first + chatInput.focus(); + + // Append to the existing value + chatInput.value = chatInput.value + text; + + // Dispatch multiple event types to ensure detection + ['input', 'change', 'blur', 'keydown', 'keyup'].forEach(eventType => { + const event = new Event(eventType, { bubbles: true, cancelable: true }); + chatInput.dispatchEvent(event); + }); + + // For Streamlit specifically, try to trigger any MutationObservers or polling + // that might be watching for changes + setTimeout(() => { + chatInput.dispatchEvent(new Event('change', { bubbles: true })); + chatInput.dispatchEvent(new Event('blur', { bubbles: true })); + }, 100); + } + + return true; + } catch (err) { + console.error("Error setting chat input value:", err); + return false; + } + } + + // Initialize speech recognition + const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + const recognition = new SpeechRecognition(); + + recognition.continuous = false; + recognition.interimResults = false; + recognition.lang = 'en-US'; + + // Setup button click handler + button.addEventListener('click', function() { + led.style.backgroundColor = 'green'; + recognition.start(); + }); + + recognition.onresult = function(event) { + const transcript = event.results[0][0].transcript; + led.style.backgroundColor = 'gray'; + + // Try to populate the chat input directly + const success = populateChatInput(transcript); + + if (!success) { + console.error('populateChatInput failed'); + + // Add a debug display with the transcribed text + const debugContainer = document.createElement('div'); + debugContainer.style.padding = '10px'; + debugContainer.style.marginTop = '10px'; + debugContainer.style.backgroundColor = '#ffeeee'; + debugContainer.style.border = '1px solid #ffcccc'; + debugContainer.innerHTML = ` +
Debug: Transcribed text
+
+ `; + container.appendChild(debugContainer); + } + }; + + recognition.onerror = function(event) { + console.error('Speech recognition error', event.error); + led.style.backgroundColor = 'gray'; + }; + + recognition.onend = function() { + led.style.backgroundColor = 'gray'; + }; +})(); diff --git a/aider/gui_speech_to_text.py b/aider/gui_speech_to_text.py new file mode 100644 index 000000000..b1f77642a --- /dev/null +++ b/aider/gui_speech_to_text.py @@ -0,0 +1,37 @@ +import base64 +import os + +import streamlit as st +import streamlit.components.v1 as components + + +class SpeechToText: + """Class to handle speech-to-text functionality in the GUI""" + + def render(self): + """Render the speech-to-text component with LED indicator""" + self._js_dir = os.path.dirname(__file__) + + # Create JS file path + js_path = os.path.join(self._js_dir, "gui_speech_to_text.js") + if not os.path.exists(js_path): + st.error(f"JavaScript file not found: {js_path}") + return + + # Read the JS file for data URL + with open(js_path, "r") as f: + js_content = f.read() + + # Create data URL for the JS file + js_b64 = base64.b64encode(js_content.encode("utf-8")).decode("utf-8") + js_data_url = f"data:text/javascript;base64,{js_b64}" + + # Create simple HTML component with a container for the JS to populate + components.html( + f""" +
+ + + """, + height=50, + ) From 4379a74f9d739a531d92d3a54c3344846a638b14 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 19:45:19 +1000 Subject: [PATCH 02/10] SpeechToText js : use only reactProps --- aider/gui_speech_to_text.js | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 32b2331e4..009a0b993 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -52,12 +52,9 @@ } try { - // Try to access React internals const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$')); - if (reactProps && chatInput[reactProps] && chatInput[reactProps].onChange) { - // If we can access React props, use the onChange handler directly - + if (reactProps && chatInput[reactProps] && chatInput[reactProps].onChange) { // Append to the existing value chatInput.value = chatInput.value + ' ' + text; @@ -78,26 +75,8 @@ // Call React's onChange handler chatInput[reactProps].onChange(syntheticEvent); } else { - // Fallback to standard DOM approach with multiple event types - - // Focus first - chatInput.focus(); - - // Append to the existing value - chatInput.value = chatInput.value + text; - - // Dispatch multiple event types to ensure detection - ['input', 'change', 'blur', 'keydown', 'keyup'].forEach(eventType => { - const event = new Event(eventType, { bubbles: true, cancelable: true }); - chatInput.dispatchEvent(event); - }); - - // For Streamlit specifically, try to trigger any MutationObservers or polling - // that might be watching for changes - setTimeout(() => { - chatInput.dispatchEvent(new Event('change', { bubbles: true })); - chatInput.dispatchEvent(new Event('blur', { bubbles: true })); - }, 100); + console.error("Could not find React props on chat input"); + return false; } return true; From 641dc51699bdee02909b09cdbdd124a2cdb3be26 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 19:49:51 +1000 Subject: [PATCH 03/10] SpeechToText js simplify error feedback to console.error --- aider/gui_speech_to_text.js | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 009a0b993..2ef85aeb0 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -96,7 +96,7 @@ // Setup button click handler button.addEventListener('click', function() { - led.style.backgroundColor = 'green'; + led.style.backgroundColor = 'lime'; recognition.start(); }); @@ -106,22 +106,8 @@ // Try to populate the chat input directly const success = populateChatInput(transcript); - - if (!success) { + if (!success) console.error('populateChatInput failed'); - - // Add a debug display with the transcribed text - const debugContainer = document.createElement('div'); - debugContainer.style.padding = '10px'; - debugContainer.style.marginTop = '10px'; - debugContainer.style.backgroundColor = '#ffeeee'; - debugContainer.style.border = '1px solid #ffcccc'; - debugContainer.innerHTML = ` -
Debug: Transcribed text
-
- `; - container.appendChild(debugContainer); - } }; recognition.onerror = function(event) { From edb51373b2bf613217017f617a7952cc6d99aab5 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 19:56:46 +1000 Subject: [PATCH 04/10] SpeechToText js simplify the onChange event code --- aider/gui_speech_to_text.js | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 2ef85aeb0..91a9dd250 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -58,17 +58,11 @@ // Append to the existing value chatInput.value = chatInput.value + ' ' + text; - // Create a synthetic event that React's onChange will accept + // Create a simplified synthetic event with only essential properties const syntheticEvent = { target: chatInput, currentTarget: chatInput, preventDefault: () => {}, - stopPropagation: () => {}, - persist: () => {}, - isDefaultPrevented: () => false, - isPropagationStopped: () => false, - bubbles: true, - cancelable: true, nativeEvent: new Event('input', { bubbles: true }) }; From 323dadfe29a020bfc45feb347ab51b4c8cd03e69 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 20:06:56 +1000 Subject: [PATCH 05/10] SpeechToText js : merge stop event handlers --- aider/gui_speech_to_text.js | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 91a9dd250..436628de7 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -94,22 +94,27 @@ recognition.start(); }); - recognition.onresult = function(event) { - const transcript = event.results[0][0].transcript; + // Combined event handler function for speech recognition events + function handleSpeechEvent(eventType, event) { + // Set LED back to gray for all events led.style.backgroundColor = 'gray'; - // Try to populate the chat input directly - const success = populateChatInput(transcript); - if (!success) - console.error('populateChatInput failed'); - }; + if (eventType === 'result') { + const transcript = event.results[0][0].transcript; + + // Try to populate the chat input directly + const success = populateChatInput(transcript); + if (!success) + console.error('populateChatInput failed'); + } + else if (eventType === 'error') { + console.error('Speech recognition error', event.error); + } + // 'end' event requires no special handling beyond resetting the LED + } - recognition.onerror = function(event) { - console.error('Speech recognition error', event.error); - led.style.backgroundColor = 'gray'; - }; - - recognition.onend = function() { - led.style.backgroundColor = 'gray'; - }; + // Set up event handlers using the combined function + recognition.onresult = function(event) { handleSpeechEvent('result', event); }; + recognition.onerror = function(event) { handleSpeechEvent('error', event); }; + recognition.onend = function() { handleSpeechEvent('end'); }; })(); From f3bf5b6a6bd30b3e6a62af40e84caec386b4b259 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 20:07:16 +1000 Subject: [PATCH 06/10] SpeechToTest js : autodetect language --- aider/gui_speech_to_text.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 436628de7..afe8c1854 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -86,7 +86,9 @@ recognition.continuous = false; recognition.interimResults = false; - recognition.lang = 'en-US'; + // Use browser's language or fall back to 'en-US' + recognition.lang = navigator.language || 'en-US'; + console.log('Speech recognition language:', recognition.lang); // Setup button click handler button.addEventListener('click', function() { From 9006ffc3c19d0bf7081f3f4a574da4fe4fae0e08 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Sun, 11 May 2025 20:20:38 +1000 Subject: [PATCH 07/10] SpeechToText js : Simplify the populateChatInput function --- aider/gui_speech_to_text.js | 42 ++++++++++++------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index afe8c1854..8cd99d2d1 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -45,39 +45,23 @@ function populateChatInput(text) { const parentDoc = window.parent.document; let chatInput = parentDoc.querySelector('textarea[data-testid="stChatInputTextArea"]'); + const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$')); + const syntheticEvent = { target: chatInput, currentTarget: chatInput, + preventDefault: () => {}, nativeEvent: new Event('input', { bubbles: true })}; - if (!chatInput) { - console.error("Could not find chat input textarea"); + if (!chatInput || !reactProps) { + if (!chatInput) + console.error("Could not find chat input textarea"); + if (!reactProps) + console.error("Error setting chat input value:", err); return false; } - try { - const reactProps = Object.keys(chatInput).find(key => key.startsWith('__reactProps$')); - - if (reactProps && chatInput[reactProps] && chatInput[reactProps].onChange) { - // Append to the existing value - chatInput.value = chatInput.value + ' ' + text; - - // Create a simplified synthetic event with only essential properties - const syntheticEvent = { - target: chatInput, - currentTarget: chatInput, - preventDefault: () => {}, - nativeEvent: new Event('input', { bubbles: true }) - }; - - // Call React's onChange handler - chatInput[reactProps].onChange(syntheticEvent); - } else { - console.error("Could not find React props on chat input"); - return false; - } - - return true; - } catch (err) { - console.error("Error setting chat input value:", err); - return false; - } + // Append to the existing value + chatInput.value = chatInput.value + ' ' + text; + // Call React's onChange handler + chatInput[reactProps].onChange(syntheticEvent); + return true; } // Initialize speech recognition From e568604422ed05824eafcc708466926e62e49a47 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Wed, 14 May 2025 13:47:26 +1000 Subject: [PATCH 08/10] add auto-transcription mode --- aider/gui_speech_to_text.js | 83 +++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 8cd99d2d1..a7201b191 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -30,9 +30,42 @@ button.textContent = 'Voice Input'; button.style.padding = '4px 8px'; + // Create stop button (initially hidden) + const stopButton = document.createElement('button'); + stopButton.id = 'stop-button-' + compId; + stopButton.textContent = 'Stop'; + stopButton.style.padding = '4px 8px'; + stopButton.style.marginLeft = '5px'; + stopButton.style.display = 'none'; + + // Create checkbox and label container + const checkContainer = document.createElement('div'); + checkContainer.style.display = 'flex'; + checkContainer.style.alignItems = 'center'; + checkContainer.style.marginLeft = '10px'; + + // Create auto-transcribe checkbox + const autoTranscribe = document.createElement('input'); + autoTranscribe.id = 'auto-transcribe-' + compId; + autoTranscribe.type = 'checkbox'; + autoTranscribe.style.marginRight = '5px'; + + // Create label for checkbox + const label = document.createElement('label'); + label.htmlFor = autoTranscribe.id; + label.textContent = 'Auto Transcribe'; + label.style.fontSize = '14px'; + label.style.color = 'white'; + + // Assemble components + checkContainer.appendChild(autoTranscribe); + checkContainer.appendChild(label); + // Add elements to container container.appendChild(led); container.appendChild(button); + container.appendChild(stopButton); + container.appendChild(checkContainer); // Check if browser supports the Web Speech API if (!('webkitSpeechRecognition' in window) && !('SpeechRecognition' in window)) { @@ -67,6 +100,7 @@ // Initialize speech recognition const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; const recognition = new SpeechRecognition(); + let isListening = false; recognition.continuous = false; recognition.interimResults = false; @@ -76,15 +110,38 @@ // Setup button click handler button.addEventListener('click', function() { + if (isListening) return; + + isListening = true; led.style.backgroundColor = 'lime'; recognition.start(); + + // Show stop button if auto transcribe is enabled + if (autoTranscribe.checked) { + stopButton.style.display = 'inline-block'; + recognition.continuous = true; + } }); + // Setup stop button click handler + stopButton.addEventListener('click', function() { + if (isListening) { + recognition.stop(); + stopButton.style.display = 'none'; + isListening = false; + } + }); + + // Handle speech detection + recognition.onspeechstart = function() { + console.log('Speech detected'); + if (autoTranscribe.checked) { + led.style.backgroundColor = 'red'; + } + }; + // Combined event handler function for speech recognition events function handleSpeechEvent(eventType, event) { - // Set LED back to gray for all events - led.style.backgroundColor = 'gray'; - if (eventType === 'result') { const transcript = event.results[0][0].transcript; @@ -92,11 +149,29 @@ const success = populateChatInput(transcript); if (!success) console.error('populateChatInput failed'); + + // If not in auto-transcribe mode, reset the LED + if (!autoTranscribe.checked) { + led.style.backgroundColor = 'gray'; + } } else if (eventType === 'error') { console.error('Speech recognition error', event.error); + isListening = false; + stopButton.style.display = 'none'; + led.style.backgroundColor = 'gray'; + } + else if (eventType === 'end') { + // If auto transcribe is enabled and we're still supposed to be listening, + // restart recognition + if (autoTranscribe.checked && isListening) { + setTimeout(() => recognition.start(), 100); + } else { + isListening = false; + stopButton.style.display = 'none'; + led.style.backgroundColor = 'gray'; + } } - // 'end' event requires no special handling beyond resetting the LED } // Set up event handlers using the combined function From bfdca0509193e57d10832cf522d9bf2d3de968b9 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Wed, 14 May 2025 13:51:11 +1000 Subject: [PATCH 09/10] auto transcription with led changes --- aider/gui_speech_to_text.js | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index a7201b191..81b08ad56 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -113,14 +113,17 @@ if (isListening) return; isListening = true; - led.style.backgroundColor = 'lime'; - recognition.start(); - // Show stop button if auto transcribe is enabled + // Set initial LED color based on auto-transcribe mode if (autoTranscribe.checked) { + led.style.backgroundColor = 'red'; // Red when waiting for voice stopButton.style.display = 'inline-block'; recognition.continuous = true; + } else { + led.style.backgroundColor = 'lime'; } + + recognition.start(); }); // Setup stop button click handler @@ -136,14 +139,24 @@ recognition.onspeechstart = function() { console.log('Speech detected'); if (autoTranscribe.checked) { - led.style.backgroundColor = 'red'; + led.style.backgroundColor = 'green'; // Green when voice is detected + } + }; + + // Handle speech end + recognition.onspeechend = function() { + console.log('Speech ended'); + if (autoTranscribe.checked && isListening) { + led.style.backgroundColor = 'red'; // Red when waiting for voice } }; // Combined event handler function for speech recognition events function handleSpeechEvent(eventType, event) { if (eventType === 'result') { - const transcript = event.results[0][0].transcript; + // Get the latest transcript + const resultIndex = event.resultIndex; + const transcript = event.results[resultIndex][0].transcript; // Try to populate the chat input directly const success = populateChatInput(transcript); @@ -153,6 +166,8 @@ // If not in auto-transcribe mode, reset the LED if (!autoTranscribe.checked) { led.style.backgroundColor = 'gray'; + } else { + led.style.backgroundColor = 'red'; // Back to red after processing result } } else if (eventType === 'error') { From adeb0dd4dddd3d69b0707d1a2d165486e612f194 Mon Sep 17 00:00:00 2001 From: Matt Flax Date: Wed, 14 May 2025 14:02:03 +1000 Subject: [PATCH 10/10] LED stays green in autotranscribe mode --- aider/gui_speech_to_text.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aider/gui_speech_to_text.js b/aider/gui_speech_to_text.js index 81b08ad56..a8a6fb3e5 100644 --- a/aider/gui_speech_to_text.js +++ b/aider/gui_speech_to_text.js @@ -139,7 +139,7 @@ recognition.onspeechstart = function() { console.log('Speech detected'); if (autoTranscribe.checked) { - led.style.backgroundColor = 'green'; // Green when voice is detected + led.style.backgroundColor = 'lime'; // Lime green when voice is detected } }; @@ -166,9 +166,9 @@ // If not in auto-transcribe mode, reset the LED if (!autoTranscribe.checked) { led.style.backgroundColor = 'gray'; - } else { - led.style.backgroundColor = 'red'; // Back to red after processing result } + // In auto-transcribe mode, we'll keep the LED color as is (lime while speaking) + // The LED will be set back to red in the speechend event } else if (eventType === 'error') { console.error('Speech recognition error', event.error);