From 45c58752e573b9ec12d734ce96534e4d55bf83f9 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Fri, 30 May 2025 16:47:31 +0200
Subject: [PATCH] feat(ui): add audio upload button in chat view (#5526)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/http/static/chat.js  | 340 +++++++++++++++++++-------------------
 core/http/views/chat.html |  20 ++-
 go.mod                    |   1 +
 go.sum                    |   2 +
 pkg/utils/base64.go       |  18 +-
 5 files changed, 201 insertions(+), 180 deletions(-)

diff --git a/core/http/static/chat.js b/core/http/static/chat.js
index 0dce445b..34f582aa 100644
--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -49,12 +49,13 @@ function submitSystemPrompt(event) {
 }
 
 var image = "";
+var audio = "";
 
 function submitPrompt(event) {
   event.preventDefault();
 
   const input = document.getElementById("input").value;
-  Alpine.store("chat").add("user", input, image);
+  Alpine.store("chat").add("user", input, image, audio);
   document.getElementById("input").value = "";
   const systemPrompt = localStorage.getItem("system_prompt");
   Alpine.nextTick(() => { document.getElementById('messages').scrollIntoView(false); });
@@ -62,7 +63,6 @@ function submitPrompt(event) {
 }
 
 function readInputImage() {
-
   if (!this.files || !this.files[0]) return;
 
   const FR = new FileReader();
@@ -74,35 +74,47 @@ function readInputImage() {
   FR.readAsDataURL(this.files[0]);
 }
 
+function readInputAudio() { // "change" listener for the #input_audio file input; `this` is that input element
+  if (!this.files || !this.files[0]) return; // no file selected: leave the module-level `audio` untouched
 
-  async function promptGPT(systemPrompt, input) {
-    const model = document.getElementById("chat-model").value;
-    // Set class "loader" to the element with "loader" id
-    //document.getElementById("loader").classList.add("loader");
-    // Make the "loader" visible
-    toggleLoader(true);
+  const FR = new FileReader(); // reads the selected file asynchronously
 
+  FR.addEventListener("load", function(evt) { // fires once the read started below has completed
+    audio = evt.target.result; // result of readAsDataURL: the file encoded as a data: URL string
+  });
 
-    messages = Alpine.store("chat").messages();
+  FR.readAsDataURL(this.files[0]); // only the first selected file is read
+}
 
-    // if systemPrompt isn't empty, push it at the start of messages
-    if (systemPrompt) {
-      messages.unshift({
-        role: "system",
-        content: systemPrompt
-      });
-    }
+async function promptGPT(systemPrompt, input) {
+  const model = document.getElementById("chat-model").value;
+  // Set class "loader" to the element with "loader" id
+  //document.getElementById("loader").classList.add("loader");
+  // Make the "loader" visible
+  toggleLoader(true);
 
-    // loop all messages, and check if there are images. If there are, we need to change the content field
-    messages.forEach((message) => {
+  messages = Alpine.store("chat").messages();
+
+  // if systemPrompt isn't empty, push it at the start of messages
+  if (systemPrompt) {
+    messages.unshift({
+      role: "system",
+      content: systemPrompt
+    });
+  }
+
+  // Loop over all messages and check whether any carry an image or audio attachment. If so, the content field must be rewritten as an array of parts
+  messages.forEach((message) => {
+    if (message.image || message.audio) {
+      // The content field now becomes an array
+      message.content = [
+        {
+          "type": "text",
+          "text": message.content
+        }
+      ]
+      
       if (message.image) {
-        // The content field now becomes an array
-        message.content = [
-          {
-            "type": "text",
-            "text": message.content
-          }
-        ]
         message.content.push(
           {
             "type": "image_url",
@@ -111,168 +123,154 @@ function readInputImage() {
             }
           }
         );
-
-        // remove the image field
         delete message.image;
       }
-    });
 
-       // reset the form and the image
-       image = "";
-       document.getElementById("input_image").value = null;
-       document.getElementById("fileName").innerHTML = "";
-
-    // if (image) {
-    //   // take the last element content's and add the image
-    //   last_message = messages[messages.length - 1]
-    //   // The content field now becomes an array
-    //   last_message.content = [
-    //     {
-    //       "type": "text",
-    //       "text": last_message.content
-    //     }
-    //    ]
-    //   last_message.content.push(
-    //     {
-    //       "type": "image_url",
-    //       "image_url": {
-    //         "url": image,
-    //       }
-    //     }
-    //   );
-    //   // and we replace it in the messages array
-    //   messages[messages.length - 1] = last_message
-
-    //   // reset the form and the image
-    //   image = "";
-    //   document.getElementById("input_image").value = null;
-    //   document.getElementById("fileName").innerHTML = "";
-    // }
-
-    // Source: https://stackoverflow.com/a/75751803/11386095
-    const response = await fetch("v1/chat/completions", {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        model: model,
-        messages: messages,
-        stream: true,
-      }),
-    });
-
-    if (!response.ok) {
-      Alpine.store("chat").add(
-        "assistant",
-        `<span class='error'>Error: POST /v1/chat/completions ${response.status}</span>`,
-      );
-      return;
-    }
-
-    const reader = response.body
-      ?.pipeThrough(new TextDecoderStream())
-      .getReader();
-
-    if (!reader) {
-      Alpine.store("chat").add(
-        "assistant",
-        `<span class='error'>Error: Failed to decode API response</span>`,
-      );
-      return;
-    }
-
-    // Function to add content to the chat and handle DOM updates efficiently
-    const addToChat = (token) => {
-      const chatStore = Alpine.store("chat");
-      chatStore.add("assistant", token);
-      // Efficiently scroll into view without triggering multiple reflows
-      // const messages = document.getElementById('messages');
-      // messages.scrollTop = messages.scrollHeight;
-    };
-
-    let buffer = "";
-    let contentBuffer = [];
-
-    try {
-      while (true) {
-        const { value, done } = await reader.read();
-        if (done) break;
-
-        buffer += value;
-
-        let lines = buffer.split("\n");
-        buffer = lines.pop(); // Retain any incomplete line in the buffer
-
-        lines.forEach((line) => {
-          if (line.length === 0 || line.startsWith(":")) return;
-          if (line === "data: [DONE]") {
-            return;
-          }
-
-          if (line.startsWith("data: ")) {
-            try {
-              const jsonData = JSON.parse(line.substring(6));
-              const token = jsonData.choices[0].delta.content;
-
-              if (token) {
-                contentBuffer.push(token);
-              }
-            } catch (error) {
-              console.error("Failed to parse line:", line, error);
+      if (message.audio) {
+        message.content.push(
+          {
+            "type": "audio_url",
+            "audio_url": {
+              "url": message.audio,
             }
           }
-        });
-
-        // Efficiently update the chat in batch
-        if (contentBuffer.length > 0) {
-          addToChat(contentBuffer.join(""));
-          contentBuffer = [];
-        }
+        );
+        delete message.audio;
       }
+    }
+  });
 
-      // Final content flush if any data remains
+  // reset the form and the files
+  image = "";
+  audio = "";
+  document.getElementById("input_image").value = null;
+  document.getElementById("input_audio").value = null;
+  document.getElementById("fileName").innerHTML = "";
+
+  // Source: https://stackoverflow.com/a/75751803/11386095
+  const response = await fetch("v1/chat/completions", {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: model,
+      messages: messages,
+      stream: true,
+    }),
+  });
+
+  if (!response.ok) {
+    Alpine.store("chat").add(
+      "assistant",
+      `<span class='error'>Error: POST /v1/chat/completions ${response.status}</span>`,
+    );
+    return;
+  }
+
+  const reader = response.body
+    ?.pipeThrough(new TextDecoderStream())
+    .getReader();
+
+  if (!reader) {
+    Alpine.store("chat").add(
+      "assistant",
+      `<span class='error'>Error: Failed to decode API response</span>`,
+    );
+    return;
+  }
+
+  // Function to add content to the chat and handle DOM updates efficiently
+  const addToChat = (token) => {
+    const chatStore = Alpine.store("chat");
+    chatStore.add("assistant", token);
+    // Efficiently scroll into view without triggering multiple reflows
+    // const messages = document.getElementById('messages');
+    // messages.scrollTop = messages.scrollHeight;
+  };
+
+  let buffer = "";
+  let contentBuffer = [];
+
+  try {
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) break;
+
+      buffer += value;
+
+      let lines = buffer.split("\n");
+      buffer = lines.pop(); // Retain any incomplete line in the buffer
+
+      lines.forEach((line) => {
+        if (line.length === 0 || line.startsWith(":")) return;
+        if (line === "data: [DONE]") {
+          return;
+        }
+
+        if (line.startsWith("data: ")) {
+          try {
+            const jsonData = JSON.parse(line.substring(6));
+            const token = jsonData.choices[0].delta.content;
+
+            if (token) {
+              contentBuffer.push(token);
+            }
+          } catch (error) {
+            console.error("Failed to parse line:", line, error);
+          }
+        }
+      });
+
+      // Efficiently update the chat in batch
       if (contentBuffer.length > 0) {
         addToChat(contentBuffer.join(""));
+        contentBuffer = [];
       }
-
-      // Highlight all code blocks once at the end
-      hljs.highlightAll();
-    } catch (error) {
-      console.error("An error occurred while reading the stream:", error);
-      Alpine.store("chat").add(
-        "assistant",
-        `<span class='error'>Error: Failed to process stream</span>`,
-      );
-    } finally {
-      // Perform any cleanup if necessary
-      reader.releaseLock();
     }
 
-    // Remove class "loader" from the element with "loader" id
-    toggleLoader(false);
+    // Final content flush if any data remains
+    if (contentBuffer.length > 0) {
+      addToChat(contentBuffer.join(""));
+    }
 
-    // scroll to the bottom of the chat
-    document.getElementById('messages').scrollIntoView(false)
-    // set focus to the input
-    document.getElementById("input").focus();
+    // Highlight all code blocks once at the end
+    hljs.highlightAll();
+  } catch (error) {
+    console.error("An error occurred while reading the stream:", error);
+    Alpine.store("chat").add(
+      "assistant",
+      `<span class='error'>Error: Failed to process stream</span>`,
+    );
+  } finally {
+    // Perform any cleanup if necessary
+    reader.releaseLock();
   }
 
-  document.getElementById("system_prompt").addEventListener("submit", submitSystemPrompt);
+  // Remove class "loader" from the element with "loader" id
+  toggleLoader(false);
 
-  document.getElementById("prompt").addEventListener("submit", submitPrompt);
+  // scroll to the bottom of the chat
+  document.getElementById('messages').scrollIntoView(false)
+  // set focus to the input
   document.getElementById("input").focus();
-  document.getElementById("input_image").addEventListener("change", readInputImage);
+}
 
-  storesystemPrompt = localStorage.getItem("system_prompt");
-  if (storesystemPrompt) {
-    document.getElementById("systemPrompt").value = storesystemPrompt;
-  } else {
-    document.getElementById("systemPrompt").value = null;
-  }
+document.getElementById("system_prompt").addEventListener("submit", submitSystemPrompt); // system prompt form -> submitSystemPrompt
+document.getElementById("prompt").addEventListener("submit", submitPrompt); // chat form -> submitPrompt
+document.getElementById("input").focus(); // put the cursor in the chat input on load
+document.getElementById("input_image").addEventListener("change", readInputImage); // image picked -> readInputImage
+document.getElementById("input_audio").addEventListener("change", readInputAudio); // audio picked -> readInputAudio
 
-  marked.setOptions({
-    highlight: function (code) {
-      return hljs.highlightAuto(code).value;
-    },
-  });
+storesystemPrompt = localStorage.getItem("system_prompt"); // NOTE(review): no declaration visible in this patch, so this looks like an implicit global - confirm
+if (storesystemPrompt) { // a non-empty value was stored: show it in the #systemPrompt field
+  document.getElementById("systemPrompt").value = storesystemPrompt;
+} else { // nothing stored (or empty string): reset the field
+  document.getElementById("systemPrompt").value = null;
+}
+
+marked.setOptions({ // configure the markdown renderer's code highlighting hook
+  highlight: function (code) { // receives the raw code text of a block
+    return hljs.highlightAuto(code).value; // let highlight.js auto-detect the language and return its output
+  },
+});
diff --git a/core/http/views/chat.html b/core/http/views/chat.html
index 66e9b1da..053385d1 100644
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -218,6 +218,8 @@ SOFTWARE.
             Start chatting with the AI by typing a prompt in the input field below and pressing Enter.
             For models that support images, you can upload an image by clicking the paperclip
             <i class="fa-solid fa-paperclip"></i> icon.
+            For models that support audio, you can upload an audio file by clicking the microphone
+            <i class="fa-solid fa-microphone"></i> icon.
           </p>
           <div id="messages" class="max-w-3xl mx-auto">
             <template x-for="message in history">
@@ -290,6 +292,12 @@ SOFTWARE.
                   class="fa-solid fa-paperclip text-gray-400 absolute right-12 top-4 text-lg p-2 hover:text-blue-400 transition-colors duration-200"
                   title="Attach an image"
                 ></button>
+                <button
+                  type="button"
+                  onclick="document.getElementById('input_audio').click()"
+                  class="fa-solid fa-microphone text-gray-400 absolute right-20 top-4 text-lg p-2 hover:text-blue-400 transition-colors duration-200"
+                  title="Attach an audio file"
+                ></button>
                 
                 <!-- Send button and loader in the same position -->
                 <div class="absolute right-3 top-4">
@@ -320,6 +328,13 @@ SOFTWARE.
               style="display: none;"
               @change="fileName = $event.target.files[0].name"
             />
+            <input
+              id="input_audio"
+              type="file"
+              accept="audio/*"
+              style="display: none;"
+              @change="fileName = $event.target.files[0].name"
+            />
           </div>
           </form>
         </div>
@@ -381,7 +396,7 @@ SOFTWARE.
           clear() {
             this.history.length = 0;
           },
-          add(role, content, image) {
+          add(role, content, image, audio) {
             const N = this.history.length - 1;
             if (this.history.length && this.history[N].role === role) {
               this.history[N].content += content;
@@ -394,7 +409,7 @@ SOFTWARE.
               lines.forEach((line) => {
                 c += DOMPurify.sanitize(marked.parse(line));
               });
-              this.history.push({ role, content, html: c, image });
+              this.history.push({ role, content, html: c, image, audio });
             }
             document.getElementById('messages').scrollIntoView(false);
             const parser = new DOMParser();
@@ -418,6 +433,7 @@ SOFTWARE.
               role: message.role,
               content: message.content,
               image: message.image,
+              audio: message.audio,
             }));
           },
         });
diff --git a/go.mod b/go.mod
index 4d1079cb..57fc0947 100644
--- a/go.mod
+++ b/go.mod
@@ -82,6 +82,7 @@ require (
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/morikuni/aec v1.0.0 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e // indirect
 	github.com/pion/datachannel v1.5.10 // indirect
 	github.com/pion/dtls/v2 v2.2.12 // indirect
 	github.com/pion/dtls/v3 v3.0.4 // indirect
diff --git a/go.sum b/go.sum
index dae92fae..5ebe9ac1 100644
--- a/go.sum
+++ b/go.sum
@@ -546,6 +546,8 @@ github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/
 github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
 github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
 github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8=
+github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e h1:s2RNOM/IGdY0Y6qfTeUKhDawdHDpK9RGBdx80qN4Ttw=
+github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e/go.mod h1:nBdnFKj15wFbf94Rwfq4m30eAcyY9V/IyKAGQFtqkW0=
 github.com/otiai10/mint v1.6.1 h1:kgbTJmOpp/0ce7hk3H8jiSuR0MXmpwWRfqUdKww17qg=
 github.com/otiai10/mint v1.6.1/go.mod h1:MJm72SBthJjz8qhefc4z1PYEieWmy8Bku7CjcAqyUSM=
 github.com/otiai10/openaigo v1.7.0 h1:AOQcOjRRM57ABvz+aI2oJA/Qsz1AydKbdZAlGiKyCqg=
diff --git a/pkg/utils/base64.go b/pkg/utils/base64.go
index 50109eaa..174e80fa 100644
--- a/pkg/utils/base64.go
+++ b/pkg/utils/base64.go
@@ -5,14 +5,19 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"regexp"
 	"strings"
 	"time"
+
+	"github.com/rs/zerolog/log"
 )
 
 var base64DownloadClient http.Client = http.Client{
 	Timeout: 30 * time.Second,
 }
 
+var dataURIPattern = regexp.MustCompile(`^data:[^,]*;base64,`) // RFC 2397 header; the media type may be omitted or carry ";name=value" parameters before ";base64,"
+
 // GetContentURIAsBase64 checks if the string is an URL, if it's an URL downloads the content in memory encodes it in base64 and returns the base64 string, otherwise returns the string by stripping base64 data headers
 func GetContentURIAsBase64(s string) (string, error) {
 	if strings.HasPrefix(s, "http") {
@@ -36,12 +41,11 @@ func GetContentURIAsBase64(s string) (string, error) {
 		return encoded, nil
 	}
 
-	// if the string instead is prefixed with "data:image/...;base64,", drop it
-	dropPrefix := []string{"data:image/jpeg;base64,", "data:image/png;base64,"}
-	for _, prefix := range dropPrefix {
-		if strings.HasPrefix(s, prefix) {
-			return strings.ReplaceAll(s, prefix, ""), nil
-		}
+	// If the string starts with a base64 data URI header ("data:<media type>;base64,"), strip the header and return the remaining payload
+	if match := dataURIPattern.FindString(s); match != "" {
+		log.Debug().Msgf("Found data URI prefix: %s", match)
+		return strings.Replace(s, match, "", 1), nil
 	}
-	return "", fmt.Errorf("not valid string")
+
+	return "", fmt.Errorf("not valid base64 data type string")
 }