Improve audio detection

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 10:35:01 +00:00 · 2025-01-09 19:37:18 +01:00 · 2025-01-09 19:37:18 +01:00 · 30e3c47598
commit 30e3c47598
parent 01aace3017
1 changed files with 135 additions and 105 deletions
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@ -13,7 +13,6 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/websocket/v2"
 	"github.com/mudler/LocalAI/core/application"
-	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
@ -138,6 +137,8 @@ func registerRealtime(application *application.Application) func(c *websocket.Co
 			model = "gpt-4o"
 		}

+		log.Info().Msgf("New session with model: %s", model)
+
 		sessionID := generateSessionID()
 		session := &Session{
 			ID:            sessionID,
@ -489,7 +490,14 @@ func updateSession(session *Session, update *Session, cl *config.BackendConfigLo
 const (
 	minMicVolume   = 450
 	sendToVADDelay = time.Second
-	maxWhisperSegmentDuration = time.Second * 15
+)
+
+type VADState int
+
+const (
+	StateSilence VADState = iota
+	StateSpeaking
+	StateTrailingSilence
 )

 // handle VAD (Voice Activity Detection)
@ -503,7 +511,8 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 		cancel()
 	}()

-	audioDetected := false
+	vadState := VADState(StateSilence)
+	segments := []*proto.VADSegment{}
 	timeListening := time.Now()

 	// Implement VAD logic here
@ -520,15 +529,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio

 			if len(session.InputAudioBuffer) > 0 {

-				if audioDetected && time.Since(timeListening) < maxWhisperSegmentDuration {
-					log.Debug().Msgf("VAD detected speech, but still listening")
-					// audioDetected = false
-					// keep listening
-					session.AudioBufferLock.Unlock()
-					continue
-				}
-
-				if audioDetected {
+				if vadState == StateTrailingSilence {
 					log.Debug().Msgf("VAD detected speech that we can process")

 					// Commit the audio buffer as a conversation item
@ -561,7 +562,8 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 						Item: item,
 					})

-					audioDetected = false
+					vadState = StateSilence
+					segments = []*proto.VADSegment{}
 					// Generate a response
 					generateResponse(cfg, evaluator, session, conversation, ResponseCreate{}, c, websocket.TextMessage)
 					continue
@ -570,7 +572,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				adata := sound.BytesToInt16sLE(session.InputAudioBuffer)

 				// Resample from 24kHz to 16kHz
-				adata = sound.ResampleInt16(adata, 24000, 16000)
+				//	adata = sound.ResampleInt16(adata, 24000, 16000)

 				soundIntBuffer := &audio.IntBuffer{
 					Format: &audio.Format{SampleRate: 16000, NumChannels: 1},
@ -582,9 +584,20 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 					session.AudioBufferLock.Unlock()
 					continue
 				} */
-
 				float32Data := soundIntBuffer.AsFloat32Buffer().Data

+				// TODO: testing wav decoding
+				// dec := wav.NewDecoder(bytes.NewReader(session.InputAudioBuffer))
+				// buf, err := dec.FullPCMBuffer()
+				// if err != nil {
+				// 	//log.Error().Msgf("failed to process audio: %s", err.Error())
+				// 	sendError(c, "processing_error", "Failed to process audio: "+err.Error(), "", "")
+				// 	session.AudioBufferLock.Unlock()
+				// 	continue
+				// }
+
+				//float32Data = buf.AsFloat32Buffer().Data
+
 				resp, err := session.ModelInterface.VAD(vadContext, &proto.VADRequest{
 					Audio: float32Data,
 				})
@ -598,20 +611,34 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				if len(resp.Segments) == 0 {
 					log.Debug().Msg("VAD detected no speech activity")
 					log.Debug().Msgf("audio length %d", len(session.InputAudioBuffer))
-
-					if !audioDetected {
+					if len(session.InputAudioBuffer) > 16000 {
 						session.InputAudioBuffer = nil
+						segments = []*proto.VADSegment{}
 					}
+
 					log.Debug().Msgf("audio length(after) %d", len(session.InputAudioBuffer))
+				} else if (len(resp.Segments) != len(segments)) && vadState == StateSpeaking {
+					// We have new segments, but we are still speaking
+					// We need to wait for the trailing silence

-					session.AudioBufferLock.Unlock()
-					continue
-				}
+					segments = resp.Segments

-				if !audioDetected {
-					timeListening = time.Now()
+				} else if (len(resp.Segments) == len(segments)) && vadState == StateSpeaking {
+					// We have the same number of segments, but we are still speaking
+					// We need to check if we are in this state for long enough, update the timer
+
+					// Check if we have been listening for too long
+					if time.Since(timeListening) > sendToVADDelay {
+						vadState = StateTrailingSilence
+					} else {
+
+						timeListening = timeListening.Add(time.Since(timeListening))
+					}
+				} else {
+					log.Debug().Msg("VAD detected speech activity")
+					vadState = StateSpeaking
+					segments = resp.Segments
 				}
-				audioDetected = true

 				session.AudioBufferLock.Unlock()
 			} else {
@ -843,6 +870,7 @@ func processTextResponse(config *config.BackendConfig, session *Session, prompt
 	// Replace this with actual model inference logic using session.Model and prompt
 	// For example, the model might return a special token or JSON indicating a function call

+	/*
 		predFunc, err := backend.ModelInference(context.Background(), prompt, input.Messages, images, videos, audios, ml, *config, o, nil)

 		result, tokenUsage, err := ComputeChoices(input, prompt, config, startupOptions, ml, func(s string, c *[]schema.Choice) {
@ -939,6 +967,8 @@ func processTextResponse(config *config.BackendConfig, session *Session, prompt
 		// Return the prediction in the response body
 		return c.JSON(resp)

+	*/
+
 	// TODO: use session.ModelInterface...
 	// Simulate a function call
 	if strings.Contains(prompt, "weather") {