fix(streaming): stream complete runes

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-04 09:55:00 +00:00 · 2025-05-30 21:50:32 +02:00 · 2025-05-30 21:50:32 +02:00 · 09ea55385f
commit 09ea55385f
parent 1cc4525f15
1 changed files with 9 additions and 4 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -135,19 +135,24 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 				tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 				tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

+				// Process complete runes and accumulate them
+				var completeRunes []byte
 				for len(partialRune) > 0 {
 					r, size := utf8.DecodeRune(partialRune)
 					if r == utf8.RuneError {
 						// incomplete rune, wait for more bytes
 						break
 					}
-
-					tokenCallback(string(r), tokenUsage)
-					ss += string(r)
-
+					completeRunes = append(completeRunes, partialRune[:size]...)
 					partialRune = partialRune[size:]
 				}

+				// If we have complete runes, send them as a single token
+				if len(completeRunes) > 0 {
+					tokenCallback(string(completeRunes), tokenUsage)
+					ss += string(completeRunes)
+				}
+
 				if len(msg) == 0 {
 					tokenCallback("", tokenUsage)
 				}