feat: add machine tag and inference timings (#4577)

* Add machine tag option, add extraUsage option, grpc-server -> proto -> endpoint extraUsage data is broken for now Signed-off-by: mintyleaf <mintyleafdev@gmail.com> * remove redurant timing fields, fix not working timings output Signed-off-by: mintyleaf <mintyleafdev@gmail.com> * use middleware for Machine-Tag only if tag is specified Signed-off-by: mintyleaf <mintyleafdev@gmail.com> --------- Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
2025-05-30 15:35:01 +00:00 · 2025-01-17 20:05:58 +04:00 · 2025-01-17 20:05:58 +04:00 · 96f8ec0402
commit 96f8ec0402
parent 8027fdf1c7
15 changed files with 137 additions and 48 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@ -27,8 +27,10 @@ type LLMResponse struct {
 }

 type TokenUsage struct {
-	Prompt     int
-	Completion int
+	Prompt                 int
+	Completion             int
+	TimingPromptProcessing float64
+	TimingTokenGeneration  float64
 }

 func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
@ -123,6 +125,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im

 				tokenUsage.Prompt = int(reply.PromptTokens)
 				tokenUsage.Completion = int(reply.Tokens)
+				tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
+				tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

 				for len(partialRune) > 0 {
 					r, size := utf8.DecodeRune(partialRune)
@ -157,6 +161,10 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			if tokenUsage.Completion == 0 {
 				tokenUsage.Completion = int(reply.Tokens)
 			}
+
+			tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
+			tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
+
 			return LLMResponse{
 				Response: string(reply.Message),
 				Usage:    tokenUsage,