feat(grpc): return consumed token count and update response accordingly (#2035)

Fixes: #1920
2025-05-25 13:04:59 +00:00 · 2024-04-15 19:47:11 +02:00 · 2024-04-15 19:47:11 +02:00 · e843d7df0e
commit e843d7df0e
parent de3a1a0a8e
4 changed files with 20 additions and 4 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -189,6 +189,12 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
 	} else {
 		go func() {
 			reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
+			if tokenUsage.Prompt == 0 {
+				tokenUsage.Prompt = int(reply.PromptTokens)
+			}
+			if tokenUsage.Completion == 0 {
+				tokenUsage.Completion = int(reply.Tokens)
+			}
 			if err != nil {
 				rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
 				close(rawResultChannel)