feat: include token usage for streamed output (#4282)

Use pb.Reply instead of []byte in the llama gRPC streaming callback, reading the text through Reply.GetMessage(), so that the proper usage data is available in streaming mode at the last [DONE] frame.

Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
mintyleaf 2024-11-28 17:47:56 +04:00 committed by GitHub
parent e001fada6c
commit 0d6c3a7d57
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 25 additions and 10 deletions

View file

@ -117,8 +117,12 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
ss := ""
var partialRune []byte
err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
partialRune = append(partialRune, chars...)
err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) {
msg := reply.GetMessage()
partialRune = append(partialRune, msg...)
tokenUsage.Prompt = int(reply.PromptTokens)
tokenUsage.Completion = int(reply.Tokens)
for len(partialRune) > 0 {
r, size := utf8.DecodeRune(partialRune)
@ -132,6 +136,10 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
partialRune = partialRune[size:]
}
if len(msg) == 0 {
tokenCallback("", tokenUsage)
}
})
return LLMResponse{
Response: ss,