feat: add machine tag and inference timings (#4577)

* Add machine tag option, add extraUsage option, grpc-server -> proto -> endpoint extraUsage data is broken for now

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* remove redundant timing fields, fix non-working timings output

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* use middleware for Machine-Tag only if tag is specified

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

---------

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
This commit is contained in:
mintyleaf 2025-01-17 20:05:58 +04:00 committed by GitHub
parent 8027fdf1c7
commit 96f8ec0402
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 137 additions and 48 deletions

View file

@ -27,8 +27,10 @@ type LLMResponse struct {
}
// TokenUsage holds the token counts and timing values that ModelInference
// copies out of the backend reply.
type TokenUsage struct {
// NOTE(review): Prompt and Completion are listed twice below; this presumably
// reflects the removed and added lines of the diff hunk shown without +/-
// markers, not a real duplicate declaration — confirm against the actual file.
Prompt int
Completion int
// Prompt is set from reply.PromptTokens and Completion from reply.Tokens.
Prompt int
Completion int
// Timing values are copied verbatim from the reply fields of the same names.
// Units are not visible here — TODO confirm against the backend/proto definition.
TimingPromptProcessing float64
TimingTokenGeneration float64
}
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
@ -123,6 +125,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
tokenUsage.Prompt = int(reply.PromptTokens)
tokenUsage.Completion = int(reply.Tokens)
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
for len(partialRune) > 0 {
r, size := utf8.DecodeRune(partialRune)
@ -157,6 +161,10 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if tokenUsage.Completion == 0 {
tokenUsage.Completion = int(reply.Tokens)
}
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
return LLMResponse{
Response: string(reply.Message),
Usage: tokenUsage,