feat: add machine tag and inference timings (#4577)

* Add machine tag option and extraUsage option; the grpc-server -> proto -> endpoint extraUsage data path is broken for now

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* remove redundant timing fields, fix non-working timings output

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

* use middleware for Machine-Tag only if tag is specified

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>

---------

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
This commit is contained in:
mintyleaf 2025-01-17 20:05:58 +04:00 committed by GitHub
parent 8027fdf1c7
commit 96f8ec0402
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 137 additions and 48 deletions

View file

@ -30,8 +30,17 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
id := uuid.New().String()
created := int(time.Now().Unix())
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) {
ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
usage := schema.OpenAIUsage{
PromptTokens: tokenUsage.Prompt,
CompletionTokens: tokenUsage.Completion,
TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
}
if extraUsage {
usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
}
resp := schema.OpenAIResponse{
ID: id,
Created: created,
@ -43,11 +52,7 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
},
},
Object: "text_completion",
Usage: schema.OpenAIUsage{
PromptTokens: usage.Prompt,
CompletionTokens: usage.Completion,
TotalTokens: usage.Prompt + usage.Completion,
},
Usage: usage,
}
log.Debug().Msgf("Sending goroutine: %s", s)
@ -60,6 +65,10 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
return func(c *fiber.Ctx) error {
// Add Correlation
c.Set("X-Correlation-ID", id)
// Opt-in extra usage flag
extraUsage := c.Get("LocalAI-Extra-Usage", "") != ""
modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
if err != nil {
return fmt.Errorf("failed reading parameters from request:%w", err)
@ -113,7 +122,7 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
responses := make(chan schema.OpenAIResponse)
go process(predInput, input, config, ml, responses)
go process(predInput, input, config, ml, responses, extraUsage)
c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
@ -170,11 +179,20 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
return err
}
totalTokenUsage.Prompt += tokenUsage.Prompt
totalTokenUsage.Completion += tokenUsage.Completion
totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
result = append(result, r...)
}
usage := schema.OpenAIUsage{
PromptTokens: totalTokenUsage.Prompt,
CompletionTokens: totalTokenUsage.Completion,
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
}
if extraUsage {
usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
}
resp := &schema.OpenAIResponse{
ID: id,
@ -182,11 +200,7 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: result,
Object: "text_completion",
Usage: schema.OpenAIUsage{
PromptTokens: totalTokenUsage.Prompt,
CompletionTokens: totalTokenUsage.Completion,
TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
},
Usage: usage,
}
jsonResult, _ := json.Marshal(resp)