Mirror of https://github.com/mudler/LocalAI.git (synced 2025-06-29 22:20:43 +00:00)
remove redundant timing fields, fix non-working timings output
Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
parent f040aa46a3
commit 625029cc96

8 changed files with 16 additions and 45 deletions
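In short: timing_prompt_tokens and timing_predicted_tokens duplicated the token counts already carried by prompt_tokens and tokens, so they are dropped from the proto, the llama.cpp backend, the Go TokenUsage type, and the OpenAI response schema. The backend now sets the two remaining timing fields only when the result actually carries a timings object, and ComputeChoices now accumulates them, which is what makes the timings output work.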
@@ -159,10 +159,8 @@ message Reply {
   bytes message = 1;
   int32 tokens = 2;
   int32 prompt_tokens = 3;
-  int32 timing_prompt_tokens = 4;
-  int32 timing_predicted_tokens = 5;
-  double timing_prompt_processing = 6;
-  double timing_token_generation = 7;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
 }

 message ModelOptions {
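Note that field numbers 4 and 5 are reused with a new type, so clients built against the old Reply message need regenerated bindings. A minimal sketch of reading the surviving fields, assuming LocalAI's usual generated-bindings import path and standard protoc-gen-go getter naming:

package main

import (
	"fmt"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto" // assumed path of the generated bindings
)

func main() {
	reply := &pb.Reply{TimingPromptProcessing: 12.5, TimingTokenGeneration: 98.7}
	// protoc-gen-go getters are nil-receiver safe, so a missing reply yields zeros.
	fmt.Println(reply.GetTimingPromptProcessing(), reply.GetTimingTokenGeneration())
}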
@@ -2414,14 +2414,12 @@ public:
             int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
             reply.set_prompt_tokens(tokens_evaluated);

-            int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-            reply.set_timing_prompt_tokens(timing_prompt_tokens);
-            int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-            reply.set_timing_predicted_tokens(timing_predicted_tokens);
-            double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
-            reply.set_timing_prompt_processing(timing_prompt_processing);
-            double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
-            reply.set_timing_token_generation(timing_token_generation);
+            if (result.result_json.contains("timings")) {
+                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                reply.set_timing_prompt_processing(timing_prompt_processing);
+                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                reply.set_timing_token_generation(timing_token_generation);
+            }

             // Log Request Correlation Id
             LOG_VERBOSE("correlation:", {
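This guard is the substance of the fix on the backend side: the old chain value("timings", json{}).value(..., 0.0) silently fell back to zero whenever the result carried no timings object, so zeroed timings were written anyway; the new code touches the timing fields only when timings is present. A rough Go analogue of that presence check, for illustration only (the real code above is C++ with nlohmann::json):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	var result map[string]any
	raw := `{"tokens_evaluated": 7, "timings": {"prompt_ms": 12.5, "predicted_ms": 98.7}}`
	if err := json.Unmarshal([]byte(raw), &result); err != nil {
		panic(err)
	}

	// Mirror of the guard above: only read timing values when the
	// "timings" object is actually present in the backend result.
	if timings, ok := result["timings"].(map[string]any); ok {
		promptMs, _ := timings["prompt_ms"].(float64)
		predictedMs, _ := timings["predicted_ms"].(float64)
		fmt.Println(promptMs, predictedMs)
	}
}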
@@ -2464,14 +2462,12 @@ public:
             reply->set_tokens(tokens_predicted);
             reply->set_message(completion_text);

-            int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-            reply->set_timing_prompt_tokens(timing_prompt_tokens);
-            int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-            reply->set_timing_predicted_tokens(timing_predicted_tokens);
-            double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
-            reply->set_timing_prompt_processing(timing_prompt_processing);
-            double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
-            reply->set_timing_token_generation(timing_token_generation);
+            if (result.result_json.contains("timings")) {
+                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                reply->set_timing_prompt_processing(timing_prompt_processing);
+                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                reply->set_timing_token_generation(timing_token_generation);
+            }
         }
         else
         {
@@ -29,8 +29,6 @@ type LLMResponse struct {
 type TokenUsage struct {
 	Prompt                 int
 	Completion             int
-	TimingPromptTokens     int
-	TimingPredictedTokens  int
 	TimingPromptProcessing float64
 	TimingTokenGeneration  float64
 }
@@ -127,8 +125,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im

 			tokenUsage.Prompt = int(reply.PromptTokens)
 			tokenUsage.Completion = int(reply.Tokens)
-			tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-			tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 			tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 			tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

@@ -166,8 +162,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			tokenUsage.Completion = int(reply.Tokens)
 		}

-		tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-		tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 		tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 		tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

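With the two count fields gone, ModelInference forwards only the two durations; the token counts keep flowing through reply.PromptTokens and reply.Tokens as before.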
@@ -47,8 +47,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}
@@ -104,8 +102,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}
@@ -476,12 +472,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
-		fmt.Println(tokenUsage)

 		resp := &schema.OpenAIResponse{
 			ID: id,
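Besides the two redundant assignments, this hunk also drops a stray fmt.Println(tokenUsage) debug line from the chat response path.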
@@ -38,8 +38,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 			TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
@@ -182,8 +180,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 				return err
 			}

-			totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-			totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 			totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 			totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing

@@ -195,8 +191,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 			TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 		}
@@ -66,8 +66,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			totalTokenUsage.Prompt += tokenUsage.Prompt
 			totalTokenUsage.Completion += tokenUsage.Completion

-			totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-			totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 			totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 			totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing

@@ -79,8 +77,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 		}
@@ -52,6 +52,8 @@ func ComputeChoices(

 		tokenUsage.Prompt += prediction.Usage.Prompt
 		tokenUsage.Completion += prediction.Usage.Completion
+		tokenUsage.TimingPromptProcessing += prediction.Usage.TimingPromptProcessing
+		tokenUsage.TimingTokenGeneration += prediction.Usage.TimingTokenGeneration

 		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
 		cb(finetunedResponse, &result)
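This is the other half of the fix: ComputeChoices previously accumulated only the token counts, so the timing values collected by ModelInference never reached the usage structs that the endpoints gate behind extraUsage, and the reported timings were always zero. Accumulating them here is what makes the timings output meaningful.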
@@ -24,8 +24,6 @@ type OpenAIUsage struct {
 	CompletionTokens int `json:"completion_tokens"`
 	TotalTokens      int `json:"total_tokens"`
 	// Extra timing data, disabled by default as it's not part of the OpenAI specification
-	TimingPromptTokens     int     `json:"timing_prompt_tokens,omitempty"`
-	TimingPredictedTokens  int     `json:"timing_predicted_tokens,omitempty"`
 	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
 	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
 }
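Since the surviving timing fields are tagged omitempty, responses that do not opt in to extra usage keep a clean OpenAI-compatible shape. A small self-contained sketch of that behaviour (the struct is a trimmed copy for illustration, not the real schema package):

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of the schema type above, for illustration only.
type OpenAIUsage struct {
	PromptTokens           int     `json:"prompt_tokens"`
	CompletionTokens       int     `json:"completion_tokens"`
	TotalTokens            int     `json:"total_tokens"`
	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
}

func main() {
	// Without extra usage the timing fields stay zero and omitempty drops them.
	usage := OpenAIUsage{PromptTokens: 10, CompletionTokens: 20, TotalTokens: 30}
	b, _ := json.Marshal(usage)
	fmt.Println(string(b)) // {"prompt_tokens":10,"completion_tokens":20,"total_tokens":30}

	// The extraUsage path fills them in, and the two extra keys appear.
	usage.TimingPromptProcessing = 12.5 // prompt_ms from the backend timings
	usage.TimingTokenGeneration = 98.7  // predicted_ms from the backend timings
	b, _ = json.Marshal(usage)
	fmt.Println(string(b))
}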