remove redundant timing fields, fix non-working timings output

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
mintyleaf 2025-01-10 04:25:13 +04:00
parent f040aa46a3
commit 625029cc96
8 changed files with 16 additions and 45 deletions


@@ -159,10 +159,8 @@ message Reply {
   bytes message = 1;
   int32 tokens = 2;
   int32 prompt_tokens = 3;
-  int32 timing_prompt_tokens = 4;
-  int32 timing_predicted_tokens = 5;
-  double timing_prompt_processing = 6;
-  double timing_token_generation = 7;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
 }

 message ModelOptions {


@@ -2414,14 +2414,12 @@ public:
        int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
        reply.set_prompt_tokens(tokens_evaluated);
-       int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-       reply.set_timing_prompt_tokens(timing_prompt_tokens);
-       int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-       reply.set_timing_predicted_tokens(timing_predicted_tokens);
-       double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
+       if (result.result_json.contains("timings")) {
+           double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
        reply.set_timing_prompt_processing(timing_prompt_processing);
-       double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
+           double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
        reply.set_timing_token_generation(timing_token_generation);
+       }

        // Log Request Correlation Id
        LOG_VERBOSE("correlation:", {
@@ -2464,15 +2462,13 @@ public:
            reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
-           int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-           reply->set_timing_prompt_tokens(timing_prompt_tokens);
-           int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-           reply->set_timing_predicted_tokens(timing_predicted_tokens);
-           double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
+           if (result.result_json.contains("timings")) {
+               double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
            reply->set_timing_prompt_processing(timing_prompt_processing);
-           double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
+               double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
            reply->set_timing_token_generation(timing_token_generation);
+           }
        }
        else
        {
            return grpc::Status::OK;


@@ -29,8 +29,6 @@ type LLMResponse struct {
 type TokenUsage struct {
 	Prompt                 int
 	Completion             int
-	TimingPromptTokens     int
-	TimingPredictedTokens  int
 	TimingPromptProcessing float64
 	TimingTokenGeneration  float64
 }
@@ -127,8 +125,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 				tokenUsage.Prompt = int(reply.PromptTokens)
 				tokenUsage.Completion = int(reply.Tokens)
-				tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-				tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 				tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 				tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
@@ -166,8 +162,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			tokenUsage.Completion = int(reply.Tokens)
 		}
-		tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-		tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 		tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 		tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
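Why the dropped counters were redundant: llama.cpp's prompt_n / predicted_n track the same token counts that already arrive as prompt_tokens and tokens, so only the millisecond durations carried new information. A standalone sketch of how per-phase throughput stays derivable from what remains (illustrative values and a re-declared struct, not code from this commit):

package main

import "fmt"

// TokenUsage as trimmed by this commit, re-declared here for a runnable demo.
type TokenUsage struct {
	Prompt                 int
	Completion             int
	TimingPromptProcessing float64 // prompt_ms from the backend
	TimingTokenGeneration  float64 // predicted_ms from the backend
}

func main() {
	u := TokenUsage{Prompt: 32, Completion: 128, TimingPromptProcessing: 80.0, TimingTokenGeneration: 3200.0}
	// The removed timing_*_tokens fields mirrored Prompt/Completion,
	// so tokens-per-second is still computable without them.
	fmt.Printf("prompt: %.1f tok/s\n", float64(u.Prompt)/(u.TimingPromptProcessing/1000.0))        // prompt: 400.0 tok/s
	fmt.Printf("generation: %.1f tok/s\n", float64(u.Completion)/(u.TimingTokenGeneration/1000.0)) // generation: 40.0 tok/s
}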


@@ -47,8 +47,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 				TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}
@@ -104,8 +102,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 				TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 			}
@@ -476,12 +472,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
-		fmt.Println(tokenUsage)
 		resp := &schema.OpenAIResponse{
 			ID: id,


@@ -38,8 +38,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
@@ -182,8 +180,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 				return err
 			}
-			totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-			totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 			totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 			totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
@@ -195,8 +191,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 				TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 			}


@@ -66,8 +66,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		totalTokenUsage.Prompt += tokenUsage.Prompt
 		totalTokenUsage.Completion += tokenUsage.Completion
-		totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-		totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 		totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 		totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
@@ -79,8 +77,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
 	}
 	if extraUsage {
-		usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-		usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 		usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 		usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 	}


@@ -52,6 +52,8 @@ func ComputeChoices(
 		tokenUsage.Prompt += prediction.Usage.Prompt
 		tokenUsage.Completion += prediction.Usage.Completion
+		tokenUsage.TimingPromptProcessing += prediction.Usage.TimingPromptProcessing
+		tokenUsage.TimingTokenGeneration += prediction.Usage.TimingTokenGeneration
 		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
 		cb(finetunedResponse, &result)
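These two added lines are the fix for the "non-working timings output" half of the commit message: ComputeChoices aggregates per-prediction usage for the endpoints above, and previously it dropped the timing durations, so the extraUsage blocks only ever saw zeros. A quick standalone illustration of the accumulation now happening per prediction (hypothetical values, not repo code):

package main

import "fmt"

func main() {
	// Timings reported by two hypothetical predictions, in milliseconds.
	predictions := []struct{ promptMS, generationMS float64 }{
		{12.5, 240.0},
		{11.0, 198.5},
	}
	var timingPromptProcessing, timingTokenGeneration float64
	for _, p := range predictions {
		timingPromptProcessing += p.promptMS    // mirrors the first added line
		timingTokenGeneration += p.generationMS // mirrors the second added line
	}
	fmt.Printf("%.1f ms prompt processing, %.1f ms token generation\n",
		timingPromptProcessing, timingTokenGeneration) // 23.5 ms prompt processing, 438.5 ms token generation
}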


@@ -24,8 +24,6 @@ type OpenAIUsage struct {
 	CompletionTokens int `json:"completion_tokens"`
 	TotalTokens      int `json:"total_tokens"`
 	// Extra timing data, disabled by default as it's not part of the OpenAI specification
-	TimingPromptTokens     int     `json:"timing_prompt_tokens,omitempty"`
-	TimingPredictedTokens  int     `json:"timing_predicted_tokens,omitempty"`
 	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
 	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
 }
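For API consumers, the omitempty tags mean the surviving timing fields vanish from the serialized usage object unless they are actually set. A minimal sketch of that behavior (struct abridged and re-declared from the diff above; values are illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

// Abridged re-declaration of OpenAIUsage for a runnable demo.
type OpenAIUsage struct {
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
	// Extra timing data, disabled by default as it's not part of the OpenAI specification
	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
}

func main() {
	plain := OpenAIUsage{CompletionTokens: 128, TotalTokens: 160}
	extra := plain
	extra.TimingPromptProcessing = 80.0
	extra.TimingTokenGeneration = 3200.0

	b, _ := json.Marshal(plain)
	fmt.Println(string(b)) // {"completion_tokens":128,"total_tokens":160}
	b, _ = json.Marshal(extra)
	fmt.Println(string(b)) // {"completion_tokens":128,"total_tokens":160,"timing_prompt_processing":80,"timing_token_generation":3200}
}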