remove redundant timing fields, fix broken timings output

Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
mintyleaf 2025-01-10 04:25:13 +04:00
parent f040aa46a3
commit 625029cc96
8 changed files with 16 additions and 45 deletions


@@ -159,10 +159,8 @@ message Reply {
   bytes message = 1;
   int32 tokens = 2;
   int32 prompt_tokens = 3;
-  int32 timing_prompt_tokens = 4;
-  int32 timing_predicted_tokens = 5;
-  double timing_prompt_processing = 6;
-  double timing_token_generation = 7;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
 }
 
 message ModelOptions {
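
Note: the dropped timing_prompt_tokens / timing_predicted_tokens counters effectively duplicated the token counts already carried by prompt_tokens and tokens, so Reply keeps only the millisecond durations (renumbered to fields 4 and 5). A client that wants per-stage throughput can still derive it from the remaining fields; a minimal Go sketch follows (the helper is illustrative, not part of this commit):

// throughput derives tokens/second for prompt processing and for generation
// from the counts and millisecond timings that remain in Reply. The arguments
// would come from reply.PromptTokens, reply.Tokens, reply.TimingPromptProcessing
// and reply.TimingTokenGeneration.
func throughput(promptTokens, predictedTokens int32, promptMs, predictedMs float64) (promptTPS, genTPS float64) {
    if promptMs > 0 {
        promptTPS = float64(promptTokens) / (promptMs / 1000.0)
    }
    if predictedMs > 0 {
        genTPS = float64(predictedTokens) / (predictedMs / 1000.0)
    }
    return promptTPS, genTPS
}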


@@ -2414,14 +2414,12 @@ public:
         int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
         reply.set_prompt_tokens(tokens_evaluated);
 
-        int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-        reply.set_timing_prompt_tokens(timing_prompt_tokens);
-        int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-        reply.set_timing_predicted_tokens(timing_predicted_tokens);
-        double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
+        if (result.result_json.contains("timings")) {
+            double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
         reply.set_timing_prompt_processing(timing_prompt_processing);
-        double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
+            double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
         reply.set_timing_token_generation(timing_token_generation);
+        }
 
         // Log Request Correlation Id
         LOG_VERBOSE("correlation:", {

@@ -2464,15 +2462,13 @@ public:
             reply->set_tokens(tokens_predicted);
             reply->set_message(completion_text);
 
-            int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-            reply->set_timing_prompt_tokens(timing_prompt_tokens);
-            int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-            reply->set_timing_predicted_tokens(timing_predicted_tokens);
-            double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
+            if (result.result_json.contains("timings")) {
+                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
             reply->set_timing_prompt_processing(timing_prompt_processing);
-            double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
+                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
             reply->set_timing_token_generation(timing_token_generation);
+            }
         }
         else
         {
             return grpc::Status::OK;
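
Note: the previous code read timings through result_json.value("timings", json{}), which substitutes a throwaway default when the llama.cpp result carries no "timings" object; the rewrite checks contains("timings") first and reads through at("timings"), so the reply's timing fields are only populated when timings were actually reported. A rough Go equivalent of that guard, with hypothetical stand-in types for illustration only:

// timingFields is an illustrative stand-in for the two timing fields of the gRPC Reply.
type timingFields struct {
    TimingPromptProcessing float64
    TimingTokenGeneration  float64
}

// fillTimings copies prompt_ms/predicted_ms into the reply only when the
// backend result actually contains a "timings" object, leaving the fields at
// their zero values otherwise.
func fillTimings(result map[string]any, reply *timingFields) {
    timings, ok := result["timings"].(map[string]any)
    if !ok {
        return // no timings reported for this result
    }
    if ms, ok := timings["prompt_ms"].(float64); ok {
        reply.TimingPromptProcessing = ms
    }
    if ms, ok := timings["predicted_ms"].(float64); ok {
        reply.TimingTokenGeneration = ms
    }
}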


@@ -29,8 +29,6 @@ type LLMResponse struct {
 type TokenUsage struct {
     Prompt                 int
     Completion             int
-    TimingPromptTokens     int
-    TimingPredictedTokens  int
     TimingPromptProcessing float64
     TimingTokenGeneration  float64
 }

@@ -127,8 +125,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
             tokenUsage.Prompt = int(reply.PromptTokens)
             tokenUsage.Completion = int(reply.Tokens)
 
-            tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-            tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
             tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
             tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
 
@@ -166,8 +162,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
             tokenUsage.Completion = int(reply.Tokens)
         }
 
-        tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-        tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
         tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
         tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
 


@@ -47,8 +47,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
             TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
         }
         if extraUsage {
-            usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-            usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
             usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
             usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
         }

@@ -104,8 +102,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
             TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
         }
         if extraUsage {
-            usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-            usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
             usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
             usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
         }

@@ -476,12 +472,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
             TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
         }
         if extraUsage {
-            usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-            usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
             usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
             usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
         }
-        fmt.Println(tokenUsage)
 
         resp := &schema.OpenAIResponse{
             ID: id,


@@ -38,8 +38,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
             TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
         }
         if extraUsage {
-            usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-            usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
             usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
             usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
         }

@@ -182,8 +180,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
                 return err
             }
 
-            totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-            totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
             totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
             totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
 
@@ -195,8 +191,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
             TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
         }
         if extraUsage {
-            usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-            usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
             usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
             usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
         }


@@ -66,8 +66,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
         totalTokenUsage.Prompt += tokenUsage.Prompt
         totalTokenUsage.Completion += tokenUsage.Completion
 
-        totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-        totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
         totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
         totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
 
@@ -79,8 +77,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
         TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
     }
     if extraUsage {
-        usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-        usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
         usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
         usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
     }


@@ -52,6 +52,8 @@ func ComputeChoices(
         tokenUsage.Prompt += prediction.Usage.Prompt
         tokenUsage.Completion += prediction.Usage.Completion
 
+        tokenUsage.TimingPromptProcessing += prediction.Usage.TimingPromptProcessing
+        tokenUsage.TimingTokenGeneration += prediction.Usage.TimingTokenGeneration
 
         finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
         cb(finetunedResponse, &result)
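
Note: ComputeChoices loops once per requested choice, so the two added lines accumulate each prediction's timings into the request-level TokenUsage the same way the token counts are accumulated. A minimal sketch of that aggregation with an invented stand-in struct (illustration only):

type choiceUsage struct {
    Prompt, Completion                            int
    TimingPromptProcessing, TimingTokenGeneration float64
}

// aggregate sums per-choice usage into one request-level total, mirroring the
// += accumulation above for a request with more than one choice.
func aggregate(choices []choiceUsage) (total choiceUsage) {
    for _, c := range choices {
        total.Prompt += c.Prompt
        total.Completion += c.Completion
        total.TimingPromptProcessing += c.TimingPromptProcessing
        total.TimingTokenGeneration += c.TimingTokenGeneration
    }
    return total
}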


@@ -24,8 +24,6 @@ type OpenAIUsage struct {
     CompletionTokens int `json:"completion_tokens"`
     TotalTokens      int `json:"total_tokens"`
     // Extra timing data, disabled by default as is't not a part of OpenAI specification
-    TimingPromptTokens     int     `json:"timing_prompt_tokens,omitempty"`
-    TimingPredictedTokens  int     `json:"timing_predicted_tokens,omitempty"`
     TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
     TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
 }
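
Note: both remaining timing fields keep their omitempty tags, so they only appear in the serialized usage object when the extraUsage path above filled them in; standard OpenAI-style clients keep seeing the usual three token counters. A small self-contained sketch of that behaviour, using a stand-in struct with the same JSON tags (illustration only):

package main

import (
    "encoding/json"
    "fmt"
)

// usage is a stand-in for OpenAIUsage with the same JSON tags.
type usage struct {
    PromptTokens           int     `json:"prompt_tokens"`
    CompletionTokens       int     `json:"completion_tokens"`
    TotalTokens            int     `json:"total_tokens"`
    TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
    TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
}

func main() {
    plain := usage{PromptTokens: 12, CompletionTokens: 34, TotalTokens: 46}
    extra := plain
    extra.TimingPromptProcessing = 85.2 // hypothetical prompt_ms value
    extra.TimingTokenGeneration = 421.7 // hypothetical predicted_ms value

    a, _ := json.Marshal(plain)
    b, _ := json.Marshal(extra)
    fmt.Println(string(a)) // timing fields omitted
    fmt.Println(string(b)) // timing fields present
}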