From 625029cc96f19989a6f0de93ecd02d370e8fe3bd Mon Sep 17 00:00:00 2001
From: mintyleaf
Date: Fri, 10 Jan 2025 04:25:13 +0400
Subject: [PATCH] remove redundant timing fields, fix non-working timings output

Signed-off-by: mintyleaf
---
 backend/backend.proto                    |  6 ++----
 backend/cpp/llama/grpc-server.cpp        | 28 ++++++++++++----------------
 core/backend/llm.go                      |  6 ------
 core/http/endpoints/openai/chat.go       |  7 -------
 core/http/endpoints/openai/completion.go |  6 ------
 core/http/endpoints/openai/edit.go       |  4 ----
 core/http/endpoints/openai/inference.go  |  2 ++
 core/schema/openai.go                    |  2 --
 8 files changed, 16 insertions(+), 45 deletions(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index df21cd87..fea4214f 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -159,10 +159,8 @@ message Reply {
   bytes message = 1;
   int32 tokens = 2;
   int32 prompt_tokens = 3;
-  int32 timing_prompt_tokens = 4;
-  int32 timing_predicted_tokens = 5;
-  double timing_prompt_processing = 6;
-  double timing_token_generation = 7;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
 }
 
 message ModelOptions {
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 16b4e469..486a605c 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2414,14 +2414,12 @@ public:
                 int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                 reply.set_prompt_tokens(tokens_evaluated);
 
-                int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-                reply.set_timing_prompt_tokens(timing_prompt_tokens);
-                int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-                reply.set_timing_predicted_tokens(timing_predicted_tokens);
-                double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
-                reply.set_timing_prompt_processing(timing_prompt_processing);
-                double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
-                reply.set_timing_token_generation(timing_token_generation);
+                if (result.result_json.contains("timings")) {
+                    double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                    reply.set_timing_prompt_processing(timing_prompt_processing);
+                    double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                    reply.set_timing_token_generation(timing_token_generation);
+                }
 
                 // Log Request Correlation Id
                 LOG_VERBOSE("correlation:", {
@@ -2464,14 +2462,12 @@ public:
             reply->set_tokens(tokens_predicted);
             reply->set_message(completion_text);
 
-            int32_t timing_prompt_tokens = result.result_json.value("timings", json{}).value("prompt_n", 0);
-            reply->set_timing_prompt_tokens(timing_prompt_tokens);
-            int32_t timing_predicted_tokens = result.result_json.value("timings", json{}).value("predicted_n", 0);
-            reply->set_timing_predicted_tokens(timing_predicted_tokens);
-            double timing_prompt_processing = result.result_json.value("timings", json{}).value("prompt_ms", 0.0);
-            reply->set_timing_prompt_processing(timing_prompt_processing);
-            double timing_token_generation = result.result_json.value("timings", json{}).value("predicted_ms", 0.0);
-            reply->set_timing_token_generation(timing_token_generation);
+            if (result.result_json.contains("timings")) {
+                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                reply->set_timing_prompt_processing(timing_prompt_processing);
+                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                reply->set_timing_token_generation(timing_token_generation);
+            }
 
         }
         else {
diff --git a/core/backend/llm.go b/core/backend/llm.go
index 378159aa..d91ded51 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -29,8 +29,6 @@ type LLMResponse struct {
 type TokenUsage struct {
 	Prompt                 int
 	Completion             int
-	TimingPromptTokens     int
-	TimingPredictedTokens  int
 	TimingPromptProcessing float64
 	TimingTokenGeneration  float64
 }
@@ -127,8 +125,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 
 			tokenUsage.Prompt = int(reply.PromptTokens)
 			tokenUsage.Completion = int(reply.Tokens)
-			tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-			tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 			tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 			tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
 
@@ -166,8 +162,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			tokenUsage.Completion = int(reply.Tokens)
 		}
 
-		tokenUsage.TimingPredictedTokens = int(reply.TimingPredictedTokens)
-		tokenUsage.TimingPromptTokens = int(reply.TimingPromptTokens)
 		tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
 		tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
 
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index bbae6994..04488923 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -47,8 +47,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
@@ -104,8 +102,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
@@ -476,12 +472,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
-		fmt.Println(tokenUsage)
 
 		resp := &schema.OpenAIResponse{
 			ID: id,
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index 0ee058ff..5f3827de 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -38,8 +38,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 			TotalTokens: tokenUsage.Prompt + tokenUsage.Completion,
 		}
 		if extraUsage {
-			usage.TimingPredictedTokens = tokenUsage.TimingPredictedTokens
-			usage.TimingPromptTokens = tokenUsage.TimingPromptTokens
 			usage.TimingTokenGeneration = tokenUsage.TimingTokenGeneration
 			usage.TimingPromptProcessing = tokenUsage.TimingPromptProcessing
 		}
@@ -182,8 +180,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 				return err
 			}
 
-			totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-			totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 			totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 			totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
 
@@ -195,8 +191,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 				TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
 			}
 			if extraUsage {
-				usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-				usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 				usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 				usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 			}
diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go
index e484863c..dcb37539 100644
--- a/core/http/endpoints/openai/edit.go
+++ b/core/http/endpoints/openai/edit.go
@@ -66,8 +66,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		totalTokenUsage.Prompt += tokenUsage.Prompt
 		totalTokenUsage.Completion += tokenUsage.Completion
 
-		totalTokenUsage.TimingPredictedTokens += tokenUsage.TimingPredictedTokens
-		totalTokenUsage.TimingPromptTokens += tokenUsage.TimingPromptTokens
 		totalTokenUsage.TimingTokenGeneration += tokenUsage.TimingTokenGeneration
 		totalTokenUsage.TimingPromptProcessing += tokenUsage.TimingPromptProcessing
 
@@ -79,8 +77,6 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		TotalTokens: totalTokenUsage.Prompt + totalTokenUsage.Completion,
 	}
 	if extraUsage {
-		usage.TimingPredictedTokens = totalTokenUsage.TimingPredictedTokens
-		usage.TimingPromptTokens = totalTokenUsage.TimingPromptTokens
 		usage.TimingTokenGeneration = totalTokenUsage.TimingTokenGeneration
 		usage.TimingPromptProcessing = totalTokenUsage.TimingPromptProcessing
 	}
diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go
index da75d3a1..f59e3b60 100644
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -52,6 +52,8 @@ func ComputeChoices(
 
 		tokenUsage.Prompt += prediction.Usage.Prompt
 		tokenUsage.Completion += prediction.Usage.Completion
+		tokenUsage.TimingPromptProcessing += prediction.Usage.TimingPromptProcessing
+		tokenUsage.TimingTokenGeneration += prediction.Usage.TimingTokenGeneration
 
 		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
 		cb(finetunedResponse, &result)
diff --git a/core/schema/openai.go b/core/schema/openai.go
index c339f6ac..b06120ae 100644
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -24,8 +24,6 @@ type OpenAIUsage struct {
 	CompletionTokens int `json:"completion_tokens"`
 	TotalTokens      int `json:"total_tokens"`
 	// Extra timing data, disabled by default as is't not a part of OpenAI specification
-	TimingPromptTokens     int     `json:"timing_prompt_tokens,omitempty"`
-	TimingPredictedTokens  int     `json:"timing_predicted_tokens,omitempty"`
 	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
 	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
 }
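
Note (not part of the patch): a minimal Go sketch of how a client could consume the two timing fields this change keeps in the usage object. The JSON field names match core/schema/openai.go above; the response body, its numbers, and the way extra usage reporting gets enabled server-side are assumptions for illustration only.

package main

import (
	"encoding/json"
	"fmt"
)

// Mirrors only the usage fields relevant here; the tags match core/schema/openai.go.
type usage struct {
	PromptTokens           int     `json:"prompt_tokens"`
	CompletionTokens       int     `json:"completion_tokens"`
	TotalTokens            int     `json:"total_tokens"`
	TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
	TimingTokenGeneration  float64 `json:"timing_token_generation,omitempty"`
}

type response struct {
	Usage usage `json:"usage"`
}

func main() {
	// Hypothetical response body; real values originate from the llama.cpp
	// backend's "timings" object (prompt_ms / predicted_ms) forwarded over gRPC.
	body := []byte(`{"usage":{"prompt_tokens":12,"completion_tokens":40,"total_tokens":52,"timing_prompt_processing":35.2,"timing_token_generation":410.7}}`)

	var r response
	if err := json.Unmarshal(body, &r); err != nil {
		panic(err)
	}
	if r.Usage.TimingTokenGeneration > 0 {
		// predicted_ms is in milliseconds, so tokens/s = completion / (ms / 1000).
		tps := float64(r.Usage.CompletionTokens) / (r.Usage.TimingTokenGeneration / 1000.0)
		fmt.Printf("generation: %.1f tokens/s\n", tps)
	}
	fmt.Printf("prompt processing: %.1f ms\n", r.Usage.TimingPromptProcessing)
}

The millisecond durations are enough to derive throughput as above, which is presumably why they are kept while the dropped prompt_n/predicted_n counters, which duplicate the existing prompt_tokens/tokens fields, are removed as redundant.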