feat: add machine tag and inference timings (#4577)

* Add machine tag option, add extraUsage option, grpc-server -> proto -> endpoint extraUsage data is broken for now Signed-off-by: mintyleaf <mintyleafdev@gmail.com> * remove redundant timing fields, fix non-working timings output Signed-off-by: mintyleaf <mintyleafdev@gmail.com> * use middleware for Machine-Tag only if tag is specified Signed-off-by: mintyleaf <mintyleafdev@gmail.com> --------- Signed-off-by: mintyleaf <mintyleafdev@gmail.com>
2025-05-20 02:24:59 +00:00 · 2025-01-17 20:05:58 +04:00 · 2025-01-17 20:05:58 +04:00 · 96f8ec0402
commit 96f8ec0402
parent 8027fdf1c7
15 changed files with 137 additions and 48 deletions
--- a/backend/backend.proto
+++ b/backend/backend.proto
@ -159,6 +159,8 @@ message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
 }

 message ModelOptions {
@ -348,4 +350,4 @@ message StatusResponse {
 message Message {
  string role = 1;
  string content = 2;
-}
+}
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@ -2408,6 +2408,13 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);

+                if (result.result_json.contains("timings")) {
+                    double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                    reply.set_timing_prompt_processing(timing_prompt_processing);
+                    double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                    reply.set_timing_token_generation(timing_token_generation);
+                }
+                
                // Log Request Correlation Id
                LOG_VERBOSE("correlation:", {
                    { "id", data["correlation_id"] }
@ -2448,6 +2455,13 @@ public:
            reply->set_prompt_tokens(tokens_evaluated);
            reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
+
+            if (result.result_json.contains("timings")) {
+                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
+                reply->set_timing_prompt_processing(timing_prompt_processing);
+                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
+                reply->set_timing_token_generation(timing_token_generation);
+            }
        }
        else
        {