From b087a44fa01d37e2977fa15b2d410915c4a46d14 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 15 May 2025 23:17:00 +0200
Subject: [PATCH] Add logs

---
 backend/cpp/llama/grpc-server.cpp | 51 +++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 8ceb1cca..e688a7fa 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -3568,6 +3568,10 @@ inline void signal_handler(int signal) {
 bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model
 
 static void start_llama_server(server_context& ctx_server) {
+
+    LOG_INF("%s: starting llama server\n", __func__);
+
+    LOG_INF("%s: waiting for model to be loaded\n", __func__);
     // Wait for model to be loaded first
     while (!loaded_model) {
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
     }
@@ -3873,6 +3877,11 @@ public:
         llama_backend_init();
         llama_numa_init(params.numa);
+
+        LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
 
         // load the model
         if (!ctx_server.load_model(params)) {
             result->set_message("Failed loading model");
@@ -4071,27 +4080,31 @@ public:
     }
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
+        std::cout << "[DEBUG] Starting Predict request processing" << std::endl;
         json data = parse_options(true, request);
+        std::cout << "[DEBUG] Parsed request options" << std::endl;
 
         //Raise error if embeddings is set to true
         if (ctx_server.params_base.embedding) {
+            std::cout << "[DEBUG] Error: Embedding mode not supported in streaming" << std::endl;
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Embedding is not supported in streaming mode");
         }
 
         auto completion_id = gen_chatcmplid();
+        std::cout << "[DEBUG] Generated completion ID: " << completion_id << std::endl;
         std::unordered_set<int> task_ids;
         try {
             std::vector<server_task> tasks;
 
             const auto & prompt = data.at("prompt");
             const auto type = SERVER_TASK_TYPE_COMPLETION;
-            // TODO: this log can become very long, put it behind a flag or think about a more compact format
-            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
+            std::cout << "[DEBUG] Processing prompt of type: " << type << std::endl;
 
             std::vector<raw_buffer> files;
             const auto &images_data = data.find("image_data");
             if (images_data != data.end() && images_data->is_array()) {
+                std::cout << "[DEBUG] Found " << images_data->size() << " images to process" << std::endl;
                 for (const auto &img : *images_data) {
                     const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
@@ -4105,11 +4118,14 @@ public:
             const bool has_mtmd = ctx_server.mctx != nullptr;
             {
                 if (!has_mtmd && !files.empty()) {
+                    std::cout << "[DEBUG] Error: Server does not support multimodal processing" << std::endl;
                     throw std::runtime_error("This server does not support multimodal");
                 }
                 for (auto & file : files) {
+                    std::cout << "[DEBUG] Processing image file of size: " << file.size() << " bytes" << std::endl;
                     mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
                     if (!bmp.ptr) {
+                        std::cout << "[DEBUG] Error: Failed to load image" << std::endl;
                         throw std::runtime_error("Failed to load image");
                     }
                     // calculate bitmap hash (for KV caching)
@@ -4122,10 +4138,12 @@ public:
             // process prompt
             std::vector<server_tokens> inputs;
             if (!prompt.is_string()) {
+                std::cout << "[DEBUG] Error: Prompt must be a string" << std::endl;
                 throw std::runtime_error("prompt must be a string");
             }
 
             if (has_mtmd) {
+                std::cout << "[DEBUG] Processing multimodal input" << std::endl;
                 // multimodal
                 std::string prompt_str = prompt.get<std::string>();
                 mtmd_input_text inp_txt = {
@@ -4141,12 +4159,14 @@ public:
                                                   bitmaps_c_ptr.data(),
                                                   bitmaps_c_ptr.size());
                 if (tokenized != 0) {
+                    std::cout << "[DEBUG] Error: Failed to tokenize multimodal prompt" << std::endl;
                     throw std::runtime_error("Failed to tokenize prompt");
                 }
 
                 server_tokens tmp(chunks, true);
                 inputs.push_back(std::move(tmp));
             } else {
+                std::cout << "[DEBUG] Processing standard text input" << std::endl;
                 // non-multimodal version
                 auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
                 for (auto & p : tokenized_prompts) {
@@ -4155,6 +4175,7 @@ public:
                 }
             }
 
+            std::cout << "[DEBUG] Created " << inputs.size() << " input tasks" << std::endl;
             tasks.reserve(inputs.size());
             for (size_t i = 0; i < inputs.size(); i++) {
                 server_task task = server_task(type);
@@ -4178,31 +4199,37 @@ public:
             }
 
             task_ids = server_task::get_list_id(tasks);
+            std::cout << "[DEBUG] Created " << tasks.size() << " tasks with IDs" << std::endl;
             ctx_server.queue_results.add_waiting_tasks(tasks);
             ctx_server.queue_tasks.post(std::move(tasks));
         } catch (const std::exception & e) {
+            std::cout << "[DEBUG] Error occurred: " << e.what() << std::endl;
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, e.what());
         }
 
+        std::cout << "[DEBUG] Waiting for results..." << std::endl;
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
-            if (results.size() == 1) {
-                // single result
-                reply->set_message(results[0]->to_json());
-            } else {
-                // multiple results (multitask)
-                json arr = json::array();
-                for (auto & res : results) {
-                    arr.push_back(res->to_json());
+            std::cout << "[DEBUG] Received " << results.size() << " results" << std::endl;
+                if (results.size() == 1) {
+                    // single result
+                    reply->set_message(results[0]->to_json());
+                } else {
+                    // multiple results (multitask)
+                    json arr = json::array();
+                    for (auto & res : results) {
+                        arr.push_back(res->to_json());
+                    }
+                    reply->set_message(arr);
                 }
-                reply->set_message(arr);
-            }
         }, [&](const json & error_data) {
+            std::cout << "[DEBUG] Error in results: " << error_data.value("content", "") << std::endl;
             reply->set_message(error_data.value("content", ""));
         }, [&]() {
             return false;
         });
 
         ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+        std::cout << "[DEBUG] Predict request completed successfully" << std::endl;
 
         return grpc::Status::OK;
     }
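
The TODO removed in the Predict hunk already notes that verbose prompt logging should sit behind a flag, while the new "[DEBUG]" std::cout lines print unconditionally. Below is a minimal standalone sketch of one way such prints could be gated behind an environment variable; the LOCALAI_LLAMA_DEBUG variable name, the grpc_debug_enabled() helper, and the DBG macro are illustrative assumptions and are not part of this patch or of grpc-server.cpp.

    #include <cstdlib>
    #include <iostream>

    // Illustrative helper (assumption): read the switch once and cache the result.
    static bool grpc_debug_enabled() {
        static const bool enabled = std::getenv("LOCALAI_LLAMA_DEBUG") != nullptr;  // hypothetical variable name
        return enabled;
    }

    // Illustrative macro (assumption) wrapping the kind of [DEBUG] prints this patch adds.
    #define DBG(msg) do { if (grpc_debug_enabled()) { std::cout << "[DEBUG] " << msg << std::endl; } } while (0)

    int main() {
        // Same style of messages as the patch, but emitted only when LOCALAI_LLAMA_DEBUG is set.
        DBG("Starting Predict request processing");
        DBG("Generated completion ID: " << "chatcmpl-123");
        return 0;
    }

Run with LOCALAI_LLAMA_DEBUG=1 to see the output; with the variable unset, the prints compile in but stay silent.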