diff --git a/Makefile b/Makefile index faa500ab..2d17bda7 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7 -CPPLLAMA_VERSION?=39d8bc71edcb8b6f99d46fa4216af7a15232e218 +CPPLLAMA_VERSION?=e39106c0554cbd0e9310e08fb3b2a577ea4b6273 # gpt4all version GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 749a0ace..04c6586c 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -40,12 +41,15 @@ using backend::HealthMessage; ///// LLAMA.CPP server code below + #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" + using json = nlohmann::json; struct server_params { std::string hostname = "127.0.0.1"; + std::string api_key; std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; @@ -89,7 +93,7 @@ static inline bool is_base64(uint8_t c) return (isalnum(c) || (c == '+') || (c == '/')); } -static std::vector base64_decode(std::string const &encoded_string) +static std::vector base64_decode(const std::string & encoded_string) { int i = 0; int j = 0; @@ -216,10 +220,10 @@ struct slot_image int32_t id; bool request_encode_image = false; - float* image_embedding = nullptr; + float * image_embedding = nullptr; int32_t image_tokens = 0; - clip_image_u8 img_data; + clip_image_u8 * img_data; std::string prefix_prompt; // before of this image }; @@ -441,15 +445,16 @@ struct llama_client_slot generated_token_probs.clear(); - for (slot_image &img : images) + for (slot_image & img : images) { free(img.image_embedding); - delete[] img.img_data.data; + if (img.img_data) { + clip_image_u8_free(img.img_data); + } img.prefix_prompt = ""; } images.clear(); - // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? } bool has_budget(gpt_params &global_params) { @@ -550,7 +555,9 @@ struct llama_server_context std::vector queue_results; std::vector queue_multitasks; std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks + std::condition_variable condition_tasks; std::mutex mutex_results; + std::condition_variable condition_results; ~llama_server_context() { @@ -769,6 +776,42 @@ struct llama_server_context slot->prompt = ""; } + slot->sparams.penalty_prompt_tokens.clear(); + slot->sparams.use_penalty_prompt_tokens = false; + const auto &penalty_prompt = data.find("penalty_prompt"); + if (penalty_prompt != data.end()) + { + if (penalty_prompt->is_string()) + { + const auto penalty_prompt_string = penalty_prompt->get(); + auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false); + slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); + if (slot->params.n_predict > 0) + { + slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict); + } + slot->sparams.use_penalty_prompt_tokens = true; + } + else if (penalty_prompt->is_array()) + { + const auto n_tokens = penalty_prompt->size(); + slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict)); + const int n_vocab = llama_n_vocab(model); + for (const auto &penalty_token : *penalty_prompt) + { + if (penalty_token.is_number_integer()) + { + const auto tok = penalty_token.get(); + if (tok >= 0 && tok < n_vocab) + { + slot->sparams.penalty_prompt_tokens.push_back(tok); + } + } + } + slot->sparams.use_penalty_prompt_tokens = true; + } + } + slot->sparams.logit_bias.clear(); if (json_value(data, "ignore_eos", false)) @@ -821,24 +864,17 @@ struct llama_server_context { for (const auto &img : *images_data) { - std::string data_b64 = img["data"].get(); + const std::vector image_buffer = base64_decode(img["data"].get()); + slot_image img_sl; img_sl.id = img.count("id") != 0 ? img["id"].get() : slot->images.size(); - int width, height, channels; - std::vector image_buffer = base64_decode(data_b64); - data_b64.clear(); - auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3); - if (!data) { + img_sl.img_data = clip_image_u8_init(); + if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) + { LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); return false; } - LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height); - img_sl.img_data.nx = width; - img_sl.img_data.ny = height; - img_sl.img_data.size = width * height * 3; - img_sl.img_data.data = new uint8_t[width * height * 3](); - memcpy(img_sl.img_data.data, data, width * height * 3); - stbi_image_free(data); + LOG_TEE("slot %i - loaded image\n", slot->id); img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -893,6 +929,7 @@ struct llama_server_context llama_sampling_free(slot->ctx_sampling); } slot->ctx_sampling = llama_sampling_init(slot->sparams); + llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -1000,6 +1037,12 @@ struct llama_server_context slot.generated_text += token_str; slot.has_next_token = true; + if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) + { + // we can change penalty_prompt_tokens because it is always created from scratch each request + slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); + } + // check if there is incomplete UTF-8 character at the end bool incomplete = false; for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) @@ -1106,8 +1149,8 @@ struct llama_server_context { continue; } - clip_image_f32 img_res; - if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true)) + clip_image_f32 * img_res = clip_image_f32_init(); + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); @@ -1122,11 +1165,12 @@ struct llama_server_context return false; } LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding)) + if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) { LOG_TEE("Unable to encode image\n"); return false; } + clip_image_f32_free(img_res); img.request_encode_image = false; } @@ -1135,7 +1179,7 @@ struct llama_server_context void send_error(task_server& task, std::string error) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; @@ -1143,6 +1187,7 @@ struct llama_server_context res.error = true; res.result_json = { { "content", error } }; queue_results.push_back(res); + condition_results.notify_all(); } void add_multi_task(int id, std::vector& sub_ids) @@ -1152,6 +1197,7 @@ struct llama_server_context multi.id = id; std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end())); queue_multitasks.push_back(multi); + condition_tasks.notify_one(); } void update_multi_task(int multitask_id, int subtask_id, task_result& result) @@ -1163,6 +1209,7 @@ struct llama_server_context { multitask.subtasks_remaining.erase(subtask_id); multitask.results.push_back(result); + condition_tasks.notify_one(); } } } @@ -1181,7 +1228,7 @@ struct llama_server_context {"n_ctx", slot.n_ctx}, {"model", params.model_alias}, {"seed", slot.params.seed}, - {"temp", slot.sparams.temp}, + {"temperature", slot.sparams.temp}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, @@ -1191,6 +1238,8 @@ struct llama_server_context {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, + {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, + {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1208,7 +1257,7 @@ struct llama_server_context void send_partial_response(llama_client_slot &slot, completion_token_output tkn) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1244,11 +1293,12 @@ struct llama_server_context } queue_results.push_back(res); + condition_results.notify_all(); } void send_final_response(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1304,11 +1354,12 @@ struct llama_server_context } queue_results.push_back(res); + condition_results.notify_all(); } void send_embedding(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; res.multitask_id = slot.multitask_id; @@ -1336,6 +1387,7 @@ struct llama_server_context }; } queue_results.push_back(res); + condition_results.notify_all(); } int request_completion(json data, bool infill, bool embedding, int multitask_id) @@ -1359,6 +1411,7 @@ struct llama_server_context // otherwise, it's a single-prompt task, we actually queue it queue_tasks.push_back(task); + condition_tasks.notify_one(); return task.id; } @@ -1366,13 +1419,10 @@ struct llama_server_context { while (true) { - std::this_thread::sleep_for(std::chrono::microseconds(5)); - std::lock_guard lock(mutex_results); - - if (queue_results.empty()) - { - continue; - } + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&]{ + return !queue_results.empty(); + }); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -1468,12 +1518,13 @@ struct llama_server_context void request_cancel(int task_id) { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; task.type = CANCEL_TASK; task.target_id = task_id; queue_tasks.push_back(task); + condition_tasks.notify_one(); } int split_multiprompt_task(task_server& multiprompt_task) @@ -1499,7 +1550,7 @@ struct llama_server_context void process_tasks() { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); while (!queue_tasks.empty()) { task_server task = queue_tasks.front(); @@ -1571,6 +1622,7 @@ struct llama_server_context std::lock_guard lock(mutex_results); queue_results.push_back(aggregate_result); + condition_results.notify_all(); queue_iterator = queue_multitasks.erase(queue_iterator); } @@ -1601,8 +1653,10 @@ struct llama_server_context LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); kv_cache_clear(); } - // avoid 100% usage of cpu all time - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::unique_lock lock(mutex_tasks); + condition_tasks.wait(lock, [&]{ + return !queue_tasks.empty(); + }); } for (llama_client_slot &slot : slots) @@ -1962,28 +2016,35 @@ json oaicompat_completion_params_parse( llama_params["__oaicompat"] = true; // Map OpenAI parameters to llama.cpp parameters + // + // For parameters that are defined by the OpenAI documentation (e.g. + // temperature), we explicitly specify OpenAI's intended default; we + // need to do that because sometimes OpenAI disagrees with llama.cpp + // + // https://platform.openai.com/docs/api-reference/chat/create + llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("uknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); - llama_params["temperature"] = json_value(body, "temperature", 0.8); - llama_params["top_k"] = json_value(body, "top_k", 40); - llama_params["top_p"] = json_value(body, "top_p", 0.95); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); + llama_params["top_p"] = json_value(body, "top_p", 1.0); llama_params["n_predict"] = json_value(body, "max_tokens", -1); llama_params["logit_bias"] = json_value(body, "logit_bias",json::object()); llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); - llama_params["seed"] = json_value(body, "seed", 0); + llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); llama_params["stream"] = json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", false); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", false); - llama_params["typical_p"] = json_value(body, "typical_p", 0.0); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0); + llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); + llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); + llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); + llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0); + llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); - if (llama_params.count("grammar") != 0) { + if (body.count("grammar") != 0) { llama_params["grammar"] = json_value(body, "grammar", json::object()); }