diff --git a/Makefile b/Makefile
index 0a5d030b..8da648dd 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 
-CPPLLAMA_VERSION?=9350a1cf21b1492c69b20175b73a419b897d6a3a
+CPPLLAMA_VERSION?=88c46cbdac05cebd936511b1d3c74112e721615f
 
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
diff --git a/backend/cpp/llama/CMakeLists.txt b/backend/cpp/llama/CMakeLists.txt
index 8299705a..031e4964 100644
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,16 +2,20 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
+# END CLIP hack
+
 set(TARGET grpc-server)
-# END CLIP hack
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
 set(TARGET grpc-server)
diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile
index b050b620..d6d8ae90 100644
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -45,6 +45,9 @@ llama.cpp/examples/grpc-server:
 	## XXX: In some versions of CMake clip wasn't being built before llama.
 	## This is an hack for now, but it should be fixed in the future.
 	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+	echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+	cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
 	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 
 rebuild:
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 89169eea..0066c16d 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -11,7 +11,8 @@
 #include 
 #include 
 #include 
-#include "../llava/clip.h"
+#include "clip.h"
+#include "llava.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -32,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 using grpc::Server;
 using grpc::ServerBuilder;
@@ -51,10 +53,11 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::vector<std::string> api_keys;
     std::string public_path = "examples/server/public";
-    std::string chat_template = "chatml";
+    std::string chat_template = "";
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };
 
 bool server_verbose = false;
@@ -173,6 +176,7 @@ struct llama_client_slot
     int32_t n_decoded = 0;
     int32_t n_remaining = -1;
     int32_t i_batch = -1;
+    int32_t n_predict = -1;
 
     int32_t num_prompt_tokens = 0;
     int32_t num_prompt_tokens_processed = 0;
@@ -424,6 +428,7 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
+            slot.n_predict = params.n_predict;
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
 
@@ -451,10 +456,6 @@ struct llama_server_context
         default_generation_settings_for_props["seed"] = -1;
 
         batch = llama_batch_init(n_ctx, 0, params.n_parallel);
-
-        // empty system prompt
-        system_prompt = "";
-        system_tokens.clear();
     }
 
     std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@@ -531,7 +532,7 @@
     bool launch_slot_with_data(llama_client_slot* &slot, json data) {
         slot_params default_params;
         llama_sampling_params default_sparams;
-
+
         slot->params.stream = json_value(data, "stream", false);
         slot->params.cache_prompt = json_value(data, "cache_prompt", false);
         slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -555,6 +556,16 @@ struct llama_server_context
         slot->params.seed = json_value(data, "seed", default_params.seed);
         slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+        slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+
+        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+            // Might be better to reject the request with a 400 ?
+            LOG_WARNING("Max tokens to predict exceeds server configuration", {
+                {"params.n_predict", slot->params.n_predict},
+                {"slot.n_predict", slot->n_predict},
+            });
+            slot->params.n_predict = slot->n_predict;
+        }
 
         // infill
         if (data.count("input_prefix") != 0)
@@ -683,6 +694,24 @@
             }
         }
 
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
+            std::vector<std::string> sampler_names;
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
+                }
+            }
+            slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
+        }
+        else
+        {
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+        }
+
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -772,27 +801,30 @@
     }
 
     void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-
-        llama_batch_clear(batch);
-        kv_cache_clear();
+        system_tokens.clear();
 
-        for (int i = 0; i < (int) system_tokens.size(); ++i)
-        {
-            llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-        }
+        if (!system_prompt.empty()) {
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
-        if (llama_decode(ctx, batch) != 0)
-        {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return;
-        }
+            llama_batch_clear(batch);
 
-        // assign the system KV cache to all parallel sequences
-        for (int32_t i = 1; i < params.n_parallel; ++i)
-        {
-            llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            for (int i = 0; i < (int)system_tokens.size(); ++i)
+            {
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
+            }
+
+            if (llama_decode(ctx, batch) != 0)
+            {
+                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                return;
+            }
+
+            // assign the system KV cache to all parallel sequences
+            for (int32_t i = 1; i < params.n_parallel; ++i)
+            {
+                llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size());
+            }
         }
 
         LOG_TEE("system prompt updated\n");
@@ -814,10 +846,8 @@
         name_user = sys_props.value("anti_prompt", "");
         name_assistant = sys_props.value("assistant_name", "");
 
-        if (slots.size() > 0)
-        {
-            notify_system_prompt_changed();
-        }
+
+        notify_system_prompt_changed();
     }
 
     static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
@@ -975,44 +1005,12 @@
                 {
                     continue;
                 }
-                clip_image_f32_batch img_res_v;
-                img_res_v.size = 0;
-                img_res_v.data = nullptr;
-                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v))
-                {
-                    LOG_TEE("Error processing the given image");
-                    clip_free(clp_ctx);
-                    clip_image_f32_batch_free(img_res_v);
-                    return false;
-                }
-                if (img_res_v.size == 0)
-                {
+
+                if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                     LOG_TEE("Error processing the given image");
                     return false;
                 }
-                // note: assumes only one image was returned by clip_image_preprocess
-                clip_image_f32 * img_res = img_res_v.data;
-
-                img.image_tokens = clip_n_patches(clp_ctx);
-                img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
-                if (!img.image_embedding)
-                {
-                    LOG_TEE("Unable to allocate memory for image embeddings\n");
-                    clip_image_f32_batch_free(img_res_v);
-                    clip_free(clp_ctx);
-                    return false;
-                }
-                LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-                if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
-                {
-                    LOG_TEE("Unable to encode image\n");
-                    clip_image_f32_batch_free(img_res_v);
-                    return false;
-                }
-
-                clip_image_f32_batch_free(img_res_v);
-
                 img.request_encode_image = false;
             }
@@ -1036,8 +1034,15 @@
         const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
         const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
+        {
+            samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
+        }
+
         return json {
             {"n_ctx", slot.n_ctx},
+            {"n_predict", slot.n_predict},
             {"model", params.model_alias},
             {"seed", slot.params.seed},
             {"temperature", slot.sparams.temp},
@@ -1065,7 +1070,9 @@
             {"stream", slot.params.stream},
             {"logit_bias", slot.sparams.logit_bias},
             {"n_probs", slot.sparams.n_probs},
+            {"min_keep", slot.sparams.min_keep},
             {"grammar", slot.sparams.grammar},
+            {"samplers", samplers_sequence}
         };
     }
 
@@ -1877,6 +1884,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler;
+inline void signal_handler(int signal) { shutdown_handler(signal); }
+
 /////////////////////////////////
 ////////////////////////////////
 //////// LOCALAI code starts below here
@@ -2147,7 +2157,8 @@ public:
       gpt_params params;
       params_parse(request, params);
 
-      llama_backend_init(params.numa);
+      llama_backend_init();
+      llama_numa_init(params.numa);
 
       // load the model
       if (!llama.load_model(params))
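Two notes on the grpc-server.cpp changes above. The large image-handling hunk drops the hand-rolled CLIP path (clip_image_preprocess, a manual malloc of the embedding buffer, clip_image_encode, clip_image_f32_batch_free) in favour of a single helper from llama.cpp's llava example, which is why llava.cpp/llava.h are now vendored in by the Makefile and CMakeLists.txt changes. A minimal sketch of that call, assuming the llava.h declaration at the pinned llama.cpp revision; the encode_image wrapper and its parameter names below are illustrative, not part of the patch:

```cpp
#include "clip.h"
#include "llava.h"

// Sketch only: the helper preprocesses the image, encodes it and allocates the
// embedding buffer in one call, replacing the removed clip_image_* sequence.
static bool encode_image(clip_ctx * clp_ctx, int n_threads,
                         const clip_image_u8 * img_data,
                         float ** embedding_out, int * n_tokens_out)
{
    if (!llava_image_embed_make_with_clip_img(clp_ctx, n_threads, img_data,
                                              embedding_out, n_tokens_out)) {
        return false; // caller logs "Error processing the given image" and aborts
    }
    return true; // *embedding_out is allocated by the helper; the caller owns and frees it
}
```

The final hunk tracks an upstream llama.cpp API split: llama_backend_init() no longer takes the NUMA flag, which is now passed to the separate llama_numa_init(params.numa) call instead.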