Compare commits

No commits in common. "bad6d96a2bd48673617f49f81fee581028af9549" and "b35483742cf3d168ec12b1440bb883c6688347a5" have entirely different histories.

7 changed files with 26071 additions and 44 deletions

Makefile

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=6aa892ec2aa7fe0c93e87c4b970d83a942fb9454
+CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp

backend/cpp/llama/CMakeLists.txt

@@ -74,7 +74,7 @@ add_library(hw_grpc_proto
 ${hw_proto_srcs}
 ${hw_proto_hdrs} )
-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

backend/cpp/llama/grpc-server.cpp

@@ -1,12 +1,3 @@
-// llama.cpp gRPC C++ backend server
-//
-// Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
-//
-// This is a gRPC server for llama.cpp compatible with the LocalAI proto
-// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
-// but modified to work with gRPC
-//
 #include "utils.hpp"
 #include "arg.h"
@@ -52,9 +43,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI
+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;
 enum stop_type {
@@ -353,6 +344,7 @@ struct server_task {
 }
 }
-//TODO: add back json_schema and grammar support
+// process "json_schema" and "grammar"
 if (data.contains("json_schema") && !data.contains("grammar")) {
 try {
@@ -1450,7 +1442,7 @@ struct server_slot {
 pos = text.find(word, from_pos);
 } else {
 // otherwise, partial stop
-pos = string_find_partial_stop(text, word);
+pos = find_partial_stop_string(word, text);
 }
 if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
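
The hunk above swaps string_find_partial_stop(text, word) for find_partial_stop_string(word, text); the two helpers differ in name and argument order across llama.cpp revisions, but both answer the same question: does some suffix of the generated text form a prefix of a stop word, so that the server has to hold those characters back until the next token resolves the match? A minimal sketch of that check, using a hypothetical helper name rather than either llama.cpp implementation:

    #include <algorithm>
    #include <cstddef>
    #include <string>

    // Return the position in `text` where a partial match of `stop` begins, i.e. the
    // longest suffix of `text` that is also a proper prefix of `stop`. Returns
    // std::string::npos when no suffix of `text` starts the stop word.
    static size_t partial_stop_pos(const std::string & text, const std::string & stop) {
        if (text.empty() || stop.empty()) {
            return std::string::npos;
        }
        const size_t max_len = std::min(text.size(), stop.size() - 1);
        for (size_t len = max_len; len > 0; len--) { // try the longest candidate suffix first
            if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
                return text.size() - len;
            }
        }
        return std::string::npos;
    }
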
@@ -2272,14 +2264,6 @@ struct server_context {
 slot.has_next_token = true;
 }
-// if context shifting is disabled, make sure that we don't run out of context
-if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
-slot.stop = STOP_TYPE_LIMIT;
-slot.has_next_token = false;
-SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
-}
 // check the limits
 if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
 slot.stop = STOP_TYPE_LIMIT;
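
For context, the block removed above and the budget check that survives it are two of the conditions under which a slot stops producing tokens: either the next position no longer fits in the context window (when context shifting is disabled), or the per-request prediction budget is spent. A condensed sketch of that decision, using simplified stand-ins for the slot fields rather than the server's real structures:

    // Sketch: may this slot emit another token? (simplified stand-in fields)
    bool may_emit_next_token(int n_past, int n_ctx, int n_decoded, int n_predict, bool ctx_shift) {
        // without context shifting, the next position must still fit in the context window
        if (!ctx_shift && n_past + 1 >= n_ctx) {
            return false; // maps to STOP_TYPE_LIMIT
        }
        // respect the prediction budget when one was set (n_predict < 0 means unlimited)
        if (n_predict >= 0 && n_decoded >= n_predict) {
            return false; // maps to STOP_TYPE_LIMIT
        }
        return true;
    }
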
@@ -2980,8 +2964,7 @@ struct server_context {
 llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 // add generated tokens to cache
-{
+if (slot.params.cache_prompt) {
 llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
 for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
 new_tokens[i - n_discard] = new_tokens[i];
@@ -3026,7 +3009,10 @@ struct server_context {
 common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 slot.n_past += 1;
-slot.cache_tokens.push_back(slot.sampled);
+if (slot.params.cache_prompt) {
+    slot.cache_tokens.push_back(slot.sampled);
+}
 SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
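
This hunk and two of the later ones follow the same pattern: the right-hand version only mirrors tokens into slot.cache_tokens when prompt caching is enabled for the slot. The point of keeping that mirror is prefix reuse on the next request, since the server can skip re-evaluating whatever prefix the cached tokens share with the incoming prompt. A simplified sketch of that reuse, with plain std::vector<int> standing in for the server's token containers:

    #include <cstddef>
    #include <vector>

    // Length of the prefix shared between the tokens already in the KV cache and the
    // new prompt: positions [0, n) can be kept as-is, decoding resumes at position n.
    size_t common_prompt_prefix(const std::vector<int> & cached, const std::vector<int> & prompt) {
        size_t n = 0;
        while (n < cached.size() && n < prompt.size() && cached[n] == prompt[n]) {
            n++;
        }
        return n;
    }
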
@@ -3198,11 +3184,6 @@ struct server_context {
 SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
 }
 } else {
-// if we don't cache the prompt, we have to remove the entire KV cache
-llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
-slot.n_past = 0;
-slot.cache_tokens.clear();
 }
 }
@@ -3236,7 +3217,7 @@ struct server_context {
 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 // remove the non-common part from the cache
-slot.cache_tokens.keep_first(slot.n_past);
+//slot.cache_tokens.resize(slot.n_past);
 // check if we should process the image
 if (slot.n_past < slot.n_prompt_tokens
@@ -3253,8 +3234,7 @@ struct server_context {
 continue;
 }
 // add the image chunk to cache
-{
+if (slot.params.cache_prompt) {
 const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
 slot.cache_tokens.push_back(chunk.get()); // copy
 }
@@ -3275,7 +3255,9 @@ struct server_context {
 const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-slot.cache_tokens.push_back(cur_tok);
+if (slot.params.cache_prompt) {
+    slot.cache_tokens.push_back(cur_tok);
+}
 slot.n_prompt_tokens_processed++;
 slot.n_past++;
@@ -4270,11 +4252,9 @@ public:
 body["stream"] = false;
-/*
 if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
 return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Pooling type 'none' is not OAI compatible. Please use a different pooling type");
 }
-*/
 // for the shape of input/content, see tokenize_input_prompts()
 json prompt = body.at("prompt");
@@ -4302,7 +4282,7 @@ public:
 task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
 // OAI-compat
-task.params.oaicompat = OAICOMPAT_TYPE_NONE;
+task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
 tasks.push_back(std::move(task));
 }
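
The last two hunks concern the embeddings path: the pooling-type check is re-enabled and embedding tasks are tagged as OAICOMPAT_TYPE_EMBEDDING instead of OAICOMPAT_TYPE_NONE. The check exists because an OpenAI-style embeddings response carries exactly one vector per input, which only works when token embeddings are pooled; with pooling type "none" the model yields one vector per token. A small illustration of the pooling step the response format assumes (mean pooling, shown purely as an example and not necessarily what the backend uses):

    #include <cstddef>
    #include <vector>

    // Collapse per-token embeddings into the single vector an OpenAI-compatible
    // embeddings response expects. With pooling type "none" there is no such
    // single vector, hence the INVALID_ARGUMENT above.
    std::vector<float> mean_pool(const std::vector<std::vector<float>> & token_embd) {
        if (token_embd.empty()) {
            return {};
        }
        std::vector<float> out(token_embd.front().size(), 0.0f);
        for (const auto & e : token_embd) {
            for (size_t i = 0; i < out.size(); i++) {
                out[i] += e[i];
            }
        }
        for (float & v : out) {
            v /= static_cast<float>(token_embd.size());
        }
        return out;
    }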

backend/cpp/llama/json.hpp (new vendored file, 24766 additions)

File diff suppressed because it is too large.

prepare script (copies the gRPC server sources into llama.cpp/tools/grpc-server/)

@@ -9,10 +9,9 @@ done
 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
+cp -rfv json.hpp llama.cpp/tools/grpc-server/
+cp -rfv utils.hpp llama.cpp/tools/grpc-server/
 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
 echo "grpc-server already added"
 else

backend/cpp/llama/utils.hpp (new vendored file, 1282 additions)

File diff suppressed because it is too large.

Go multimodal template tests (EvaluateTemplate)

@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("<__image__>bar"))
+Expect(result).To(Equal("[img-0]bar"))
 })
 It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("<__image__><__image__>bar"))
+Expect(result).To(Equal("[img-0][img-1]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
+Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0]<__image__>bar"))
+Expect(result).To(Equal("[audio-0][img-2]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{