Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-19 18:15:00 +00:00)

Compare commits

9 commits: b35483742c ... bad6d96a2b
Commits in this range:

- bad6d96a2b
- f30a790052
- 67786c9c41
- b9cf7c31b9
- d2a5905500
- 6c751d98f3
- 3d397d8aab
- 1f536c5ed7
- c15e91a65b
7 changed files with 44 additions and 26071 deletions
Makefile (2 changed lines)
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai

 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
+CPPLLAMA_VERSION?=6aa892ec2aa7fe0c93e87c4b970d83a942fb9454

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
backend/cpp/llama/CMakeLists.txt
@@ -74,7 +74,7 @@ add_library(hw_grpc_proto
   ${hw_proto_srcs}
   ${hw_proto_hdrs} )

-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)

-target_include_directories(${TARGET} PRIVATE ../llava)
+target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
backend/cpp/llama/grpc-server.cpp
@@ -1,3 +1,12 @@
+// llama.cpp gRPC C++ backend server
+//
+// Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
+//
+// This is a gRPC server for llama.cpp compatible with the LocalAI proto
+// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
+// but modified to work with gRPC
+//
+
 #include "utils.hpp"

 #include "arg.h"
@@ -43,9 +52,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI

+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;

 enum stop_type {
@@ -344,7 +353,6 @@ struct server_task {
         }
     }

-    //TODO: add back json_schema and grammar support
     // process "json_schema" and "grammar"
     if (data.contains("json_schema") && !data.contains("grammar")) {
         try {
@@ -1442,7 +1450,7 @@ struct server_slot {
             pos = text.find(word, from_pos);
         } else {
             // otherwise, partial stop
-            pos = find_partial_stop_string(word, text);
+            pos = string_find_partial_stop(text, word);
         }

         if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
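For context, a partial stop check looks for a suffix of the generated text that is a prefix of a stop word, so the server can hold those characters back until the stop word either completes or is ruled out. The sketch below illustrates the idea only; it is not the upstream `string_find_partial_stop` implementation, and the helper name `find_partial_stop` is made up for this example.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>

// Illustrative sketch: return the position in `text` where a prefix of `stop`
// starts at the very end of `text`, or std::string::npos if no suffix of
// `text` is a prefix of `stop`. A full match is handled separately by the
// regular text.find() branch shown in the diff above.
static size_t find_partial_stop(const std::string & text, const std::string & stop) {
    if (text.empty() || stop.empty()) {
        return std::string::npos;
    }
    const char last = text.back();
    // try the longest possible overlap first
    for (size_t overlap = std::min(text.size(), stop.size()); overlap > 0; overlap--) {
        if (stop[overlap - 1] == last &&
            text.compare(text.size() - overlap, overlap, stop, 0, overlap) == 0) {
            return text.size() - overlap;
        }
    }
    return std::string::npos;
}
```

With `stop = "</s>"` and `text` ending in `"...hello</"`, this returns the index of `'<'`, so those two characters are withheld from the streamed output until more tokens decide the match.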
@@ -2264,6 +2272,14 @@ struct server_context {
             slot.has_next_token = true;
         }

+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
         // check the limits
         if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stop = STOP_TYPE_LIMIT;
@@ -2964,7 +2980,8 @@ struct server_context {
             llama_kv_self_seq_rm (ctx, slot.id, n_keep, n_keep + n_discard);
             llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

-            if (slot.params.cache_prompt) {
+            // add generated tokens to cache
+            {
                 llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
                 for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
                     new_tokens[i - n_discard] = new_tokens[i];
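The loop above keeps the token-side copy of the KV cache aligned with the shifted KV cache: the first `n_keep` tokens are preserved, the next `n_discard` are dropped, and the rest slide down. A minimal stand-alone sketch of that bookkeeping (plain `std::vector` and illustrative names, not the actual `server_tokens` container):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Illustrative sketch of the context-shift bookkeeping on the cached tokens:
// keep the first n_keep tokens, drop the next n_discard, shift the remainder
// down, then truncate so the vector matches the shifted KV cache.
static void shift_token_cache(std::vector<llama_token> & tokens, size_t n_keep, size_t n_discard) {
    if (n_keep + n_discard > tokens.size()) {
        return; // nothing to discard
    }
    for (size_t i = n_keep + n_discard; i < tokens.size(); i++) {
        tokens[i - n_discard] = tokens[i];
    }
    tokens.resize(tokens.size() - n_discard);
}
```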
@@ -3009,10 +3026,7 @@ struct server_context {
             common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);

             slot.n_past += 1;

-            if (slot.params.cache_prompt) {
-                slot.cache_tokens.push_back(slot.sampled);
-            }
+            slot.cache_tokens.push_back(slot.sampled);

             SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
                 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -3184,6 +3198,11 @@ struct server_context {

                     SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
                 }
+            } else {
+                // if we don't cache the prompt, we have to remove the entire KV cache
+                llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+                slot.n_past = 0;
+                slot.cache_tokens.clear();
             }
         }
@@ -3217,7 +3236,7 @@ struct server_context {
                 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);

                 // remove the non-common part from the cache
-                //slot.cache_tokens.resize(slot.n_past);
+                slot.cache_tokens.keep_first(slot.n_past);

                 // check if we should process the image
                 if (slot.n_past < slot.n_prompt_tokens
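The `keep_first` call trims the cached prompt tokens down to the prefix that is still valid in the KV cache, so only the non-common tail of a new prompt has to be re-decoded. A minimal sketch of that prompt-prefix reuse with a plain `std::vector` (the names `common_prefix` and `reuse_prompt_cache` are made up for this example; the real code uses the `server_tokens` container and its `keep_first` method):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Illustrative sketch: length of the longest shared prefix between the cached
// tokens and the new prompt.
static size_t common_prefix(const std::vector<llama_token> & cache,
                            const std::vector<llama_token> & prompt) {
    size_t n = 0;
    while (n < cache.size() && n < prompt.size() && cache[n] == prompt[n]) {
        n++;
    }
    return n;
}

// Keep the reusable prefix, drop the rest of the cache; tokens prompt[n_past..]
// still need to be batched and decoded.
static void reuse_prompt_cache(std::vector<llama_token> & cache,
                               const std::vector<llama_token> & prompt,
                               size_t & n_past) {
    n_past = common_prefix(cache, prompt);
    cache.resize(n_past); // analogous to cache_tokens.keep_first(n_past)
}
```

In practice this is why resending a long chat history is cheap: only the newly appended turn falls outside the common prefix.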
@@ -3234,7 +3253,8 @@ struct server_context {
                         continue;
                     }

-                    if (slot.params.cache_prompt) {
+                    // add the image chunk to cache
+                    {
                         const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
                         slot.cache_tokens.push_back(chunk.get()); // copy
                     }
@@ -3255,9 +3275,7 @@ struct server_context {
                     const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

                     common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-                    if (slot.params.cache_prompt) {
-                        slot.cache_tokens.push_back(cur_tok);
-                    }
+                    slot.cache_tokens.push_back(cur_tok);

                     slot.n_prompt_tokens_processed++;
                     slot.n_past++;
@@ -4252,9 +4270,11 @@ public:

         body["stream"] = false;

+        /*
         if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Pooling type 'none' is not OAI compatible. Please use a different pooling type");
         }
+        */

         // for the shape of input/content, see tokenize_input_prompts()
         json prompt = body.at("prompt");
@@ -4282,7 +4302,7 @@ public:
             task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);

             // OAI-compat
-            task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
+            task.params.oaicompat = OAICOMPAT_TYPE_NONE;

             tasks.push_back(std::move(task));
         }
backend/cpp/llama/json.hpp (vendored): 24766 changed lines. File diff suppressed because it is too large.
prepare script (backend/cpp/llama)

@@ -9,9 +9,10 @@ done

 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/

 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
     echo "grpc-server already added"
 else
backend/cpp/llama/utils.hpp (vendored): 1282 changed lines. File diff suppressed because it is too large.
Multimodal template test (Go)

@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[img-0]bar"))
+			Expect(result).To(Equal("<__image__>bar"))
 		})

 		It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[img-0][img-1]bar"))
+			Expect(result).To(Equal("<__image__><__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
+			Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[audio-0][img-2]bar"))
+			Expect(result).To(Equal("[audio-0]<__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{