Compare commits

...

9 commits

Author SHA1 Message Date
Ettore Di Giacinto
bad6d96a2b sync llama.cpp
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 22:11:22 +02:00
Ettore Di Giacinto
f30a790052 fix: add httplib 2025-05-16 22:10:10 +02:00
Ettore Di Giacinto
67786c9c41 fix: copy json.hpp from the correct location
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 22:01:38 +02:00
Ettore Di Giacinto
b9cf7c31b9 Sync with upstream
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 21:51:59 +02:00
Ettore Di Giacinto
d2a5905500 Use utils and json directly from llama.cpp
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 21:47:59 +02:00
Ettore Di Giacinto
6c751d98f3 Sync from server.cpp 2025-05-16 21:47:08 +02:00
Ettore Di Giacinto
3d397d8aab embedding: do not use oai type
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 21:35:57 +02:00
Ettore Di Giacinto
1f536c5ed7 Keep header 2025-05-16 20:08:26 +02:00
Ettore Di Giacinto
c15e91a65b Adapt tests
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-16 19:06:45 +02:00
7 changed files with 44 additions and 26071 deletions

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
+CPPLLAMA_VERSION?=6aa892ec2aa7fe0c93e87c4b970d83a942fb9454
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp

@@ -74,7 +74,7 @@ add_library(hw_grpc_proto
 ${hw_proto_srcs}
 ${hw_proto_hdrs} )
-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
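
Note on the add_executable() change above: cpp-httplib is a single-header library, so listing httplib.h among the target sources compiles nothing extra; it only makes the header a tracked dependency of the target, presumably because the copied utils.hpp includes it (hence the "fix: add httplib" commit). For reference, a minimal, self-contained sketch of how the header-only httplib client is used; the host, port and /health endpoint are assumptions for illustration, not part of this PR:

// Illustrative sketch only: single-header cpp-httplib client usage.
// Host, port and endpoint are hypothetical.
#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080);   // connect to an assumed local service
    auto res = cli.Get("/health");            // blocking GET request
    // succeed only if the request completed and returned HTTP 200
    return (res && res->status == 200) ? 0 : 1;
}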

@@ -1,3 +1,12 @@
+// llama.cpp gRPC C++ backend server
+//
+// Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
+//
+// This is a gRPC server for llama.cpp compatible with the LocalAI proto
+// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
+// but modified to work with gRPC
+//
 #include "utils.hpp"
 #include "arg.h"
@@ -43,9 +52,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI
+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;
 enum stop_type {
@@ -344,7 +353,6 @@ struct server_task {
 }
 }
-//TODO: add back json_schema and grammar support
 // process "json_schema" and "grammar"
 if (data.contains("json_schema") && !data.contains("grammar")) {
 try {
@@ -1442,7 +1450,7 @@ struct server_slot {
 pos = text.find(word, from_pos);
 } else {
 // otherwise, partial stop
-pos = find_partial_stop_string(word, text);
+pos = string_find_partial_stop(text, word);
 }
 if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
@@ -2264,6 +2272,14 @@ struct server_context {
 slot.has_next_token = true;
 }
+// if context shifting is disabled, make sure that we don't run out of context
+if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+slot.stop = STOP_TYPE_LIMIT;
+slot.has_next_token = false;
+SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+}
 // check the limits
 if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
 slot.stop = STOP_TYPE_LIMIT;
@@ -2964,7 +2980,8 @@ struct server_context {
 llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
-if (slot.params.cache_prompt) {
+// add generated tokens to cache
+{
 llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
 for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
 new_tokens[i - n_discard] = new_tokens[i];
@@ -3009,10 +3026,7 @@ struct server_context {
 common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 slot.n_past += 1;
-if (slot.params.cache_prompt) {
-slot.cache_tokens.push_back(slot.sampled);
-}
+slot.cache_tokens.push_back(slot.sampled);
 SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -3184,6 +3198,11 @@ struct server_context {
 SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
 }
+} else {
+// if we don't cache the prompt, we have to remove the entire KV cache
+llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+slot.n_past = 0;
+slot.cache_tokens.clear();
 }
 }
@@ -3217,7 +3236,7 @@
 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 // remove the non-common part from the cache
-//slot.cache_tokens.resize(slot.n_past);
+slot.cache_tokens.keep_first(slot.n_past);
 // check if we should process the image
 if (slot.n_past < slot.n_prompt_tokens
@@ -3234,7 +3253,8 @@
 continue;
 }
-if (slot.params.cache_prompt) {
+// add the image chunk to cache
+{
 const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
 slot.cache_tokens.push_back(chunk.get()); // copy
 }
@@ -3255,9 +3275,7 @@
 const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-if (slot.params.cache_prompt) {
-slot.cache_tokens.push_back(cur_tok);
-}
+slot.cache_tokens.push_back(cur_tok);
 slot.n_prompt_tokens_processed++;
 slot.n_past++;
@@ -4252,9 +4270,11 @@ public:
 body["stream"] = false;
+/*
 if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
 return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Pooling type 'none' is not OAI compatible. Please use a different pooling type");
 }
+*/
 // for the shape of input/content, see tokenize_input_prompts()
 json prompt = body.at("prompt");
@@ -4282,7 +4302,7 @@ public:
 task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
 // OAI-compat
-task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
+task.params.oaicompat = OAICOMPAT_TYPE_NONE;
 tasks.push_back(std::move(task));
 }

File diff suppressed because it is too large

@@ -9,9 +9,10 @@ done
 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
 echo "grpc-server already added"
 else

File diff suppressed because it is too large

@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[img-0]bar"))
+Expect(result).To(Equal("<__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[img-0][img-1]bar"))
+Expect(result).To(Equal("<__image__><__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
+Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0][img-2]bar"))
+Expect(result).To(Equal("[audio-0]<__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{