Compare commits

No commits in common. "bad6d96a2bd48673617f49f81fee581028af9549" and "b35483742cf3d168ec12b1440bb883c6688347a5" have entirely different histories.

7 changed files with 26071 additions and 44 deletions

Makefile

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=6aa892ec2aa7fe0c93e87c4b970d83a942fb9454
+CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp

backend/cpp/llama/CMakeLists.txt

@@ -74,7 +74,7 @@ add_library(hw_grpc_proto
 ${hw_proto_srcs}
 ${hw_proto_hdrs} )
-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

backend/cpp/llama/grpc-server.cpp

@@ -1,12 +1,3 @@
-// llama.cpp gRPC C++ backend server
-//
-// Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
-//
-// This is a gRPC server for llama.cpp compatible with the LocalAI proto
-// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
-// but modified to work with gRPC
-//
 #include "utils.hpp"
 #include "arg.h"
@@ -52,9 +43,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI
+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;
 enum stop_type {
@@ -353,6 +344,7 @@ struct server_task {
 }
 }
-//TODO: add back json_schema and grammar support
+// process "json_schema" and "grammar"
 if (data.contains("json_schema") && !data.contains("grammar")) {
 try {
@@ -1450,7 +1442,7 @@ struct server_slot {
 pos = text.find(word, from_pos);
 } else {
 // otherwise, partial stop
-pos = string_find_partial_stop(text, word);
+pos = find_partial_stop_string(word, text);
 }
 if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
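
The hunk above swaps string_find_partial_stop(text, word) for find_partial_stop_string(word, text); the two helpers differ in name and argument order across llama.cpp revisions, but both answer the same question: does some suffix of the generated text form a prefix of a stop word, so that the server has to hold those characters back until the next token resolves the match? A minimal sketch of that check, using a hypothetical helper name rather than either llama.cpp implementation:

    #include <algorithm>
    #include <cstddef>
    #include <string>

    // Return the position in `text` where a partial match of `stop` begins, i.e. the
    // longest suffix of `text` that is also a proper prefix of `stop`. Returns
    // std::string::npos when no suffix of `text` starts the stop word.
    static size_t partial_stop_pos(const std::string & text, const std::string & stop) {
        if (text.empty() || stop.empty()) {
            return std::string::npos;
        }
        const size_t max_len = std::min(text.size(), stop.size() - 1);
        for (size_t len = max_len; len > 0; len--) { // try the longest candidate suffix first
            if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
                return text.size() - len;
            }
        }
        return std::string::npos;
    }
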
@@ -2272,14 +2264,6 @@ struct server_context {
 slot.has_next_token = true;
 }
-// if context shifting is disabled, make sure that we don't run out of context
-if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
-slot.stop = STOP_TYPE_LIMIT;
-slot.has_next_token = false;
-SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
-}
 // check the limits
 if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
 slot.stop = STOP_TYPE_LIMIT;
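
For context, the block removed above and the budget check that survives it are two of the conditions under which a slot stops producing tokens: either the next position no longer fits in the context window (when context shifting is disabled), or the per-request prediction budget is spent. A condensed sketch of that decision, using simplified stand-ins for the slot fields rather than the server's real structures:

    // Sketch: may this slot emit another token? (simplified stand-in fields)
    bool may_emit_next_token(int n_past, int n_ctx, int n_decoded, int n_predict, bool ctx_shift) {
        // without context shifting, the next position must still fit in the context window
        if (!ctx_shift && n_past + 1 >= n_ctx) {
            return false; // maps to STOP_TYPE_LIMIT
        }
        // respect the prediction budget when one was set (n_predict < 0 means unlimited)
        if (n_predict >= 0 && n_decoded >= n_predict) {
            return false; // maps to STOP_TYPE_LIMIT
        }
        return true;
    }
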
@@ -2980,8 +2964,7 @@ struct server_context {
 llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 // add generated tokens to cache
-{
+if (slot.params.cache_prompt) {
 llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
 for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
 new_tokens[i - n_discard] = new_tokens[i];
@@ -3026,7 +3009,10 @@ struct server_context {
 common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 slot.n_past += 1;
-slot.cache_tokens.push_back(slot.sampled);
+if (slot.params.cache_prompt) {
+    slot.cache_tokens.push_back(slot.sampled);
+}
 SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
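
This hunk and two of the later ones follow the same pattern: the right-hand version only mirrors tokens into slot.cache_tokens when prompt caching is enabled for the slot. The point of keeping that mirror is prefix reuse on the next request, since the server can skip re-evaluating whatever prefix the cached tokens share with the incoming prompt. A simplified sketch of that reuse, with plain std::vector<int> standing in for the server's token containers:

    #include <cstddef>
    #include <vector>

    // Length of the prefix shared between the tokens already in the KV cache and the
    // new prompt: positions [0, n) can be kept as-is, decoding resumes at position n.
    size_t common_prompt_prefix(const std::vector<int> & cached, const std::vector<int> & prompt) {
        size_t n = 0;
        while (n < cached.size() && n < prompt.size() && cached[n] == prompt[n]) {
            n++;
        }
        return n;
    }
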
@@ -3198,11 +3184,6 @@ struct server_context {
 SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
 }
 } else {
-// if we don't cache the prompt, we have to remove the entire KV cache
-llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
-slot.n_past = 0;
-slot.cache_tokens.clear();
 }
 }
@@ -3236,7 +3217,7 @@ struct server_context {
 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 // remove the non-common part from the cache
-slot.cache_tokens.keep_first(slot.n_past);
+//slot.cache_tokens.resize(slot.n_past);
 // check if we should process the image
 if (slot.n_past < slot.n_prompt_tokens
@@ -3253,8 +3234,7 @@ struct server_context {
 continue;
 }
 // add the image chunk to cache
-{
+if (slot.params.cache_prompt) {
 const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
 slot.cache_tokens.push_back(chunk.get()); // copy
 }
@@ -3275,7 +3255,9 @@ struct server_context {
 const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-slot.cache_tokens.push_back(cur_tok);
+if (slot.params.cache_prompt) {
+    slot.cache_tokens.push_back(cur_tok);
+}
 slot.n_prompt_tokens_processed++;
 slot.n_past++;
@@ -4270,11 +4252,9 @@ public:
 body["stream"] = false;
-/*
 if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
 return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Pooling type 'none' is not OAI compatible. Please use a different pooling type");
 }
-*/
 // for the shape of input/content, see tokenize_input_prompts()
 json prompt = body.at("prompt");
@@ -4302,7 +4282,7 @@ public:
 task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);
 // OAI-compat
-task.params.oaicompat = OAICOMPAT_TYPE_NONE;
+task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
 tasks.push_back(std::move(task));
 }
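
The last two hunks concern the embeddings path: the pooling-type check is re-enabled and embedding tasks are tagged as OAICOMPAT_TYPE_EMBEDDING instead of OAICOMPAT_TYPE_NONE. The check exists because an OpenAI-style embeddings response carries exactly one vector per input, which only works when token embeddings are pooled; with pooling type "none" the model yields one vector per token. A small illustration of the pooling step the response format assumes (mean pooling, shown purely as an example and not necessarily what the backend uses):

    #include <cstddef>
    #include <vector>

    // Collapse per-token embeddings into the single vector an OpenAI-compatible
    // embeddings response expects. With pooling type "none" there is no such
    // single vector, hence the INVALID_ARGUMENT above.
    std::vector<float> mean_pool(const std::vector<std::vector<float>> & token_embd) {
        if (token_embd.empty()) {
            return {};
        }
        std::vector<float> out(token_embd.front().size(), 0.0f);
        for (const auto & e : token_embd) {
            for (size_t i = 0; i < out.size(); i++) {
                out[i] += e[i];
            }
        }
        for (float & v : out) {
            v /= static_cast<float>(token_embd.size());
        }
        return out;
    }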

backend/cpp/llama/json.hpp (new vendored file, 24766 additions)

File diff suppressed because it is too large.

prepare script (copies the gRPC server sources into llama.cpp/tools/grpc-server/)

@@ -9,10 +9,9 @@ done
 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
-cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
+cp -rfv json.hpp llama.cpp/tools/grpc-server/
+cp -rfv utils.hpp llama.cpp/tools/grpc-server/
 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
 echo "grpc-server already added"
 else

backend/cpp/llama/utils.hpp (new vendored file, 1282 additions)

File diff suppressed because it is too large.

Go multimodal template tests (EvaluateTemplate)

@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("<__image__>bar"))
+Expect(result).To(Equal("[img-0]bar"))
 })
 It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("<__image__><__image__>bar"))
+Expect(result).To(Equal("[img-0][img-1]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
+Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0]<__image__>bar"))
+Expect(result).To(Equal("[audio-0][img-2]bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{