Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-19 18:15:00 +00:00)

Compare commits

9 commits: b35483742c ... bad6d96a2b
Commits in this range:

- bad6d96a2b
- f30a790052
- 67786c9c41
- b9cf7c31b9
- d2a5905500
- 6c751d98f3
- 3d397d8aab
- 1f536c5ed7
- c15e91a65b
7 changed files with 44 additions and 26071 deletions
Makefile (2 changed lines)
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai

 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
+CPPLLAMA_VERSION?=6aa892ec2aa7fe0c93e87c4b970d83a942fb9454

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
backend/cpp/llama/CMakeLists.txt
@@ -74,7 +74,7 @@ add_library(hw_grpc_proto
   ${hw_proto_srcs}
   ${hw_proto_hdrs} )

-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)

-target_include_directories(${TARGET} PRIVATE ../llava)
+target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
backend/cpp/llama/grpc-server.cpp
@@ -1,3 +1,12 @@
+// llama.cpp gRPC C++ backend server
+//
+// Ettore Di Giacinto <mudler@localai.io> and llama.cpp authors
+//
+// This is a gRPC server for llama.cpp compatible with the LocalAI proto
+// Note: this is a re-adaptation of the original llama.cpp example/server.cpp for HTTP (https://github.com/ggerganov/llama.cpp/tree/master/examples/server),
+// but modified to work with gRPC
+//
+
 #include "utils.hpp"

 #include "arg.h"
@@ -43,9 +52,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI

+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;

 enum stop_type {
@@ -344,7 +353,6 @@ struct server_task {
         }
     }

-    //TODO: add back json_schema and grammar support
     // process "json_schema" and "grammar"
     if (data.contains("json_schema") && !data.contains("grammar")) {
         try {
@@ -1442,7 +1450,7 @@ struct server_slot {
             pos = text.find(word, from_pos);
         } else {
             // otherwise, partial stop
-            pos = find_partial_stop_string(word, text);
+            pos = string_find_partial_stop(text, word);
         }

         if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
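For context, a partial stop check looks for a suffix of the generated text that is a prefix of a stop word, so the server can hold those characters back until the stop word either completes or is ruled out. The sketch below illustrates the idea only; it is not the upstream `string_find_partial_stop` implementation, and the helper name `find_partial_stop` is made up for this example.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>

// Illustrative sketch: return the position in `text` where a prefix of `stop`
// starts at the very end of `text`, or std::string::npos if no suffix of
// `text` is a prefix of `stop`. A full match is handled separately by the
// regular text.find() branch shown in the diff above.
static size_t find_partial_stop(const std::string & text, const std::string & stop) {
    if (text.empty() || stop.empty()) {
        return std::string::npos;
    }
    const char last = text.back();
    // try the longest possible overlap first
    for (size_t overlap = std::min(text.size(), stop.size()); overlap > 0; overlap--) {
        if (stop[overlap - 1] == last &&
            text.compare(text.size() - overlap, overlap, stop, 0, overlap) == 0) {
            return text.size() - overlap;
        }
    }
    return std::string::npos;
}
```

With `stop = "</s>"` and `text` ending in `"...hello</"`, this returns the index of `'<'`, so those two characters are withheld from the streamed output until more tokens decide the match.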
@@ -2264,6 +2272,14 @@ struct server_context {
             slot.has_next_token = true;
         }

+        // if context shifting is disabled, make sure that we don't run out of context
+        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.stop = STOP_TYPE_LIMIT;
+            slot.has_next_token = false;
+
+            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+        }
+
         // check the limits
         if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stop = STOP_TYPE_LIMIT;
@@ -2964,7 +2980,8 @@ struct server_context {
             llama_kv_self_seq_rm (ctx, slot.id, n_keep, n_keep + n_discard);
             llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

-            if (slot.params.cache_prompt) {
+            // add generated tokens to cache
+            {
                 llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
                 for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
                     new_tokens[i - n_discard] = new_tokens[i];
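The loop above keeps the token-side copy of the KV cache aligned with the shifted KV cache: the first `n_keep` tokens are preserved, the next `n_discard` are dropped, and the rest slide down. A minimal stand-alone sketch of that bookkeeping (plain `std::vector` and illustrative names, not the actual `server_tokens` container):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Illustrative sketch of the context-shift bookkeeping on the cached tokens:
// keep the first n_keep tokens, drop the next n_discard, shift the remainder
// down, then truncate so the vector matches the shifted KV cache.
static void shift_token_cache(std::vector<llama_token> & tokens, size_t n_keep, size_t n_discard) {
    if (n_keep + n_discard > tokens.size()) {
        return; // nothing to discard
    }
    for (size_t i = n_keep + n_discard; i < tokens.size(); i++) {
        tokens[i - n_discard] = tokens[i];
    }
    tokens.resize(tokens.size() - n_discard);
}
```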
@@ -3009,10 +3026,7 @@ struct server_context {
             common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);

             slot.n_past += 1;

-            if (slot.params.cache_prompt) {
-                slot.cache_tokens.push_back(slot.sampled);
-            }
+            slot.cache_tokens.push_back(slot.sampled);

             SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
                 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
@@ -3184,6 +3198,11 @@ struct server_context {

                     SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
                 }
+            } else {
+                // if we don't cache the prompt, we have to remove the entire KV cache
+                llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+                slot.n_past = 0;
+                slot.cache_tokens.clear();
             }
         }
@@ -3217,7 +3236,7 @@ struct server_context {
                 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);

                 // remove the non-common part from the cache
-                //slot.cache_tokens.resize(slot.n_past);
+                slot.cache_tokens.keep_first(slot.n_past);

                 // check if we should process the image
                 if (slot.n_past < slot.n_prompt_tokens
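The `keep_first` call trims the cached prompt tokens down to the prefix that is still valid in the KV cache, so only the non-common tail of a new prompt has to be re-decoded. A minimal sketch of that prompt-prefix reuse with a plain `std::vector` (the names `common_prefix` and `reuse_prompt_cache` are made up for this example; the real code uses the `server_tokens` container and its `keep_first` method):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Illustrative sketch: length of the longest shared prefix between the cached
// tokens and the new prompt.
static size_t common_prefix(const std::vector<llama_token> & cache,
                            const std::vector<llama_token> & prompt) {
    size_t n = 0;
    while (n < cache.size() && n < prompt.size() && cache[n] == prompt[n]) {
        n++;
    }
    return n;
}

// Keep the reusable prefix, drop the rest of the cache; tokens prompt[n_past..]
// still need to be batched and decoded.
static void reuse_prompt_cache(std::vector<llama_token> & cache,
                               const std::vector<llama_token> & prompt,
                               size_t & n_past) {
    n_past = common_prefix(cache, prompt);
    cache.resize(n_past); // analogous to cache_tokens.keep_first(n_past)
}
```

In practice this is why resending a long chat history is cheap: only the newly appended turn falls outside the common prefix.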
@@ -3234,7 +3253,8 @@ struct server_context {
                         continue;
                     }

-                    if (slot.params.cache_prompt) {
+                    // add the image chunk to cache
+                    {
                         const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
                         slot.cache_tokens.push_back(chunk.get()); // copy
                     }
@@ -3255,9 +3275,7 @@ struct server_context {
                     const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

                     common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-                    if (slot.params.cache_prompt) {
-                        slot.cache_tokens.push_back(cur_tok);
-                    }
+                    slot.cache_tokens.push_back(cur_tok);

                     slot.n_prompt_tokens_processed++;
                     slot.n_past++;
@@ -4252,9 +4270,11 @@ public:

         body["stream"] = false;

+        /*
         if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
             return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "Pooling type 'none' is not OAI compatible. Please use a different pooling type");
         }
+        */

         // for the shape of input/content, see tokenize_input_prompts()
         json prompt = body.at("prompt");
@@ -4282,7 +4302,7 @@ public:
             task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr);

             // OAI-compat
-            task.params.oaicompat = OAICOMPAT_TYPE_EMBEDDING;
+            task.params.oaicompat = OAICOMPAT_TYPE_NONE;

             tasks.push_back(std::move(task));
         }
backend/cpp/llama/json.hpp (vendored): 24766 changed lines. File diff suppressed because it is too large.
prepare script (backend/cpp/llama)

@@ -9,9 +9,10 @@ done

 cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
 cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/

 if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
     echo "grpc-server already added"
 else
backend/cpp/llama/utils.hpp (vendored): 1282 changed lines. File diff suppressed because it is too large.
Multimodal template test (Go)

@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[img-0]bar"))
+			Expect(result).To(Equal("<__image__>bar"))
 		})

 		It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[img-0][img-1]bar"))
+			Expect(result).To(Equal("<__image__><__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
+			Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 				VideosInMessage: 0,
 			}, "bar")
 			Expect(err).NotTo(HaveOccurred())
-			Expect(result).To(Equal("[audio-0][img-2]bar"))
+			Expect(result).To(Equal("[audio-0]<__image__>bar"))
 		})
 		It("should handle messages with more images correctly", func() {
 			result, err := TemplateMultiModal("", MultiModalOptions{