mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-20 10:35:01 +00:00)
Sync from server.cpp
This commit is contained in:
parent 3d397d8aab
commit 6c751d98f3
1 changed file with 14 additions and 13 deletions
@@ -52,9 +52,9 @@ using grpc::Server;
 using grpc::ServerBuilder;
 using grpc::ServerContext;
 using grpc::Status;
-using json = nlohmann::ordered_json;
 // END LocalAI
 
+using json = nlohmann::ordered_json;
 constexpr int HTTP_POLLING_SECONDS = 1;
 
 enum stop_type {
@@ -353,7 +353,6 @@ struct server_task {
             }
         }
 
-        //TODO: add back json_schema and grammar support
         // process "json_schema" and "grammar"
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
@@ -1451,7 +1450,7 @@ struct server_slot {
                 pos = text.find(word, from_pos);
             } else {
                 // otherwise, partial stop
-                pos = find_partial_stop_string(word, text);
+                pos = string_find_partial_stop(text, word);
             }
 
             if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) {
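Note: the upstream helper was renamed from find_partial_stop_string(word, text) to string_find_partial_stop(text, word), and the argument order is swapped, so this line has to change on both counts when syncing. For illustration, a minimal sketch of what such a helper does; this is an assumed-equivalent reimplementation for explanation, not the actual common library code. It returns the position where a prefix of the stop word begins at the tail of the generated text, so the server can hold those characters back until the match either completes or falls apart.

#include <algorithm>
#include <string>

// Sketch only: find where a partial occurrence of `stop` starts at the
// end of `text`; returns std::string::npos when no suffix of `text` is
// a prefix of `stop`. Assumed to mirror string_find_partial_stop.
static size_t sketch_find_partial_stop(const std::string & text, const std::string & stop) {
    if (!text.empty() && !stop.empty()) {
        const char last = text.back();
        // try the longest candidate first, so the longest partial match wins
        for (size_t n = std::min(text.size(), stop.size()); n > 0; --n) {
            if (stop[n - 1] == last && text.compare(text.size() - n, n, stop, 0, n) == 0) {
                return text.size() - n;
            }
        }
    }
    return std::string::npos;
}

For example, with text "Hello, how" and stop "how are you", the sketch returns 7, the index where the dangling "how" begins.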
@@ -2973,7 +2972,8 @@ struct server_context {
                 llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
                 llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
 
-                if (slot.params.cache_prompt) {
+                // add generated tokens to cache
+                {
                     llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
                     for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
                         new_tokens[i - n_discard] = new_tokens[i];
@@ -3018,10 +3018,7 @@ struct server_context {
             common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
-
-            if (slot.params.cache_prompt) {
-                slot.cache_tokens.push_back(slot.sampled);
-            }
+            slot.cache_tokens.push_back(slot.sampled);
 
             SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
                 slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
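Design note (my reading of this hunk and the -2972 one above): slot.cache_tokens is now updated unconditionally during decoding, rather than only when params.cache_prompt is set. The bookkeeping therefore always mirrors what is actually in the slot's KV cache; cache_prompt no longer gates the tracking itself, only whether that state is reused for the next request, as the new else branch in the following hunk shows.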
@@ -3193,6 +3190,11 @@ struct server_context {
 
                         SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
                     }
+                } else {
+                    // if we don't cache the prompt, we have to remove the entire KV cache
+                    llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
+                    slot.n_past = 0;
+                    slot.cache_tokens.clear();
                 }
             }
 
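Taken together, the last three hunks restructure prompt caching roughly as follows; this is a paraphrase of the diff for orientation, not a verbatim excerpt of server.cpp:

// Paraphrased control flow after this sync: token bookkeeping is
// unconditional, and cache_prompt only decides between reusing the
// previous state or starting the slot from scratch.
if (slot.params.cache_prompt) {
    // reuse the longest common prefix between the new prompt and
    // the tokens already in the KV cache (the context reuse above)
} else {
    // no reuse: drop the slot's entire KV cache and its token mirror
    llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
    slot.n_past = 0;
    slot.cache_tokens.clear();
}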
@@ -3226,7 +3228,7 @@ struct server_context {
                 SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
 
                 // remove the non-common part from the cache
-                //slot.cache_tokens.resize(slot.n_past);
+                slot.cache_tokens.keep_first(slot.n_past);
 
                 // check if we should process the image
                 if (slot.n_past < slot.n_prompt_tokens
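The previously commented-out resize() is replaced by keep_first(). Presumably cache_tokens is no longer a plain vector of token ids but a container that can also hold multimodal chunks, as the find_chunk()/push_back(chunk) calls in the next hunk suggest, so truncation goes through a dedicated helper. A minimal sketch of that idea, with an assumed-for-illustration container type:

#include <cstddef>
#include <vector>

// Assumed-for-illustration stand-in for the server's token container:
// truncation is funneled through keep_first() so that any bookkeeping
// beyond the raw token vector stays consistent.
struct sketch_server_tokens {
    std::vector<int> toks; // token ids (the real container also tracks chunks)

    // keep only the first n tokens, dropping the non-common tail
    void keep_first(size_t n) {
        if (n < toks.size()) {
            toks.resize(n);
        }
    }
};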
@@ -3243,7 +3245,8 @@ struct server_context {
                         continue;
                     }
 
-                    if (slot.params.cache_prompt) {
+                    // add the image chunk to cache
+                    {
                         const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
                         slot.cache_tokens.push_back(chunk.get()); // copy
                     }
@@ -3264,9 +3267,7 @@ struct server_context {
                     const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
 
                     common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
-                    if (slot.params.cache_prompt) {
-                        slot.cache_tokens.push_back(cur_tok);
-                    }
+                    slot.cache_tokens.push_back(cur_tok);
 
                     slot.n_prompt_tokens_processed++;
                     slot.n_past++;
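Closing note: the last two hunks apply to prompt processing the same change that the -3018 hunk applied to generation. Image chunks and text tokens are appended to cache_tokens unconditionally, keeping it a faithful mirror of the KV cache whether or not the prompt will be reused by a later request.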