Mirror of https://github.com/mudler/LocalAI.git
fix(llama.cpp): improve context shift handling (#4820)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
parent 83202cae54
commit 9e32fda304
1 changed file with 13 additions and 5 deletions
@@ -1155,6 +1155,14 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
+        if (slot.n_past >= slot.n_ctx) {
+            slot.truncated = true;
+            slot.stopped_limit = true;
+            slot.has_next_token = false;
+
+            LOG_VERBOSE("stopped due to running out of context capacity", {});
+        }
+
         if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
         {
             slot.stopped_eos = true;
@@ -1627,17 +1635,17 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
+                // this check is redundant (for good)
+                // we should never get here, because generation should already stopped in process_token()
+
                 // START LOCALAI changes
                 // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                 // See: https://github.com/mudler/LocalAI/issues/1333
                 // Context is exhausted, release the slot
                 slot.release();
                 send_final_response(slot);
-                slot.cache_tokens.clear();
-                slot.n_past = 0;
-                slot.truncated = false;
-                slot.has_next_token = true;
-                LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                slot.has_next_token = false;
+                LOG_ERROR("context is exhausted, release the slot", {});
 
                 continue;
                 // END LOCALAI changes
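Taken together, the two hunks change how the server reacts when a slot runs out of context: generation is now stopped in process_token() as soon as slot.n_past reaches slot.n_ctx, and the later check in the slot-update loop releases an exhausted slot with has_next_token = false and an error log, instead of clearing its cache and resetting n_past (which could previously let generation keep looping). The C++ below is a minimal, self-contained sketch of that logic under those assumptions, not the server code itself: the slot_state struct and both helper names are hypothetical, and the LOG_VERBOSE / LOG_ERROR calls are reduced to printf; only the field names and the two conditions come from the diff above.

    #include <cstddef>
    #include <cstdio>

    // Hypothetical, minimal stand-in for the server slot; only the fields
    // touched by this commit are included.
    struct slot_state {
        int  id             = 0;
        int  n_past         = 0;   // tokens already evaluated for this slot
        int  n_ctx          = 0;   // context window available to this slot
        bool truncated      = false;
        bool stopped_limit  = false;
        bool has_next_token = true;
    };

    // First hunk (process_token path): stop producing tokens as soon as the
    // slot's context window is full.
    void stop_if_out_of_context(slot_state &slot) {
        if (slot.n_past >= slot.n_ctx) {
            slot.truncated      = true;
            slot.stopped_limit  = true;
            slot.has_next_token = false;
            std::printf("slot %d: stopped due to running out of context capacity\n", slot.id);
        }
    }

    // Second hunk (slot-update path): a still-processing slot whose cached
    // tokens already fill the context is marked finished and reported as an
    // error, rather than having its cache cleared and n_past reset.
    void release_if_exhausted(slot_state &slot, std::size_t cached_tokens) {
        if (cached_tokens >= (std::size_t) slot.n_ctx) {
            slot.has_next_token = false;
            std::printf("slot %d: context is exhausted, releasing the slot\n", slot.id);
        }
    }

The design intent visible in the diff is that the second check should now be redundant: once process_token() stops a slot at the context limit, the slot-update path should never see an exhausted, still-processing slot, so reaching it is logged as an error rather than silently worked around.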