From c56b6ddb1cee8b8b2a19ddeb9efdb464e1789f2e Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 13 Feb 2024 21:17:21 +0100
Subject: [PATCH] fix(llama.cpp): disable infinite context shifting (#1704)

Infinite context shifting can turn into an endless generation loop if the
model hallucinates and never stops answering. This has the unpleasant
effect that the prediction never terminates, especially with small models,
which tend to hallucinate.

Works around https://github.com/mudler/LocalAI/issues/1333 by disabling
context shifting.

See also upstream issue: https://github.com/ggerganov/llama.cpp/issues/3969
---
 backend/cpp/llama/grpc-server.cpp | 36 +++++++++++--------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 35ca6ea5..954e472a 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -1387,30 +1387,20 @@ struct llama_server_context
         {
             if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
             {
-                // Shift context
-                const int n_left    = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
-                const int n_discard = n_left / 2;
+                // START LOCALAI changes
+                // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
+                // See: https://github.com/mudler/LocalAI/issues/1333
+                // Context is exhausted, release the slot
+                slot.release();
+                send_final_response(slot);
+                slot.cache_tokens.clear();
+                slot.n_past = 0;
+                slot.truncated = false;
+                slot.has_next_token = true;
+                LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
 
-                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
-
-                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
-                {
-                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-                }
-
-                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                slot.n_past -= n_discard;
-
-                slot.truncated = true;
-
-                LOG_VERBOSE("context shift", {
-                    { "n_ctx", n_ctx },
-                    { "n_keep", params.n_keep },
-                    { "n_left", n_left },
-                });
+                continue;
+                // END LOCALAI changes
             }
         }
     }
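
Note for reviewers: the following is a minimal, self-contained C++ sketch of
the control flow this hunk introduces, kept outside the patch itself. The
Slot struct, its members, and the local send_final_response() helper are
stand-ins modeled on the names appearing in the diff above, not the actual
grpc-server.cpp types.

// Sketch (not the real grpc-server.cpp): when the prompt plus the generated
// tokens fill the context window, release the slot and flush the final
// response instead of shifting the KV cache.
#include <cstdio>
#include <vector>

struct Slot {
    int id = 0;
    int n_ctx = 8;                  // context window size for this slot
    int n_past = 0;                 // number of tokens already evaluated
    bool truncated = false;
    bool has_next_token = true;
    bool processing = true;
    std::vector<int> cache_tokens;  // tokens currently held in the KV cache

    bool is_processing() const { return processing; }
    void release() { processing = false; }
};

// Stand-in for send_final_response(): flush whatever was generated so far.
static void send_final_response(const Slot & slot) {
    std::printf("slot %d: sending final response (%zu cached tokens)\n",
                slot.id, slot.cache_tokens.size());
}

int main() {
    Slot slot;
    std::vector<int> system_tokens = {1, 2, 3};
    slot.cache_tokens = {4, 5, 6, 7, 8};
    slot.n_past = (int) slot.cache_tokens.size();

    if (slot.is_processing() &&
        system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) {
        // Context is exhausted: stop generating for this slot instead of
        // shifting the cache, which can loop forever on a rambling model.
        slot.release();
        send_final_response(slot);
        slot.cache_tokens.clear();
        slot.n_past = 0;
        slot.truncated = false;
        std::printf("Context exhausted. Slot %d released\n", slot.id);
    }
    return 0;
}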