diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index e688a7fa..d4845903 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -3587,6 +3587,10 @@ static void start_llama_server(server_context& ctx_server) {
             common_chat_templates_source(ctx_server.chat_templates.get()),
             common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
 
+    // Reset the chat templates
+    // TODO: We should make this configurable by respecting the option that is already present in LocalAI for vLLM
+    ctx_server.chat_templates.reset();
+
     ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
         ctx_server.process_single_task(std::move(task));
     });
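
The TODO in the hunk notes that the reset should eventually respect the option LocalAI already exposes for the vLLM backend instead of being unconditional. A minimal sketch of how such a gate could look is below; it is illustrative only. The stub type, the `template_reset_requested` helper, and the environment-variable name are assumptions for the sketch and not part of the actual patch or of llama.cpp's API.

```cpp
// Sketch only: gate the chat-template reset behind a configuration flag.
// The flag source (an env var here) and all names are hypothetical; the
// real patch above unconditionally calls ctx_server.chat_templates.reset().
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for the server's chat-template handle (normally a unique_ptr
// owned by server_context in llama.cpp's gRPC server).
struct chat_templates_stub {
    std::string source = "gguf-embedded";
};

static bool template_reset_requested() {
    // Hypothetical: honour an option passed down from LocalAI's model
    // configuration, approximated here by an environment variable.
    const char * v = std::getenv("DISABLE_EMBEDDED_CHAT_TEMPLATE");
    return v != nullptr && std::string(v) != "0";
}

int main() {
    auto chat_templates = std::make_unique<chat_templates_stub>();

    if (template_reset_requested()) {
        // Mirrors the diff: dropping the templates makes the server fall
        // back to the template LocalAI supplies instead of the GGUF one.
        chat_templates.reset();
    }

    std::cout << (chat_templates ? "using embedded template"
                                 : "embedded template discarded")
              << std::endl;
    return 0;
}
```

Wiring the flag through the backend's model options, the way the vLLM backend's existing option is handled, would keep configuration consistent across backends, but that is a larger change than the unconditional reset shown in the diff.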