From 7b75e9de2d54ea2dcb522c91f3c615826239e074 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 28 Nov 2024 09:34:35 +0100 Subject: [PATCH 01/89] fix(rwkv model): add stoptoken (#4283) Signed-off-by: Ettore Di Giacinto --- gallery/rwkv.yaml | 1 + tests/models_fixtures/rwkv.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/gallery/rwkv.yaml b/gallery/rwkv.yaml index 41dfcfad..68693799 100644 --- a/gallery/rwkv.yaml +++ b/gallery/rwkv.yaml @@ -16,6 +16,7 @@ config_file: | stopwords: - 'Assistant:' + - '' template: chat: "{{.Input}}\nAssistant: " diff --git a/tests/models_fixtures/rwkv.yaml b/tests/models_fixtures/rwkv.yaml index bf54394f..f66cfe21 100644 --- a/tests/models_fixtures/rwkv.yaml +++ b/tests/models_fixtures/rwkv.yaml @@ -14,6 +14,7 @@ roles: stopwords: - 'Assistant:' +- '' template: chat: | From f4547fcf8a98c35c11f25062322913c5a444d76f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 28 Nov 2024 09:34:44 +0100 Subject: [PATCH 02/89] chore(model gallery): add qwq-32b-preview (#4284) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b4b73e4b..b52511bf 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1631,6 +1631,25 @@ - filename: EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf sha256: 03ea0ecac3ee24a332ca43cf925b669c58714b9754be0f4bc232bd996681ef4b uri: huggingface://bartowski/EVA-Qwen2.5-72B-v0.2-GGUF/EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwq-32b-preview" + urls: + - https://huggingface.co/Qwen/QwQ-32B-Preview + - https://huggingface.co/bartowski/QwQ-32B-Preview-GGUF + description: | + QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. As a preview release, it demonstrates promising analytical abilities while having several important limitations: + + Language Mixing and Code-Switching: The model may mix languages or switch between them unexpectedly, affecting response clarity. + Recursive Reasoning Loops: The model may enter circular reasoning patterns, leading to lengthy responses without a conclusive answer. + Safety and Ethical Considerations: The model requires enhanced safety measures to ensure reliable and secure performance, and users should exercise caution when deploying it. + Performance and Benchmark Limitations: The model excels in math and coding but has room for improvement in other areas, such as common sense reasoning and nuanced language understanding. 
+ overrides: + parameters: + model: QwQ-32B-Preview-Q4_K_M.gguf + files: + - filename: QwQ-32B-Preview-Q4_K_M.gguf + sha256: c499801e682e2379528090c50e106837ca1d69dc3bf3ff3a9af830a0eb49cdf6 + uri: huggingface://bartowski/QwQ-32B-Preview-GGUF/QwQ-32B-Preview-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From e001fada6caedb30111e8dd008c3c734f5fb8a30 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 28 Nov 2024 09:35:38 +0100 Subject: [PATCH 03/89] chore(model gallery): add llama-smoltalk-3.2-1b-instruct (#4285) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b52511bf..b6de0d6a 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -704,6 +704,31 @@ - filename: Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf uri: huggingface://QuantFactory/Llama-Sentient-3.2-3B-Instruct-GGUF/Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf sha256: 3f855ce0522bfdc39fc826162ba6d89f15cc3740c5207da10e70baa3348b7812 +- !!merge <<: *llama32 + name: "llama-smoltalk-3.2-1b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Llama-SmolTalk-3.2-1B-Instruct + - https://huggingface.co/mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF + description: | + The Llama-SmolTalk-3.2-1B-Instruct model is a lightweight, instruction-tuned model designed for efficient text generation and conversational AI tasks. With a 1B parameter architecture, this model strikes a balance between performance and resource efficiency, making it ideal for applications requiring concise, contextually relevant outputs. The model has been fine-tuned to deliver robust instruction-following capabilities, catering to both structured and open-ended queries. + Key Features: + + Instruction-Tuned Performance: Optimized to understand and execute user-provided instructions across diverse domains. + Lightweight Architecture: With just 1 billion parameters, the model provides efficient computation and storage without compromising output quality. + Versatile Use Cases: Suitable for tasks like content generation, conversational interfaces, and basic problem-solving. + + Intended Applications: + + Conversational AI: Engage users with dynamic and contextually aware dialogue. + Content Generation: Produce summaries, explanations, or other creative text outputs efficiently. + Instruction Execution: Follow user commands to generate precise and relevant responses. 
+ overrides: + parameters: + model: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf + files: + - filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf + sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e + uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" From 0d6c3a7d57101428aec4100d0f7bca765ee684a7 Mon Sep 17 00:00:00 2001 From: mintyleaf Date: Thu, 28 Nov 2024 17:47:56 +0400 Subject: [PATCH 04/89] feat: include tokens usage for streamed output (#4282) Use pb.Reply instead of []byte with Reply.GetMessage() in llama grpc to get the proper usage data in reply streaming mode at the last [DONE] frame Co-authored-by: Ettore Di Giacinto --- core/backend/llm.go | 12 ++++++++++-- core/http/endpoints/openai/chat.go | 9 ++++++++- pkg/grpc/backend.go | 2 +- pkg/grpc/client.go | 6 +++--- pkg/grpc/embed.go | 6 +++--- 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 4491a191..9e121f79 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -117,8 +117,12 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im ss := "" var partialRune []byte - err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) { - partialRune = append(partialRune, chars...) + err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) { + msg := reply.GetMessage() + partialRune = append(partialRune, msg...) + + tokenUsage.Prompt = int(reply.PromptTokens) + tokenUsage.Completion = int(reply.Tokens) for len(partialRune) > 0 { r, size := utf8.DecodeRune(partialRune) @@ -132,6 +136,10 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im partialRune = partialRune[size:] } + + if len(msg) == 0 { + tokenCallback("", tokenUsage) + } }) return LLMResponse{ Response: ss, diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 1ac1387e..b03b18bd 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -39,11 +39,15 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup responses <- initialMessage ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { + choices := []schema.Choice{} + if s != "" { + choices = append(choices, schema.Choice{Delta: &schema.Message{Content: &s}, Index: 0}) + } resp := schema.OpenAIResponse{ ID: id, Created: created, Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. 
- Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}}, + Choices: choices, Object: "chat.completion.chunk", Usage: schema.OpenAIUsage{ PromptTokens: usage.Prompt, @@ -465,6 +469,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup toolsCalled := false for ev := range responses { usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it + if len(ev.Choices) == 0 { + break + } if len(ev.Choices[0].Delta.ToolCalls) > 0 { toolsCalled = true } diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go index 21435891..fabc0268 100644 --- a/pkg/grpc/backend.go +++ b/pkg/grpc/backend.go @@ -37,7 +37,7 @@ type Backend interface { Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error) - PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error + PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error) diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index 9c8b302e..ca207c3f 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -136,7 +136,7 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp return client.LoadModel(ctx, in, opts...) 
} -func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error { +func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error { if !c.parallel { c.opMutex.Lock() defer c.opMutex.Unlock() @@ -158,7 +158,7 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun } for { - feature, err := stream.Recv() + reply, err := stream.Recv() if err == io.EOF { break } @@ -167,7 +167,7 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun return err } - f(feature.GetMessage()) + f(reply) } return nil diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go index a5828a5f..79648c5a 100644 --- a/pkg/grpc/embed.go +++ b/pkg/grpc/embed.go @@ -35,7 +35,7 @@ func (e *embedBackend) LoadModel(ctx context.Context, in *pb.ModelOptions, opts return e.s.LoadModel(ctx, in) } -func (e *embedBackend) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error { +func (e *embedBackend) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error { bs := &embedBackendServerStream{ ctx: ctx, fn: f, @@ -97,11 +97,11 @@ func (e *embedBackend) GetTokenMetrics(ctx context.Context, in *pb.MetricsReques type embedBackendServerStream struct { ctx context.Context - fn func(s []byte) + fn func(reply *pb.Reply) } func (e *embedBackendServerStream) Send(reply *pb.Reply) error { - e.fn(reply.GetMessage()) + e.fn(reply) return nil } From 58ff47de2611462a9d9971038cf9f218099df3af Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 28 Nov 2024 22:16:44 +0100 Subject: [PATCH 05/89] feat(bark-cpp): add new bark.cpp backend (#4287) * feat(bark-cpp): add new bark.cpp backend Signed-off-by: Ettore Di Giacinto * build on linux only for now Signed-off-by: Ettore Di Giacinto * track bark.cpp in CI bumps Signed-off-by: Ettore Di Giacinto * Drop old entries from bumper Signed-off-by: Ettore Di Giacinto * No need to test rwkv specifically, now part of llama.cpp Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/bump_deps.yaml | 16 +----- .gitignore | 1 + Makefile | 37 +++++++++++++- backend/go/bark/Makefile | 25 ++++++++++ backend/go/bark/gobark.cpp | 85 ++++++++++++++++++++++++++++++++ backend/go/bark/gobark.go | 52 +++++++++++++++++++ backend/go/bark/gobark.h | 8 +++ backend/go/bark/main.go | 20 ++++++++ core/http/app_test.go | 67 ------------------------- 9 files changed, 229 insertions(+), 82 deletions(-) create mode 100644 backend/go/bark/Makefile create mode 100644 backend/go/bark/gobark.cpp create mode 100644 backend/go/bark/gobark.go create mode 100644 backend/go/bark/gobark.h create mode 100644 backend/go/bark/main.go diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index c94a134d..8f30f1a0 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -12,24 +12,12 @@ jobs: - repository: "ggerganov/llama.cpp" variable: "CPPLLAMA_VERSION" branch: "master" - - repository: "go-skynet/go-ggml-transformers.cpp" - variable: "GOGGMLTRANSFORMERS_VERSION" - branch: "master" - - repository: "donomii/go-rwkv.cpp" - variable: "RWKV_VERSION" - branch: "main" - repository: "ggerganov/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" - - repository: "go-skynet/go-bert.cpp" - variable: "BERT_VERSION" - branch: "master" - - repository: 
"go-skynet/bloomz.cpp" - variable: "BLOOMZ_VERSION" + - repository: "PABannier/bark.cpp" + variable: "BARKCPP_VERSION" branch: "main" - - repository: "mudler/go-ggllm.cpp" - variable: "GOGGLLM_VERSION" - branch: "master" - repository: "mudler/go-stable-diffusion" variable: "STABLEDIFFUSION_VERSION" branch: "master" diff --git a/.gitignore b/.gitignore index 9f31131f..d821c435 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /sources/ __pycache__/ *.a +*.o get-sources prepare-sources /backend/cpp/llama/grpc-server diff --git a/Makefile b/Makefile index d94b6bad..b198e0d4 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,10 @@ STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057 +# bark.cpp +BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git +BARKCPP_VERSION?=v1.0.0 + ONNX_VERSION?=1.20.0 ONNX_ARCH?=x64 ONNX_OS?=linux @@ -201,6 +205,13 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper + +ifeq ($(ONNX_OS),linux) +ifeq ($(ONNX_ARCH),x64) + ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp +endif +endif + ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC) @@ -233,6 +244,22 @@ sources/go-llama.cpp: git checkout $(GOLLAMA_VERSION) && \ git submodule update --init --recursive --depth 1 --single-branch +sources/bark.cpp: + git clone --recursive https://github.com/PABannier/bark.cpp.git sources/bark.cpp && \ + cd sources/bark.cpp && \ + git checkout $(BARKCPP_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +sources/bark.cpp/build/libbark.a: sources/bark.cpp + cd sources/bark.cpp && \ + mkdir build && \ + cd build && \ + cmake $(CMAKE_ARGS) .. && \ + cmake --build . 
--config Release + +backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a + $(MAKE) -C backend/go/bark libbark.a + sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a @@ -302,7 +329,7 @@ sources/whisper.cpp: sources/whisper.cpp/libwhisper.a: sources/whisper.cpp cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a -get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp +get-sources: sources/go-llama.cpp sources/go-piper sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp replace: $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp @@ -343,6 +370,7 @@ clean: ## Remove build related file rm -rf release/ rm -rf backend-assets/* $(MAKE) -C backend/cpp/grpc clean + $(MAKE) -C backend/go/bark clean $(MAKE) -C backend/cpp/llama clean rm -rf backend/cpp/llama-* || true $(MAKE) dropreplace @@ -792,6 +820,13 @@ ifneq ($(UPX),) $(UPX) backend-assets/grpc/llama-ggml endif +backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc + CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \ + $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/ +ifneq ($(UPX),) + $(UPX) backend-assets/grpc/bark-cpp +endif + backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/ diff --git a/backend/go/bark/Makefile b/backend/go/bark/Makefile new file mode 100644 index 00000000..e8902615 --- /dev/null +++ b/backend/go/bark/Makefile @@ -0,0 +1,25 @@ +INCLUDE_PATH := $(abspath ./) +LIBRARY_PATH := $(abspath ./) + +AR?=ar + +BUILD_TYPE?= +# keep standard at C11 and C++11 +CXXFLAGS = -I. 
-I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC +LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm + +# warnings +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function + +gobark.o: + $(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS) + +libbark.a: gobark.o + cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./ + $(AR) rcs libbark.a gobark.o + $(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o + $(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o + $(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o + +clean: + rm -f gobark.o libbark.a \ No newline at end of file diff --git a/backend/go/bark/gobark.cpp b/backend/go/bark/gobark.cpp new file mode 100644 index 00000000..b5f414b8 --- /dev/null +++ b/backend/go/bark/gobark.cpp @@ -0,0 +1,85 @@ +#include +#include + +#include "bark.h" +#include "gobark.h" +#include "common.h" +#include "ggml.h" + +struct bark_context *c; + +void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) { + if (step == bark_encoding_step::SEMANTIC) { + printf("\rGenerating semantic tokens... %d%%", progress); + } else if (step == bark_encoding_step::COARSE) { + printf("\rGenerating coarse tokens... %d%%", progress); + } else if (step == bark_encoding_step::FINE) { + printf("\rGenerating fine tokens... %d%%", progress); + } + fflush(stdout); +} + +int load_model(char *model) { + // initialize bark context + struct bark_context_params ctx_params = bark_context_default_params(); + bark_params params; + + params.model_path = model; + + // ctx_params.verbosity = verbosity; + ctx_params.progress_callback = bark_print_progress_callback; + ctx_params.progress_callback_user_data = nullptr; + + struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); + return 1; + } + + c = bctx; + + return 0; +} + +int tts(char *text,int threads, char *dst ) { + + ggml_time_init(); + const int64_t t_main_start_us = ggml_time_us(); + + // generate audio + if (!bark_generate_audio(c, text, threads)) { + fprintf(stderr, "%s: An error occured. 
If the problem persists, feel free to open an issue to report it.\n", __func__); + return 1; + } + + const float *audio_data = bark_get_audio_data(c); + if (audio_data == NULL) { + fprintf(stderr, "%s: Could not get audio data\n", __func__); + return 1; + } + + const int audio_arr_size = bark_get_audio_data_size(c); + + std::vector audio_arr(audio_data, audio_data + audio_arr_size); + + write_wav_on_disk(audio_arr, dst); + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + const int64_t t_load_us = bark_get_load_time(c); + const int64_t t_eval_us = bark_get_eval_time(c); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f); + printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f); + } + + return 0; +} + +int unload() { + bark_free(c); +} + diff --git a/backend/go/bark/gobark.go b/backend/go/bark/gobark.go new file mode 100644 index 00000000..133a4a39 --- /dev/null +++ b/backend/go/bark/gobark.go @@ -0,0 +1,52 @@ +package main + +// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers +// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon +// #include +// #include +import "C" + +import ( + "fmt" + "unsafe" + + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" +) + +type Bark struct { + base.SingleThread + threads int +} + +func (sd *Bark) Load(opts *pb.ModelOptions) error { + + sd.threads = int(opts.Threads) + + modelFile := C.CString(opts.ModelFile) + defer C.free(unsafe.Pointer(modelFile)) + + ret := C.load_model(modelFile) + if ret != 0 { + return fmt.Errorf("inference failed") + } + + return nil +} + +func (sd *Bark) TTS(opts *pb.TTSRequest) error { + t := C.CString(opts.Text) + defer C.free(unsafe.Pointer(t)) + + dst := C.CString(opts.Dst) + defer C.free(unsafe.Pointer(dst)) + + threads := C.int(sd.threads) + + ret := C.tts(t, threads, dst) + if ret != 0 { + return fmt.Errorf("inference failed") + } + + return nil +} diff --git a/backend/go/bark/gobark.h b/backend/go/bark/gobark.h new file mode 100644 index 00000000..06fb965d --- /dev/null +++ b/backend/go/bark/gobark.h @@ -0,0 +1,8 @@ +#ifdef __cplusplus +extern "C" { +#endif +int load_model(char *model); +int tts(char *text,int threads, char *dst ); +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/backend/go/bark/main.go b/backend/go/bark/main.go new file mode 100644 index 00000000..840a687d --- /dev/null +++ b/backend/go/bark/main.go @@ -0,0 +1,20 @@ +package main + +// Note: this is started internally by LocalAI and a server is allocated for each model +import ( + "flag" + + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +func main() { + flag.Parse() + + if err := grpc.StartServer(*addr, &Bark{}); err != nil { + panic(err) + } +} diff --git a/core/http/app_test.go b/core/http/app_test.go index 28ed0ab9..83fb0e73 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -5,14 +5,12 @@ import ( "context" "embed" "encoding/json" - "errors" "fmt" "io" "net/http" "os" "path/filepath" "runtime" - "strings" "github.com/mudler/LocalAI/core/config" . 
"github.com/mudler/LocalAI/core/http" @@ -913,71 +911,6 @@ var _ = Describe("API test", func() { }) }) - Context("backends", func() { - It("runs rwkv completion", func() { - if runtime.GOOS != "linux" { - Skip("test supported only on linux") - } - resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(resp.Choices) > 0).To(BeTrue()) - Expect(resp.Choices[0].Text).To(ContainSubstring("five")) - - stream, err := client.CreateCompletionStream(context.TODO(), openai.CompletionRequest{ - Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,", Stream: true, - }) - Expect(err).ToNot(HaveOccurred()) - defer stream.Close() - - tokens := 0 - text := "" - for { - response, err := stream.Recv() - if errors.Is(err, io.EOF) { - break - } - - Expect(err).ToNot(HaveOccurred()) - text += response.Choices[0].Text - tokens++ - } - Expect(text).ToNot(BeEmpty()) - Expect(text).To(ContainSubstring("five")) - Expect(tokens).ToNot(Or(Equal(1), Equal(0))) - }) - It("runs rwkv chat completion", func() { - if runtime.GOOS != "linux" { - Skip("test supported only on linux") - } - resp, err := client.CreateChatCompletion(context.TODO(), - openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(resp.Choices) > 0).To(BeTrue()) - Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five"), ContainSubstring("5"))) - - stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) - Expect(err).ToNot(HaveOccurred()) - defer stream.Close() - - tokens := 0 - text := "" - for { - response, err := stream.Recv() - if errors.Is(err, io.EOF) { - break - } - - Expect(err).ToNot(HaveOccurred()) - text += response.Choices[0].Delta.Content - tokens++ - } - Expect(text).ToNot(BeEmpty()) - Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five"))) - - Expect(tokens).ToNot(Or(Equal(1), Equal(0))) - }) - }) - // See tests/integration/stores_test Context("Stores", Label("stores"), func() { From 55aad5f52528aefb4a1bac7e8d0175c9d232c0ff Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 29 Nov 2024 00:04:31 +0100 Subject: [PATCH 06/89] chore: :arrow_up: Update ggerganov/llama.cpp to `dc22344088a7ee81a1e4f096459b03a72f24ccdc` (#4288) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b198e0d4..c6e80552 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=3ad5451f3b75809e3033e4e577b9f60bcaf6676a +CPPLLAMA_VERSION?=dc22344088a7ee81a1e4f096459b03a72f24ccdc # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 2c8a87b1e4f33ee0c4a57d40b9d6554eb3eee4ac Mon Sep 17 00:00:00 2001 From: "LocalAI 
[bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 29 Nov 2024 22:58:24 +0100 Subject: [PATCH 07/89] chore: :arrow_up: Update ggerganov/llama.cpp to `3a8e9af402f7893423bdab444aa16c5d9a2d429a` (#4290) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6e80552..3b0c4b2b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dc22344088a7ee81a1e4f096459b03a72f24ccdc +CPPLLAMA_VERSION?=3a8e9af402f7893423bdab444aa16c5d9a2d429a # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 61358e4d355f8a9290fb9764b2ac457f45ac1d7a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 30 Nov 2024 11:02:41 +0100 Subject: [PATCH 08/89] chore(model gallery): add q2.5-32b-slush-i1 (#4292) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b6de0d6a..c1d2a44b 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1675,6 +1675,22 @@ - filename: QwQ-32B-Preview-Q4_K_M.gguf sha256: c499801e682e2379528090c50e106837ca1d69dc3bf3ff3a9af830a0eb49cdf6 uri: huggingface://bartowski/QwQ-32B-Preview-GGUF/QwQ-32B-Preview-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "q2.5-32b-slush-i1" + urls: + - https://huggingface.co/crestf411/Q2.5-32B-Slush + - https://huggingface.co/mradermacher/Q2.5-32B-Slush-i1-GGUF + description: | + Slush is a two-stage model trained with high LoRA dropout, where stage 1 is a pretraining continuation on the base model, aimed at boosting the model's creativity and writing capabilities. This is then merged into the instruction tune model, and stage 2 is a fine tuning step on top of this to further enhance its roleplaying capabilities and/or to repair any damage caused in the stage 1 merge. + This is still early stage. As always, feedback is welcome, and begone if you demand perfection. + The second stage, like the Sunfall series, follows the Silly Tavern preset (ChatML), so ymmv in particular if you use some other tool and/or preset. 
+ overrides: + parameters: + model: Q2.5-32B-Slush.i1-Q4_K_M.gguf + files: + - filename: Q2.5-32B-Slush.i1-Q4_K_M.gguf + sha256: 95aecaf43077dabc72d3b556923ede2563325e1c89863800229cfa8b7f1c9659 + uri: huggingface://mradermacher/Q2.5-32B-Slush-i1-GGUF/Q2.5-32B-Slush.i1-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 1167487f5ec929f0b1fa2d61475ff6ba8d60b04c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 30 Nov 2024 11:15:05 +0100 Subject: [PATCH 09/89] chore(model gallery): add freyja-v4.95-maldv-7b-non-fiction-i1 (#4293) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index c1d2a44b..d8268f02 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -7456,6 +7456,27 @@ - filename: LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf sha256: 5b88fb4537339996c04e4a1b6ef6a2d555c4103b6378e273ae9c6c5e77af67eb uri: huggingface://bartowski/LLAMA-3_8B_Unaligned_BETA-GGUF/LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf +- !!merge <<: *llama3 + name: "freyja-v4.95-maldv-7b-non-fiction-i1" + urls: + - https://huggingface.co/MrRobotoAI/Freyja-v4.95-maldv-7b-NON-FICTION + - https://huggingface.co/mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF + description: | + This model was merged using the Model Stock merge method using aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K as a base. + The following models were included in the merge: + maldv/llama-3-fantasy-writer-8b + maldv/badger-iota-llama-3-8b + maldv/badger-lambda-llama-3-8b + maldv/badger-mu-llama-3-8b + maldv/badger-kappa-llama-3-8b + maldv/badger-writer-llama-3-8b + overrides: + parameters: + model: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf + files: + - filename: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf + sha256: cdc0f4de6df2ba120835fbd25c2a0ae2af8548f46d2c40c7a018c51c3d19e0c0 + uri: huggingface://mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF/Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf - &chatml ### ChatML url: "github:mudler/LocalAI/gallery/chatml.yaml@master" From ad31daf03b29d8e3626fd9184e782ac68f265901 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 30 Nov 2024 11:18:25 +0100 Subject: [PATCH 10/89] chore(model gallery): add qwestion-24b (#4294) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index d8268f02..feebd0c0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1691,6 +1691,25 @@ - filename: Q2.5-32B-Slush.i1-Q4_K_M.gguf sha256: 95aecaf43077dabc72d3b556923ede2563325e1c89863800229cfa8b7f1c9659 uri: huggingface://mradermacher/Q2.5-32B-Slush-i1-GGUF/Q2.5-32B-Slush.i1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwestion-24b" + urls: + - https://huggingface.co/CultriX/Qwestion-14B + - https://huggingface.co/mradermacher/Qwestion-24B-GGUF + description: | + This model was merged using the DARE TIES merge method using Qwen/Qwen2.5-14B as a base. 
+ The following models were included in the merge: + allknowingroger/Qwenslerp2-14B + rombodawg/Rombos-LLM-V2.6-Qwen-14b + VAGOsolutions/SauerkrautLM-v2-14b-DPO + CultriX/Qwen2.5-14B-Wernicke + overrides: + parameters: + model: Qwestion-24B.Q4_K_M.gguf + files: + - filename: Qwestion-24B.Q4_K_M.gguf + sha256: 5d493bd81cfeef66d80101260145ab1d1d0428ef2191edce62b58391bd0fff0e + uri: huggingface://mradermacher/Qwestion-24B-GGUF/Qwestion-24B.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 9044b17e4dae062ab6e09a38f9f525aab9404428 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 30 Nov 2024 22:46:07 +0100 Subject: [PATCH 11/89] chore: :arrow_up: Update ggerganov/llama.cpp to `0c39f44d70d058940fe2afe50cfc789e3e44d756` (#4295) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3b0c4b2b..435447fa 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=3a8e9af402f7893423bdab444aa16c5d9a2d429a +CPPLLAMA_VERSION?=0c39f44d70d058940fe2afe50cfc789e3e44d756 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 0fcefbc168f0983cbf36bb32a0d1cbfc9c068be7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 1 Dec 2024 10:12:17 +0100 Subject: [PATCH 12/89] chore(model gallery): add volare-i1 (#4296) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index feebd0c0..922adf1c 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4850,6 +4850,23 @@ - filename: G2-9B-Sugarquill-v0.Q4_K_M.gguf sha256: 790a2f1541011b2773e22aa863ef78c8662baaa7eca5875e9573007985120187 uri: huggingface://QuantFactory/G2-9B-Sugarquill-v0-GGUF/G2-9B-Sugarquill-v0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "volare-i1" + urls: + - https://huggingface.co/MoxoffSpA/Volare + - https://huggingface.co/mradermacher/Volare-i1-GGUF + description: | + Volare is an updated version of Gemma7B, specifically fine-tuned with SFT and LoRA adjustments. + It's trained on publicly available datasets, like SQUAD-it, and datasets we've created in-house. + it's designed to understand and maintain context, making it ideal for Retrieval Augmented Generation (RAG) tasks and applications requiring contextual awareness. + Italian dataset. 
+ overrides: + parameters: + model: Volare.i1-Q4_K_M.gguf + files: + - filename: Volare.i1-Q4_K_M.gguf + sha256: fa8fb9d4cb19fcb44be8d53561c9e2840f45aed738de545983ebb158ebba461b + uri: huggingface://mradermacher/Volare-i1-GGUF/Volare.i1-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From bc5d1f255b34b689da5853ffce5ee300f5393e54 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 1 Dec 2024 10:12:35 +0100 Subject: [PATCH 13/89] chore(model gallery): add skywork-o1-open-llama-3.1-8b (#4297) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 922adf1c..f2f112cc 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3302,6 +3302,29 @@ - filename: Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf sha256: 3fad2c96aa9b9de19c2cda0f88a381c47ac768ca03a95059d9f6c439791f8592 uri: huggingface://bartowski/Llama-3.1-Tulu-3-8B-SFT-GGUF/Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf +- !!merge <<: *llama31 + icon: https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B/resolve/main/misc/misc_fig.jpg + name: "skywork-o1-open-llama-3.1-8b" + urls: + - https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B + - https://huggingface.co/QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF + description: | + We are excited to announce the release of the Skywork o1 Open model series, developed by the Skywork team at Kunlun Inc. This groundbreaking release introduces a series of models that incorporate o1-like slow thinking and reasoning capabilities. The Skywork o1 Open model series includes three advanced models: + + Skywork o1 Open-Llama-3.1-8B: A robust chat model trained on Llama-3.1-8B, enhanced significantly with "o1-style" data to improve reasoning skills. + + Skywork o1 Open-PRM-Qwen-2.5-1.5B: A specialized model designed to enhance reasoning capability through incremental process rewards, ideal for complex problem solving at a smaller scale. + + Skywork o1 Open-PRM-Qwen-2.5-7B: Extends the capabilities of the 1.5B model by scaling up to handle more demanding reasoning tasks, pushing the boundaries of AI reasoning. + + Different from mere reproductions of the OpenAI o1 model, the Skywork o1 Open model series not only exhibits innate thinking, planning, and reflecting capabilities in its outputs, but also shows significant improvements in reasoning skills on standard benchmarks. This series represents a strategic advancement in AI capabilities, moving a previously weaker base model towards the state-of-the-art (SOTA) in reasoning tasks. 
+ overrides: + parameters: + model: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf + files: + - filename: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf + sha256: ef6a203ba585aab14f5d2ec463917a45b3ac571abd89c39e9a96a5e395ea8eea + uri: huggingface://QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF/Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 9c9359fc9662648800b5d76c9f123f8b32b1e54e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 1 Dec 2024 10:12:48 +0100 Subject: [PATCH 14/89] chore(model gallery): add teleut-7b (#4298) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index f2f112cc..bffeb067 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1710,6 +1710,21 @@ - filename: Qwestion-24B.Q4_K_M.gguf sha256: 5d493bd81cfeef66d80101260145ab1d1d0428ef2191edce62b58391bd0fff0e uri: huggingface://mradermacher/Qwestion-24B-GGUF/Qwestion-24B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "teleut-7b" + icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/UqIi8eztdptvt52Mak_1K.png + urls: + - https://huggingface.co/allura-org/Teleut-7b + - https://huggingface.co/QuantFactory/Teleut-7b-GGUF + description: | + A replication attempt of Tulu 3 on the Qwen 2.5 base models. + overrides: + parameters: + model: Teleut-7b.Q4_K_M.gguf + files: + - filename: Teleut-7b.Q4_K_M.gguf + sha256: 844a633ea01d793c638e99f2e07413606b3812b759e9264fbaf69c8d94eaa093 + uri: huggingface://QuantFactory/Teleut-7b-GGUF/Teleut-7b.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 28594336e989a1c61723834fc54da8013f0b1c0f Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 1 Dec 2024 22:44:59 +0100 Subject: [PATCH 15/89] chore: :arrow_up: Update ggerganov/llama.cpp to `5e1ed95583ca552a98d8528b73e1ff81249c2bf9` (#4299) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 435447fa..bbc8a7dd 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=0c39f44d70d058940fe2afe50cfc789e3e44d756 +CPPLLAMA_VERSION?=5e1ed95583ca552a98d8528b73e1ff81249c2bf9 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From e51792784a8add4e8335415e00804d2768bcae94 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 2 Dec 2024 19:13:26 +0100 Subject: [PATCH 16/89] chore(deps): bump grpcio to 1.68.1 (#4301) Signed-off-by: Ettore Di Giacinto --- backend/python/autogptq/requirements.txt | 2 +- backend/python/bark/requirements.txt | 2 +- backend/python/common/template/requirements.txt | 2 +- backend/python/coqui/requirements.txt | 2 +- backend/python/diffusers/requirements.txt | 2 +- backend/python/exllama2/requirements.txt | 2 +- backend/python/mamba/requirements.txt | 2 +- backend/python/openvoice/requirements-intel.txt | 2 +- backend/python/openvoice/requirements.txt | 2 +- backend/python/parler-tts/requirements.txt | 2 +- backend/python/rerankers/requirements.txt | 2 +- backend/python/sentencetransformers/requirements.txt | 2 +- 
backend/python/transformers-musicgen/requirements.txt | 2 +- backend/python/transformers/requirements.txt | 2 +- backend/python/vall-e-x/requirements.txt | 2 +- backend/python/vllm/install.sh | 2 +- backend/python/vllm/requirements.txt | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/backend/python/autogptq/requirements.txt b/backend/python/autogptq/requirements.txt index 2e9d5ad6..22408f47 100644 --- a/backend/python/autogptq/requirements.txt +++ b/backend/python/autogptq/requirements.txt @@ -1,6 +1,6 @@ accelerate auto-gptq==0.7.1 -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi transformers \ No newline at end of file diff --git a/backend/python/bark/requirements.txt b/backend/python/bark/requirements.txt index 1f9e2b35..3fca1de5 100644 --- a/backend/python/bark/requirements.txt +++ b/backend/python/bark/requirements.txt @@ -1,4 +1,4 @@ bark==0.1.5 -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/common/template/requirements.txt b/backend/python/common/template/requirements.txt index e4e07678..893dc812 100644 --- a/backend/python/common/template/requirements.txt +++ b/backend/python/common/template/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf grpcio-tools \ No newline at end of file diff --git a/backend/python/coqui/requirements.txt b/backend/python/coqui/requirements.txt index d3313cc6..57638588 100644 --- a/backend/python/coqui/requirements.txt +++ b/backend/python/coqui/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi packaging==24.1 \ No newline at end of file diff --git a/backend/python/diffusers/requirements.txt b/backend/python/diffusers/requirements.txt index 4c3e703d..71832ead 100644 --- a/backend/python/diffusers/requirements.txt +++ b/backend/python/diffusers/requirements.txt @@ -1,5 +1,5 @@ setuptools -grpcio==1.68.0 +grpcio==1.68.1 pillow protobuf certifi diff --git a/backend/python/exllama2/requirements.txt b/backend/python/exllama2/requirements.txt index 3a74e6e8..408eb318 100644 --- a/backend/python/exllama2/requirements.txt +++ b/backend/python/exllama2/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi wheel diff --git a/backend/python/mamba/requirements.txt b/backend/python/mamba/requirements.txt index 99715d67..8e4eabf1 100644 --- a/backend/python/mamba/requirements.txt +++ b/backend/python/mamba/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/openvoice/requirements-intel.txt b/backend/python/openvoice/requirements-intel.txt index d38351b1..7908a889 100644 --- a/backend/python/openvoice/requirements-intel.txt +++ b/backend/python/openvoice/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -grpcio==1.68.0 +grpcio==1.68.1 protobuf librosa==0.9.1 faster-whisper==0.9.0 diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 57557c88..6806d3e1 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf librosa faster-whisper diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index 1555a2cb..75ea8a59 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 certifi llvmlite==0.43.0 diff 
--git a/backend/python/rerankers/requirements.txt b/backend/python/rerankers/requirements.txt index 99715d67..8e4eabf1 100644 --- a/backend/python/rerankers/requirements.txt +++ b/backend/python/rerankers/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements.txt b/backend/python/sentencetransformers/requirements.txt index b39ef126..b9dacf9b 100644 --- a/backend/python/sentencetransformers/requirements.txt +++ b/backend/python/sentencetransformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi datasets diff --git a/backend/python/transformers-musicgen/requirements.txt b/backend/python/transformers-musicgen/requirements.txt index 0a234d10..2e46b08f 100644 --- a/backend/python/transformers-musicgen/requirements.txt +++ b/backend/python/transformers-musicgen/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf scipy==1.14.0 certifi \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index 450071cf..b556b9f1 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 99715d67..8e4eabf1 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 98e2e57e..0183a928 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then git clone https://github.com/vllm-project/vllm fi pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.0 protobuf bitsandbytes + uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu VLLM_TARGET_DEVICE=cpu python setup.py install popd diff --git a/backend/python/vllm/requirements.txt b/backend/python/vllm/requirements.txt index b5872310..d981fd99 100644 --- a/backend/python/vllm/requirements.txt +++ b/backend/python/vllm/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi setuptools \ No newline at end of file From 45b91d501e80eb37d8b2e8097e219c4496b84574 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 2 Dec 2024 22:47:37 +0000 Subject: [PATCH 17/89] chore(deps): Bump docs/themes/hugo-theme-relearn from `28fce6b` to `be85052` (#4305) chore(deps): Bump docs/themes/hugo-theme-relearn Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `28fce6b` to `be85052`. 
- [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases) - [Commits](https://github.com/McShelby/hugo-theme-relearn/compare/28fce6b04c414523280c53ee02f9f3a94d9d23da...be85052efea3a0aaef45ecb0126d390c1bbac760) --- updated-dependencies: - dependency-name: docs/themes/hugo-theme-relearn dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/themes/hugo-theme-relearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/themes/hugo-theme-relearn b/docs/themes/hugo-theme-relearn index 28fce6b0..be85052e 160000 --- a/docs/themes/hugo-theme-relearn +++ b/docs/themes/hugo-theme-relearn @@ -1 +1 @@ -Subproject commit 28fce6b04c414523280c53ee02f9f3a94d9d23da +Subproject commit be85052efea3a0aaef45ecb0126d390c1bbac760 From 5f339629327c5ae0995021f548e842426fa2a909 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Tue, 3 Dec 2024 09:20:59 +0100 Subject: [PATCH 18/89] chore: :arrow_up: Update ggerganov/llama.cpp to `8648c521010620c2daccfa1d26015c668ba2c717` (#4307) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bbc8a7dd..a96b9c82 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=5e1ed95583ca552a98d8528b73e1ff81249c2bf9 +CPPLLAMA_VERSION?=8648c521010620c2daccfa1d26015c668ba2c717 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 7b70f0543b0fb401552a37b7e56cb17f2225a441 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 3 Dec 2024 18:55:49 +0100 Subject: [PATCH 19/89] chore(model gallery): add sparse-llama-3.1-8b-2of4 (#4309) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index bffeb067..3e3c4e3e 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3340,6 +3340,20 @@ - filename: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf sha256: ef6a203ba585aab14f5d2ec463917a45b3ac571abd89c39e9a96a5e395ea8eea uri: huggingface://QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF/Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "sparse-llama-3.1-8b-2of4" + urls: + - https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF + - https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF + description: | + This is the 2:4 sparse version of Llama-3.1-8B. On the OpenLLM benchmark (version 1), it achieves an average score of 62.16, compared to 63.19 for the dense model—demonstrating a 98.37% accuracy recovery. On the Mosaic Eval Gauntlet benchmark (version v0.3), it achieves an average score of 53.85, versus 55.34 for the dense model—representing a 97.3% accuracy recovery. 
+ overrides: + parameters: + model: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf + files: + - filename: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf + sha256: c481e7089ffaedd5ae8c74dccc7fb45f6509640b661fa086ae979f6fefc3fdba + uri: huggingface://QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF/Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 236a60bab8f4607646a8d4768baaf2712430c344 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 3 Dec 2024 18:56:03 +0100 Subject: [PATCH 20/89] chore(model gallery): add qwen2.5-7b-homercreative-mix (#4310) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 3e3c4e3e..a2381f3b 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1725,6 +1725,31 @@ - filename: Teleut-7b.Q4_K_M.gguf sha256: 844a633ea01d793c638e99f2e07413606b3812b759e9264fbaf69c8d94eaa093 uri: huggingface://QuantFactory/Teleut-7b-GGUF/Teleut-7b.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-7b-homercreative-mix" + urls: + - https://huggingface.co/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix + - https://huggingface.co/QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF + description: | + ZeroXClem/Qwen2.5-7B-HomerCreative-Mix is an advanced language model meticulously crafted by merging four pre-trained models using the powerful mergekit framework. This fusion leverages the Model Stock merge method to combine the creative prowess of Qandora, the instructive capabilities of Qwen-Instruct-Fusion, the sophisticated blending of HomerSlerp1, and the foundational conversational strengths of Homer-v0.5-Qwen2.5-7B. The resulting model excels in creative text generation, contextual understanding, and dynamic conversational interactions. + 🚀 Merged Models + + This model merge incorporates the following: + + bunnycore/Qandora-2.5-7B-Creative: Specializes in creative text generation, enhancing the model's ability to produce imaginative and diverse content. + + bunnycore/Qwen2.5-7B-Instruct-Fusion: Focuses on instruction-following capabilities, improving the model's performance in understanding and executing user commands. + + allknowingroger/HomerSlerp1-7B: Utilizes spherical linear interpolation (SLERP) to blend model weights smoothly, ensuring a harmonious integration of different model attributes. + + newsbang/Homer-v0.5-Qwen2.5-7B: Acts as the foundational conversational model, providing robust language comprehension and generation capabilities. 
+ overrides: + parameters: + model: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf + files: + - filename: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf + sha256: fc3fdb41e068646592f89a8ae62d7b330f2bd4e97bf615aef2977930977c8ba5 + uri: huggingface://QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF/Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 074b52bbfeab1cd252fd1debf503c81be3e4046d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 3 Dec 2024 18:56:53 +0100 Subject: [PATCH 21/89] chore(model gallery): add bggpt-gemma-2-2.6b-it-v1.0 (#4311) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index a2381f3b..fd0e27e8 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4944,6 +4944,22 @@ - filename: Volare.i1-Q4_K_M.gguf sha256: fa8fb9d4cb19fcb44be8d53561c9e2840f45aed738de545983ebb158ebba461b uri: huggingface://mradermacher/Volare-i1-GGUF/Volare.i1-Q4_K_M.gguf +- !!merge <<: *gemma + name: "bggpt-gemma-2-2.6b-it-v1.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/637e1f8cf7e01589cc17bf7e/p6d0YFHjWCQ3S12jWqO1m.png + urls: + - https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF + - https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF + description: | + INSAIT introduces BgGPT-Gemma-2-2.6B-IT-v1.0, a state-of-the-art Bulgarian language model based on google/gemma-2-2b and google/gemma-2-2b-it. BgGPT-Gemma-2-2.6B-IT-v1.0 is free to use and distributed under the Gemma Terms of Use. This model was created by INSAIT, part of Sofia University St. Kliment Ohridski, in Sofia, Bulgaria. + The model was built on top of Google’s Gemma 2 2B open models. It was continuously pre-trained on around 100 billion tokens (85 billion in Bulgarian) using the Branch-and-Merge strategy INSAIT presented at EMNLP’24, allowing the model to gain outstanding Bulgarian cultural and linguistic capabilities while retaining its English performance. During the pre-training stage, we use various datasets, including Bulgarian web crawl data, freely available datasets such as Wikipedia, a range of specialized Bulgarian datasets sourced by the INSAIT Institute, and machine translations of popular English datasets. The model was then instruction-fine-tuned on a newly constructed Bulgarian instruction dataset created using real-world conversations. For more information check our blogpost. 
+ overrides: + parameters: + model: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf + files: + - filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf + sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319 + uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From 44a5dac31223ee1a49800acb11b798e71bc3e0b1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 3 Dec 2024 22:41:22 +0100 Subject: [PATCH 22/89] feat(backend): add stablediffusion-ggml (#4289) * feat(backend): add stablediffusion-ggml Signed-off-by: Ettore Di Giacinto * chore(ci): track stablediffusion-ggml Signed-off-by: Ettore Di Giacinto * fixups Signed-off-by: Ettore Di Giacinto * Use default scheduler and sampler if not specified Signed-off-by: Ettore Di Giacinto * fixups Signed-off-by: Ettore Di Giacinto * Move cfg scale out of diffusers block Signed-off-by: Ettore Di Giacinto * Make it working Signed-off-by: Ettore Di Giacinto * fix: set free_params_immediately to false to call the model in sequence https://github.com/leejet/stable-diffusion.cpp/issues/366 Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .github/workflows/bump_deps.yaml | 3 + Makefile | 45 +++- backend/backend.proto | 2 + .../go/image/stablediffusion-ggml/Makefile | 21 ++ .../go/image/stablediffusion-ggml/gosd.cpp | 228 ++++++++++++++++++ backend/go/image/stablediffusion-ggml/gosd.go | 96 ++++++++ backend/go/image/stablediffusion-ggml/gosd.h | 8 + backend/go/image/stablediffusion-ggml/main.go | 20 ++ core/backend/options.go | 3 +- core/config/backend_config.go | 23 +- .../content/docs/features/image-generation.md | 6 +- gallery/flux.yaml | 3 +- 12 files changed, 437 insertions(+), 21 deletions(-) create mode 100644 backend/go/image/stablediffusion-ggml/Makefile create mode 100644 backend/go/image/stablediffusion-ggml/gosd.cpp create mode 100644 backend/go/image/stablediffusion-ggml/gosd.go create mode 100644 backend/go/image/stablediffusion-ggml/gosd.h create mode 100644 backend/go/image/stablediffusion-ggml/main.go diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 8f30f1a0..092110df 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -18,6 +18,9 @@ jobs: - repository: "PABannier/bark.cpp" variable: "BARKCPP_VERSION" branch: "main" + - repository: "leejet/stable-diffusion.cpp" + variable: "STABLEDIFFUSION_GGML_VERSION" + branch: "master" - repository: "mudler/go-stable-diffusion" variable: "STABLEDIFFUSION_VERSION" branch: "master" diff --git a/Makefile b/Makefile index a96b9c82..d85f44e7 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,10 @@ TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057 BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git BARKCPP_VERSION?=v1.0.0 +# stablediffusion.cpp (ggml) +STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp +STABLEDIFFUSION_GGML_VERSION?=4570715727f35e5a07a76796d823824c8f42206c + ONNX_VERSION?=1.20.0 ONNX_ARCH?=x64 ONNX_OS?=linux @@ -209,6 +213,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper ifeq ($(ONNX_OS),linux) ifeq ($(ONNX_ARCH),x64) ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp + ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml endif endif @@ -244,15 +249,19 @@ sources/go-llama.cpp: git checkout 
$(GOLLAMA_VERSION) && \ git submodule update --init --recursive --depth 1 --single-branch +sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp + $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a + +## bark.cpp sources/bark.cpp: - git clone --recursive https://github.com/PABannier/bark.cpp.git sources/bark.cpp && \ + git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \ cd sources/bark.cpp && \ git checkout $(BARKCPP_VERSION) && \ git submodule update --init --recursive --depth 1 --single-branch sources/bark.cpp/build/libbark.a: sources/bark.cpp cd sources/bark.cpp && \ - mkdir build && \ + mkdir -p build && \ cd build && \ cmake $(CMAKE_ARGS) .. && \ cmake --build . --config Release @@ -260,9 +269,6 @@ sources/bark.cpp/build/libbark.a: sources/bark.cpp backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a $(MAKE) -C backend/go/bark libbark.a -sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp - $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a - ## go-piper sources/go-piper: mkdir -p sources/go-piper @@ -276,7 +282,7 @@ sources/go-piper: sources/go-piper/libpiper_binding.a: sources/go-piper $(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o -## stable diffusion +## stable diffusion (onnx) sources/go-stable-diffusion: mkdir -p sources/go-stable-diffusion cd sources/go-stable-diffusion && \ @@ -289,6 +295,30 @@ sources/go-stable-diffusion: sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a +## stablediffusion (ggml) +sources/stablediffusion-ggml.cpp: + git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \ + cd sources/stablediffusion-ggml.cpp && \ + git checkout $(STABLEDIFFUSION_GGML_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp + cd sources/stablediffusion-ggml.cpp && \ + mkdir -p build && \ + cd build && \ + cmake $(CMAKE_ARGS) .. && \ + cmake --build . 
--config Release + +backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a + $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a + +backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc + CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \ + $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/ +ifneq ($(UPX),) + $(UPX) backend-assets/grpc/stablediffusion-ggml +endif + sources/onnxruntime: mkdir -p sources/onnxruntime curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz @@ -329,7 +359,7 @@ sources/whisper.cpp: sources/whisper.cpp/libwhisper.a: sources/whisper.cpp cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a -get-sources: sources/go-llama.cpp sources/go-piper sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp +get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp replace: $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp @@ -372,6 +402,7 @@ clean: ## Remove build related file $(MAKE) -C backend/cpp/grpc clean $(MAKE) -C backend/go/bark clean $(MAKE) -C backend/cpp/llama clean + $(MAKE) -C backend/go/image/stablediffusion-ggml clean rm -rf backend/cpp/llama-* || true $(MAKE) dropreplace $(MAKE) protogen-clean diff --git a/backend/backend.proto b/backend/backend.proto index d6e8f236..48b0101b 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -240,6 +240,8 @@ message ModelOptions { repeated string LoraAdapters = 60; repeated float LoraScales = 61; + + repeated string Options = 62; } message Result { diff --git a/backend/go/image/stablediffusion-ggml/Makefile b/backend/go/image/stablediffusion-ggml/Makefile new file mode 100644 index 00000000..cca9bf6e --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/Makefile @@ -0,0 +1,21 @@ +INCLUDE_PATH := $(abspath ./) +LIBRARY_PATH := $(abspath ./) + +AR?=ar + +BUILD_TYPE?= +# keep standard at C11 and C++11 +CXXFLAGS = -I. 
-I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC + +# warnings +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function + +gosd.o: + $(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c + +libsd.a: gosd.o + cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a + $(AR) rcs libsd.a gosd.o + +clean: + rm -f gosd.o libsd.a \ No newline at end of file diff --git a/backend/go/image/stablediffusion-ggml/gosd.cpp b/backend/go/image/stablediffusion-ggml/gosd.cpp new file mode 100644 index 00000000..8653aa1e --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/gosd.cpp @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include +#include +#include "gosd.h" + +// #include "preprocessing.hpp" +#include "flux.hpp" +#include "stable-diffusion.h" + +#define STB_IMAGE_IMPLEMENTATION +#define STB_IMAGE_STATIC +#include "stb_image.h" + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#define STB_IMAGE_WRITE_STATIC +#include "stb_image_write.h" + +#define STB_IMAGE_RESIZE_IMPLEMENTATION +#define STB_IMAGE_RESIZE_STATIC +#include "stb_image_resize.h" + +// Names of the sampler method, same order as enum sample_method in stable-diffusion.h +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "ipndm", + "ipndm_v", + "lcm", +}; + +// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h +const char* schedule_str[] = { + "default", + "discrete", + "karras", + "exponential", + "ays", + "gits", +}; + +sd_ctx_t* sd_c; + +sample_method_t sample_method; + +int load_model(char *model, char* options[], int threads, int diff) { + fprintf (stderr, "Loading model!\n"); + + char *stableDiffusionModel = ""; + if (diff == 1 ) { + stableDiffusionModel = model; + model = ""; + } + + // decode options. Options are in form optname:optvale, or if booleans only optname. + char *clip_l_path = ""; + char *clip_g_path = ""; + char *t5xxl_path = ""; + char *vae_path = ""; + char *scheduler = ""; + char *sampler = ""; + + // If options is not NULL, parse options + for (int i = 0; options[i] != NULL; i++) { + char *optname = strtok(options[i], ":"); + char *optval = strtok(NULL, ":"); + if (optval == NULL) { + optval = "true"; + } + + if (!strcmp(optname, "clip_l_path")) { + clip_l_path = optval; + } + if (!strcmp(optname, "clip_g_path")) { + clip_g_path = optval; + } + if (!strcmp(optname, "t5xxl_path")) { + t5xxl_path = optval; + } + if (!strcmp(optname, "vae_path")) { + vae_path = optval; + } + if (!strcmp(optname, "scheduler")) { + scheduler = optval; + } + if (!strcmp(optname, "sampler")) { + sampler = optval; + } + } + + int sample_method_found = -1; + for (int m = 0; m < N_SAMPLE_METHODS; m++) { + if (!strcmp(sampler, sample_method_str[m])) { + sample_method_found = m; + } + } + if (sample_method_found == -1) { + fprintf(stderr, "Invalid sample method, default to EULER_A!\n"); + sample_method_found = EULER_A; + } + sample_method = (sample_method_t)sample_method_found; + + int schedule_found = -1; + for (int d = 0; d < N_SCHEDULES; d++) { + if (!strcmp(scheduler, schedule_str[d])) { + schedule_found = d; + fprintf (stderr, "Found scheduler: %s\n", scheduler); + + } + } + + if (schedule_found == -1) { + fprintf (stderr, "Invalid scheduler! 
using DEFAULT\n"); + schedule_found = DEFAULT; + } + + schedule_t schedule = (schedule_t)schedule_found; + + fprintf (stderr, "Creating context\n"); + sd_ctx_t* sd_ctx = new_sd_ctx(model, + clip_l_path, + clip_g_path, + t5xxl_path, + stableDiffusionModel, + vae_path, + "", + "", + "", + "", + "", + false, + false, + false, + threads, + SD_TYPE_COUNT, + STD_DEFAULT_RNG, + schedule, + false, + false, + false, + false); + + if (sd_ctx == NULL) { + fprintf (stderr, "failed loading model (generic error)\n"); + return 1; + } + fprintf (stderr, "Created context: OK\n"); + + sd_c = sd_ctx; + + return 0; +} + +int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) { + + sd_image_t* results; + + std::vector skip_layers = {7, 8, 9}; + + fprintf (stderr, "Generating image\n"); + + results = txt2img(sd_c, + text, + negativeText, + -1, //clip_skip + cfg_scale, // sfg_scale + 3.5f, + width, + height, + sample_method, + steps, + seed, + 1, + NULL, + 0.9f, + 20.f, + false, + "", + skip_layers.data(), + skip_layers.size(), + 0, + 0.01, + 0.2); + + if (results == NULL) { + fprintf (stderr, "NO results\n"); + return 1; + } + + if (results[0].data == NULL) { + fprintf (stderr, "Results with no data\n"); + return 1; + } + + fprintf (stderr, "Writing PNG\n"); + + fprintf (stderr, "DST: %s\n", dst); + fprintf (stderr, "Width: %d\n", results[0].width); + fprintf (stderr, "Height: %d\n", results[0].height); + fprintf (stderr, "Channel: %d\n", results[0].channel); + fprintf (stderr, "Data: %p\n", results[0].data); + + stbi_write_png(dst, results[0].width, results[0].height, results[0].channel, + results[0].data, 0, NULL); + fprintf (stderr, "Saved resulting image to '%s'\n", dst); + + // TODO: free results. Why does it crash? 
+ + free(results[0].data); + results[0].data = NULL; + free(results); + fprintf (stderr, "gen_image is done", dst); + + return 0; +} + +int unload() { + free_sd_ctx(sd_c); +} + diff --git a/backend/go/image/stablediffusion-ggml/gosd.go b/backend/go/image/stablediffusion-ggml/gosd.go new file mode 100644 index 00000000..29d0033d --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/gosd.go @@ -0,0 +1,96 @@ +package main + +// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include +// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp +// #include +// #include +import "C" + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "unsafe" + + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/utils" +) + +type SDGGML struct { + base.SingleThread + threads int + sampleMethod string + cfgScale float32 +} + +func (sd *SDGGML) Load(opts *pb.ModelOptions) error { + + sd.threads = int(opts.Threads) + + modelFile := C.CString(opts.ModelFile) + defer C.free(unsafe.Pointer(modelFile)) + + var options **C.char + // prepare the options array to pass to C + + size := C.size_t(unsafe.Sizeof((*C.char)(nil))) + length := C.size_t(len(opts.Options)) + options = (**C.char)(C.malloc(length * size)) + view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)] + + var diffusionModel int + + var oo []string + for _, op := range opts.Options { + if op == "diffusion_model" { + diffusionModel = 1 + continue + } + + // If it's an option path, we resolve absolute path from the model path + if strings.Contains(op, ":") && strings.Contains(op, "path") { + data := strings.Split(op, ":") + data[1] = filepath.Join(opts.ModelPath, data[1]) + if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil { + oo = append(oo, strings.Join(data, ":")) + } + } else { + oo = append(oo, op) + } + } + + fmt.Fprintf(os.Stderr, "Options: %+v\n", oo) + + for i, x := range oo { + view[i] = C.CString(x) + } + + sd.cfgScale = opts.CFGScale + + ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel)) + if ret != 0 { + return fmt.Errorf("could not load model") + } + + return nil +} + +func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error { + t := C.CString(opts.PositivePrompt) + defer C.free(unsafe.Pointer(t)) + + dst := C.CString(opts.Dst) + defer C.free(unsafe.Pointer(dst)) + + negative := C.CString(opts.NegativePrompt) + defer C.free(unsafe.Pointer(negative)) + + ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale)) + if ret != 0 { + return fmt.Errorf("inference failed") + } + + return nil +} diff --git a/backend/go/image/stablediffusion-ggml/gosd.h b/backend/go/image/stablediffusion-ggml/gosd.h new file mode 100644 index 00000000..5297e871 --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/gosd.h @@ -0,0 +1,8 @@ +#ifdef __cplusplus +extern "C" { +#endif +int load_model(char *model, char* options[], int threads, int diffusionModel); +int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale); +#ifdef 
__cplusplus +} +#endif \ No newline at end of file diff --git a/backend/go/image/stablediffusion-ggml/main.go b/backend/go/image/stablediffusion-ggml/main.go new file mode 100644 index 00000000..acee74fa --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/main.go @@ -0,0 +1,20 @@ +package main + +// Note: this is started internally by LocalAI and a server is allocated for each model +import ( + "flag" + + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +func main() { + flag.Parse() + + if err := grpc.StartServer(*addr, &SDGGML{}); err != nil { + panic(err) + } +} diff --git a/core/backend/options.go b/core/backend/options.go index c6591222..1f88122f 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -122,7 +122,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { CUDA: c.CUDA || c.Diffusers.CUDA, SchedulerType: c.Diffusers.SchedulerType, PipelineType: c.Diffusers.PipelineType, - CFGScale: c.Diffusers.CFGScale, + CFGScale: c.CFGScale, LoraAdapter: c.LoraAdapter, LoraScale: c.LoraScale, LoraAdapters: c.LoraAdapters, @@ -132,6 +132,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { IMG2IMG: c.Diffusers.IMG2IMG, CLIPModel: c.Diffusers.ClipModel, CLIPSubfolder: c.Diffusers.ClipSubFolder, + Options: c.Options, CLIPSkip: int32(c.Diffusers.ClipSkip), ControlNet: c.Diffusers.ControlNet, ContextSize: int32(ctxSize), diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 998f22a3..1de540f9 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -72,6 +72,8 @@ type BackendConfig struct { Description string `yaml:"description"` Usage string `yaml:"usage"` + + Options []string `yaml:"options"` } type File struct { @@ -97,16 +99,15 @@ type GRPC struct { } type Diffusers struct { - CUDA bool `yaml:"cuda"` - PipelineType string `yaml:"pipeline_type"` - SchedulerType string `yaml:"scheduler_type"` - EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify - CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale - IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser - ClipSkip int `yaml:"clip_skip"` // Skip every N frames - ClipModel string `yaml:"clip_model"` // Clip model to use - ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model - ControlNet string `yaml:"control_net"` + CUDA bool `yaml:"cuda"` + PipelineType string `yaml:"pipeline_type"` + SchedulerType string `yaml:"scheduler_type"` + EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify + IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser + ClipSkip int `yaml:"clip_skip"` // Skip every N frames + ClipModel string `yaml:"clip_model"` // Clip model to use + ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model + ControlNet string `yaml:"control_net"` } // LLMConfig is a struct that holds the configuration that are @@ -164,6 +165,8 @@ type LLMConfig struct { YarnAttnFactor float32 `yaml:"yarn_attn_factor"` YarnBetaFast float32 `yaml:"yarn_beta_fast"` YarnBetaSlow float32 `yaml:"yarn_beta_slow"` + + CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale } // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend diff --git a/docs/content/docs/features/image-generation.md b/docs/content/docs/features/image-generation.md index 5bd12575..864ea040 100644 
--- a/docs/content/docs/features/image-generation.md +++ b/docs/content/docs/features/image-generation.md @@ -194,8 +194,9 @@ diffusers: pipeline_type: StableDiffusionPipeline enable_parameters: "negative_prompt,num_inference_steps,clip_skip" scheduler_type: "k_dpmpp_sde" - cfg_scale: 8 clip_skip: 11 + +cfg_scale: 8 ``` #### Configuration parameters @@ -302,7 +303,8 @@ cuda: true diffusers: pipeline_type: StableDiffusionDepth2ImgPipeline enable_parameters: "negative_prompt,num_inference_steps,image" - cfg_scale: 6 + +cfg_scale: 6 ``` ```bash diff --git a/gallery/flux.yaml b/gallery/flux.yaml index bb75b53b..a859d801 100644 --- a/gallery/flux.yaml +++ b/gallery/flux.yaml @@ -11,4 +11,5 @@ config_file: | cuda: true enable_parameters: num_inference_steps pipeline_type: FluxPipeline - cfg_scale: 0 + + cfg_scale: 0 From feb54e65c2bfb0d24813effc51a6d5c6875072bd Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Wed, 4 Dec 2024 00:53:11 +0100 Subject: [PATCH 23/89] chore: :arrow_up: Update ggerganov/llama.cpp to `cc98896db858df7aa40d0e16a505883ef196a482` (#4312) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d85f44e7..dad02937 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=8648c521010620c2daccfa1d26015c668ba2c717 +CPPLLAMA_VERSION?=cc98896db858df7aa40d0e16a505883ef196a482 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From cc04b62d3a6b335786337ab0862baad19c9e0c11 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 09:15:35 +0100 Subject: [PATCH 24/89] chore(model gallery): add cybercore-qwen-2.1-7b (#4314) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index fd0e27e8..634b5859 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1750,6 +1750,23 @@ - filename: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf sha256: fc3fdb41e068646592f89a8ae62d7b330f2bd4e97bf615aef2977930977c8ba5 uri: huggingface://QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF/Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "cybercore-qwen-2.1-7b" + urls: + - https://huggingface.co/bunnycore/CyberCore-Qwen-2.1-7B + - https://huggingface.co/QuantFactory/CyberCore-Qwen-2.1-7B-GGUF + description: | + This model was merged using the TIES merge method using rombodawg/Rombos-LLM-V2.5-Qwen-7b as a base. 
+ Models Merged + fblgit/cybertron-v4-qw7B-UNAMGS + bunnycore/Qwen-2.1-7b-Persona-lora_model + fblgit/cybertron-v4-qw7B-MGS + bunnycore/Qwen-2.1-7b-Persona-lora_model + overrides: + parameters: + model: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf + files: + - filename: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf + sha256: 726042707a4cec29ca0355b4dc7c53a807b307d08aa8a3d4a9e76aefbbbcaadf + uri: huggingface://QuantFactory/CyberCore-Qwen-2.1-7B-GGUF/CyberCore-Qwen-2.1-7B.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From dc04a4386801d7e8406367dae51ce720f4108a7d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 09:15:47 +0100 Subject: [PATCH 25/89] chore(model gallery): add chatty-harry_v3.0 (#4315) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 634b5859..756c6bb0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4189,6 +4189,23 @@ - filename: magnum-12b-v2.5-kto.i1-Q4_K_M.gguf sha256: 07e91d2c6d4e42312e65a69c54f16be467575f7a596fe052993b388e38b90d76 uri: huggingface://mradermacher/magnum-12b-v2.5-kto-i1-GGUF/magnum-12b-v2.5-kto.i1-Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "chatty-harry_v3.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/66c1cc08453a7ef6c5fe657a/0KzNTEtn2kJJQsw4lQeY0.png + urls: + - https://huggingface.co/Triangle104/Chatty-Harry_V3.0 + - https://huggingface.co/QuantFactory/Chatty-Harry_V3.0-GGUF + description: | + This model was merged using the TIES merge method using Triangle104/ChatWaifu_Magnum_V0.2 as a base. + The following models were included in the merge: elinas/Chronos-Gold-12B-1.0 + overrides: + parameters: + model: Chatty-Harry_V3.0.Q4_K_M.gguf + files: + - filename: Chatty-Harry_V3.0.Q4_K_M.gguf + sha256: 54b63bb74498576ca77b801ed096657a93cc2f6b71d707c3605fdb394bd3e622 + uri: huggingface://QuantFactory/Chatty-Harry_V3.0-GGUF/Chatty-Harry_V3.0.Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From 50f71f73d75b3603c4dfc0fd7d98ec98bf6ce456 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 09:17:23 +0100 Subject: [PATCH 26/89] chore(model gallery): add homercreativeanvita-mix-qw7b (#4316) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 756c6bb0..5d71bf0d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1767,6 +1767,30 @@ - filename: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf sha256: 726042707a4cec29ca0355b4dc7c53a807b307d08aa8a3d4a9e76aefbbbcaadf uri: huggingface://QuantFactory/CyberCore-Qwen-2.1-7B-GGUF/CyberCore-Qwen-2.1-7B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "homercreativeanvita-mix-qw7b" + icon: https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B/resolve/main/HomerCreativeAnvita.jpeg + urls: + - https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B + - https://huggingface.co/QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF + description: | + This model is currently ranked #1 on the Open LLM Leaderboard among models up to 13B parameters! + Merge Method + + This model was merged using the SLERP merge method. 
+ Models Merged + + The following models were included in the merge: + + ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix + ZeroXClem/Qwen2.5-7B-HomerCreative-Mix + overrides: + parameters: + model: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf + files: + - filename: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf + sha256: a356f279a104bff0bbc2ef7ec136c1e774153de8893bf988083e96fb7f4bc053 + uri: huggingface://QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF/HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 4307ae5d5247fa6fcdbfa81da5a10f6e78de906e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 09:26:51 +0100 Subject: [PATCH 27/89] chore(model gallery): add flux.1-dev-ggml (#4317) Signed-off-by: Ettore Di Giacinto --- gallery/flux-ggml.yaml | 12 ++++++++++++ gallery/index.yaml | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 gallery/flux-ggml.yaml diff --git a/gallery/flux-ggml.yaml b/gallery/flux-ggml.yaml new file mode 100644 index 00000000..5738d584 --- /dev/null +++ b/gallery/flux-ggml.yaml @@ -0,0 +1,12 @@ +--- +name: "flux-ggml" + +config_file: | + backend: stablediffusion-ggml + step: 25 + options: + - "diffusion_model" + - "clip_l_path:clip_l.safetensors" + - "t5xxl_path:t5xxl_fp16.safetensors" + - "vae_path:ae.safetensors" + - "sampler:euler" diff --git a/gallery/index.yaml b/gallery/index.yaml index 5d71bf0d..88d09d6c 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -8906,6 +8906,43 @@ overrides: parameters: model: black-forest-labs/FLUX.1-schnell +- name: flux.1-dev-ggml + license: flux-1-dev-non-commercial-license + url: "github:mudler/LocalAI/gallery/flux-ggml.yaml@master" + description: | + FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our blog post. + Key Features + Cutting-edge output quality, second only to our state-of-the-art model FLUX.1 [pro]. + Competitive prompt following, matching the performance of closed source alternatives . + Trained using guidance distillation, making FLUX.1 [dev] more efficient. + Open weights to drive new scientific research, and empower artists to develop innovative workflows. + Generated outputs can be used for personal, scientific, and commercial purposes as described in the flux-1-dev-non-commercial-license. 
+ This model is quantized with GGUF + urls: + - https://huggingface.co/black-forest-labs/FLUX.1-dev + - https://huggingface.co/city96/FLUX.1-dev-gguf + tags: + - text-to-image + - flux + - gpu + - cpu + icon: https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/schnell_grid.jpeg + overrides: + parameters: + model: flux1-dev-Q2_K.gguf + files: + - filename: "flux1-dev-Q2_K.gguf" + sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105" + uri: "huggingface://city96/FLUX.1-dev-gguf/flux1-dev-Q2_K.gguf" + - filename: ae.safetensors + sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38 + uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors + - filename: clip_l.safetensors + sha256: 660c6f5b1abae9dc498ac2d21e1347d2abdb0cf6c0c0c8576cd796491d9a6cdd + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors + - filename: t5xxl_fp16.safetensors + sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635 + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors - &whisper ## Whisper url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" From 3c0ac49d902798edb963e0387a0373af09491b2d Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 09:32:42 +0100 Subject: [PATCH 28/89] chore(model gallery): add bark-cpp-small (#4318) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 88d09d6c..02a58d22 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -9728,3 +9728,22 @@ - filename: silero-vad.onnx uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808 +- name: "bark-cpp-small" + url: github:mudler/LocalAI/gallery/virtual.yaml@master + license: mit + urls: + - https://huggingface.co/suno/bark + - https://huggingface.co/Green-Sky/bark-ggml + description: | + Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints ready for inference. 
+ tags: + - tts + - cpu + overrides: + backend: bark-cpp + parameters: + model: bark-small_weights-f16.bin + files: + - filename: bark-small_weights-f16.bin + uri: https://huggingface.co/Green-Sky/bark-ggml/resolve/main/bark-small_weights-f16.bin + sha256: de1ece17e8319537b3a7909baebbd28affab23c942d5d57e648d622af4e2feaa From cf4f024420e5976a21eb6754b1e72fb12ff1c853 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 11:31:08 +0100 Subject: [PATCH 29/89] Update README.md Signed-off-by: Ettore Di Giacinto --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2fd89863..ef950bf1 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,8 @@ local-ai run oci://localai/phi-2:latest ## 📰 Latest project news +- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 ) +- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 ) - Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204 - Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples) - Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io) From 87b7648591573ea59c090b1095ba3073623933ad Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 4 Dec 2024 18:35:54 +0100 Subject: [PATCH 30/89] chore(ci): set auto-labeler for dependencies Signed-off-by: Ettore Di Giacinto --- .github/labeler.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index 687a90d1..23e64566 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,6 +1,11 @@ enhancements: - head-branch: ['^feature', 'feature'] +dependencies: +- any: + - changed-files: + - any-glob-to-any-file: 'Makefile' + kind/documentation: - any: - changed-files: From 4a079f893ca28217c86b9659a70d7ad33b8bf6fa Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:19:35 +0100 Subject: [PATCH 31/89] chore: :arrow_up: Update ggerganov/llama.cpp to `59f4db10883a4f3e855cffbf2c3ab68430e95272` (#4319) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dad02937..3a4a2d3e 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=cc98896db858df7aa40d0e16a505883ef196a482 +CPPLLAMA_VERSION?=59f4db10883a4f3e855cffbf2c3ab68430e95272 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From c2261495038617fecc3b9cd8431a3478ff2bc9b8 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 5 Dec 2024 09:09:11 +0100 Subject: [PATCH 32/89] chore: :arrow_up: Update leejet/stable-diffusion.cpp to `9578fdcc4632dc3de5565f28e2fb16b7c18f8d48` (#4320) :arrow_up: Update leejet/stable-diffusion.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3a4a2d3e..c1a6cbcd 100644 --- a/Makefile +++ b/Makefile 
@@ -32,7 +32,7 @@ BARKCPP_VERSION?=v1.0.0 # stablediffusion.cpp (ggml) STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp -STABLEDIFFUSION_GGML_VERSION?=4570715727f35e5a07a76796d823824c8f42206c +STABLEDIFFUSION_GGML_VERSION?=9578fdcc4632dc3de5565f28e2fb16b7c18f8d48 ONNX_VERSION?=1.20.0 ONNX_ARCH?=x64 From ab0f8648a33aa0c4453d67d989b530dab6fae477 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 5 Dec 2024 10:01:49 +0100 Subject: [PATCH 33/89] chore(model gallery): add rp-naughty-v1.0c-8b (#4322) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 02a58d22..b1269741 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -7891,6 +7891,26 @@ - filename: Meta-Llama-3-8B-Instruct-exp5-11-Q4_K_M.gguf sha256: 5dd81b8b809667d10036499affdd1461cf95af50b405cbc9f800b421a4b60e98 uri: huggingface://DavidAU/Meta-Llama-3-Instruct-8.9B-BRAINSTORM-5x-FORM-11-GGUF/Meta-Llama-3-8B-Instruct-exp5-11-Q4_K_M.gguf +- !!merge <<: *llama3 + name: "rp-naughty-v1.0c-8b" + urls: + - https://huggingface.co/QuantFactory/RP-Naughty-v1.0c-8b-GGUF + description: | + This model was merged using the Model Stock merge method using aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K as a base. + The following models were included in the merge: + + underwoods/adventure-8b + Khetterman/Multilingual-SaigaSuzume-8B + underwoods/writer-8b + Khetterman/Kosmos-8B-v1 + Khetterman/CursedMatrix-8B-v9 + overrides: + parameters: + model: RP-Naughty-v1.0c-8b.Q4_K_M.gguf + files: + - filename: RP-Naughty-v1.0c-8b.Q4_K_M.gguf + sha256: c344564d26d0c3d244d31cfeb103666eab37f9dee6678a2dbaf5bfcf4109d789 + uri: huggingface://QuantFactory/RP-Naughty-v1.0c-8b-GGUF/RP-Naughty-v1.0c-8b.Q4_K_M.gguf - &command-R ### START Command-r url: "github:mudler/LocalAI/gallery/command-r.yaml@master" From be907d993f95f1144271d2847e3c176314d00c68 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 5 Dec 2024 10:02:02 +0100 Subject: [PATCH 34/89] chore(model gallery): add loki-v2.6-8b-1024k (#4321) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 133 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b1269741..cc40048c 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3420,6 +3420,139 @@ - filename: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf sha256: c481e7089ffaedd5ae8c74dccc7fb45f6509640b661fa086ae979f6fefc3fdba uri: huggingface://QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF/Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "loki-v2.6-8b-1024k" + icon: https://cdn-uploads.huggingface.co/production/uploads/6472de046facfb01d8b1fb9d/uQPITKRS8XLTLyaiGwgh_.jpeg + urls: + - https://huggingface.co/QuantFactory/Loki-v2.6-8b-1024k-GGUF + description: | + The following models were included in the merge: + MrRobotoAI/Epic_Fiction-8b + MrRobotoAI/Unaligned-RP-Base-8b-1024k + MrRobotoAI/Loki-.Epic_Fiction.-8b + Casual-Autopsy/L3-Luna-8B + Casual-Autopsy/L3-Super-Nova-RP-8B + Casual-Autopsy/L3-Umbral-Mind-RP-v3.0-8B + Casual-Autopsy/Halu-L3-Stheno-BlackOasis-8B + Undi95/Llama-3-LewdPlay-8B + Undi95/Llama-3-LewdPlay-8B-evo + Undi95/Llama-3-Unholy-8B + ChaoticNeutrals/Hathor_Tahsin-L3-8B-v0.9 + ChaoticNeutrals/Hathor_RP-v.01-L3-8B + ChaoticNeutrals/Domain-Fusion-L3-8B + ChaoticNeutrals/T-900-8B + ChaoticNeutrals/Poppy_Porpoise-1.4-L3-8B + ChaoticNeutrals/Templar_v1_8B + 
ChaoticNeutrals/Hathor_Respawn-L3-8B-v0.8 + ChaoticNeutrals/Sekhmet_Gimmel-L3.1-8B-v0.3 + zeroblu3/LewdPoppy-8B-RP + tohur/natsumura-storytelling-rp-1.0-llama-3.1-8b + jeiku/Chaos_RP_l3_8B + tannedbum/L3-Nymeria-Maid-8B + Nekochu/Luminia-8B-RP + vicgalle/Humanish-Roleplay-Llama-3.1-8B + saishf/SOVLish-Maid-L3-8B + Dogge/llama-3-8B-instruct-Bluemoon-Freedom-RP + MrRobotoAI/Epic_Fiction-8b-v4 + maldv/badger-lambda-0-llama-3-8b + maldv/llama-3-fantasy-writer-8b + maldv/badger-kappa-llama-3-8b + maldv/badger-mu-llama-3-8b + maldv/badger-lambda-llama-3-8b + maldv/badger-iota-llama-3-8b + maldv/badger-writer-llama-3-8b + Magpie-Align/MagpieLM-8B-Chat-v0.1 + nbeerbower/llama-3-gutenberg-8B + nothingiisreal/L3-8B-Stheno-Horny-v3.3-32K + nbeerbower/llama-3-spicy-abliterated-stella-8B + Magpie-Align/MagpieLM-8B-SFT-v0.1 + NeverSleep/Llama-3-Lumimaid-8B-v0.1 + mlabonne/NeuralDaredevil-8B-abliterated + mlabonne/Daredevil-8B-abliterated + NeverSleep/Llama-3-Lumimaid-8B-v0.1-OAS + nothingiisreal/L3-8B-Instruct-Abliterated-DWP + openchat/openchat-3.6-8b-20240522 + turboderp/llama3-turbcat-instruct-8b + UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3 + Undi95/Llama-3-LewdPlay-8B + TIGER-Lab/MAmmoTH2-8B-Plus + OwenArli/Awanllm-Llama-3-8B-Cumulus-v1.0 + refuelai/Llama-3-Refueled + SicariusSicariiStuff/LLAMA-3_8B_Unaligned_Alpha + NousResearch/Hermes-2-Theta-Llama-3-8B + ResplendentAI/Nymph_8B + grimjim/Llama-3-Oasis-v1-OAS-8B + flammenai/Mahou-1.3b-llama3-8B + lemon07r/Llama-3-RedMagic4-8B + grimjim/Llama-3.1-SuperNova-Lite-lorabilterated-8B + grimjim/Llama-Nephilim-Metamorphosis-v2-8B + lemon07r/Lllama-3-RedElixir-8B + grimjim/Llama-3-Perky-Pat-Instruct-8B + ChaoticNeutrals/Hathor_RP-v.01-L3-8B + grimjim/llama-3-Nephilim-v2.1-8B + ChaoticNeutrals/Hathor_Respawn-L3-8B-v0.8 + migtissera/Llama-3-8B-Synthia-v3.5 + Locutusque/Llama-3-Hercules-5.0-8B + WhiteRabbitNeo/Llama-3-WhiteRabbitNeo-8B-v2.0 + VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct + iRyanBell/ARC1-II + HPAI-BSC/Llama3-Aloe-8B-Alpha + HaitameLaf/Llama-3-8B-StoryGenerator + failspy/Meta-Llama-3-8B-Instruct-abliterated-v3 + Undi95/Llama-3-Unholy-8B + ajibawa-2023/Uncensored-Frank-Llama-3-8B + ajibawa-2023/SlimOrca-Llama-3-8B + ChaoticNeutrals/Templar_v1_8B + aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K + ChaoticNeutrals/Hathor_Tahsin-L3-8B-v0.9 + Blackroot/Llama-3-Gamma-Twist + FPHam/L3-8B-Everything-COT + Blackroot/Llama-3-LongStory + ChaoticNeutrals/Sekhmet_Gimmel-L3.1-8B-v0.3 + abacusai/Llama-3-Smaug-8B + Khetterman/CursedMatrix-8B-v9 + ajibawa-2023/Scarlett-Llama-3-8B-v1.0 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/physics_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/electrical_engineering + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_chemistry + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_physics + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/formal_logic + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy_100 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/conceptual_physics + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_computer_science + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/psychology_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/psychology + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Blackroot/Llama3-RP-Lora + 
MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LimaRP-Instruct-LoRA-8B + MrRobotoAI/Unaligned-RP-Base-8b-1024k + nothingiisreal/llama3-8B-DWP-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/world_religions + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/high_school_european_history + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/electrical_engineering + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-8B-Abomination-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LongStory-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/human_sexuality + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/sociology + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Theory_of_Mind_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Smarts_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LongStory-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Nimue-8B + MrRobotoAI/Unaligned-RP-Base-8b-1024k + vincentyandex/lora_llama3_chunked_novel_bs128 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Aura_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/L3-Daybreak-8b-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Luna_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + nicce/story-mixtral-8x7b-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Blackroot/Llama-3-LongStory-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/NoWarning_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/BlueMoon_Llama3 + overrides: + parameters: + model: Loki-v2.6-8b-1024k.Q4_K_M.gguf + files: + - filename: Loki-v2.6-8b-1024k.Q4_K_M.gguf + sha256: 9b15c1fee0a0e6d6ed97df3d1b6fc8f774e6e1bd388328599e731c62e0f19d81 + uri: huggingface://QuantFactory/Loki-v2.6-8b-1024k-GGUF/Loki-v2.6-8b-1024k.Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From b86a3e4fa69513cc876dac327e676740306f16dc Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 5 Dec 2024 10:05:35 +0100 Subject: [PATCH 35/89] chore(model gallery): add math-iio-7b-instruct (#4323) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index cc40048c..e8e9c9fc 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1791,6 +1791,30 @@ - filename: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf sha256: a356f279a104bff0bbc2ef7ec136c1e774153de8893bf988083e96fb7f4bc053 uri: huggingface://QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF/HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "math-iio-7b-instruct" + icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/faLfR-doaWP_BLUkOQrbq.png + urls: + - https://huggingface.co/prithivMLmods/Math-IIO-7B-Instruct + - https://huggingface.co/QuantFactory/Math-IIO-7B-Instruct-GGUF + description: | + The Math IIO 7B Instruct is a fine-tuned language model based on the robust Qwen2.5-7B-Instruct architecture. This model has been specifically trained to excel in single-shot mathematical reasoning and instruction-based tasks, making it a reliable choice for educational, analytical, and problem-solving applications. + Key Features: + Math-Optimized Capabilities: + The model is designed to handle complex mathematical problems, step-by-step calculations, and reasoning tasks. 
+ + Instruction-Tuned: + Fine-tuned for better adherence to structured queries and task-oriented prompts, enabling clear and concise outputs. + + Large Vocabulary: + Equipped with an extensive tokenizer configuration and custom tokens to ensure precise mathematical notation support. + overrides: + parameters: + model: Math-IIO-7B-Instruct.Q4_K_M.gguf + files: + - filename: Math-IIO-7B-Instruct.Q4_K_M.gguf + sha256: 8ffda0b6a43eb9997dfd7db48fe3bd0970fd1b9b86fb68f082c38622a48b58f4 + uri: huggingface://QuantFactory/Math-IIO-7B-Instruct-GGUF/Math-IIO-7B-Instruct.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From b90d78d9f6438cd8c90fd901b45539a4b410c264 Mon Sep 17 00:00:00 2001 From: PetrFlegr Date: Thu, 5 Dec 2024 16:06:51 +0100 Subject: [PATCH 36/89] Updated links of yamls (#4324) Updated links Links to deplyment*.yaml was changed Signed-off-by: PetrFlegr --- docs/content/docs/getting-started/kubernetes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/content/docs/getting-started/kubernetes.md b/docs/content/docs/getting-started/kubernetes.md index fb08b046..aea28f3e 100644 --- a/docs/content/docs/getting-started/kubernetes.md +++ b/docs/content/docs/getting-started/kubernetes.md @@ -10,13 +10,13 @@ ico = "rocket_launch" For installing LocalAI in Kubernetes, the deployment file from the `examples` can be used and customized as prefered: ``` -kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI/master/examples/kubernetes/deployment.yaml +kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI-examples/refs/heads/main/kubernetes/deployment.yaml ``` For Nvidia GPUs: ``` -kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI/master/examples/kubernetes/deployment-nvidia.yaml +kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI-examples/refs/heads/main/kubernetes/deployment-nvidia.yaml ``` Alternatively, the [helm chart](https://github.com/go-skynet/helm-charts) can be used as well: From 3127cd135279f40926d4375bdf3e940b789e2734 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 5 Dec 2024 16:57:56 +0100 Subject: [PATCH 37/89] chore(docs): update available backends (#4325) Signed-off-by: Ettore Di Giacinto --- .../docs/reference/compatibility-table.md | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/docs/content/docs/reference/compatibility-table.md b/docs/content/docs/reference/compatibility-table.md index f76ad85d..c3bf2660 100644 --- a/docs/content/docs/reference/compatibility-table.md +++ b/docs/content/docs/reference/compatibility-table.md @@ -6,7 +6,7 @@ weight = 24 url = "/model-compatibility/" +++ -Besides llama based models, LocalAI is compatible also with other architectures. The table below lists all the compatible models families and the associated binding repository. +Besides llama based models, LocalAI is compatible also with other architectures. The table below lists all the backends, compatible models families and the associated repository. 
{{% alert note %}} @@ -16,19 +16,8 @@ LocalAI will attempt to automatically load models which are not explicitly confi | Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration | |----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------| -| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | Vicuna, Alpaca, LLaMa, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | -| [gpt4all-llama](https://github.com/nomic-ai/gpt4all) | Vicuna, Alpaca, LLaMa | yes | GPT | no | yes | N/A | -| [gpt4all-mpt](https://github.com/nomic-ai/gpt4all) | MPT | yes | GPT | no | yes | N/A | -| [gpt4all-j](https://github.com/nomic-ai/gpt4all) | GPT4ALL-J | yes | GPT | no | yes | N/A | -| [falcon-ggml](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Falcon (*) | yes | GPT | no | no | N/A | -| [dolly](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Dolly | yes | GPT | no | no | N/A | -| [gptj](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | GPTJ | yes | GPT | no | no | N/A | -| [mpt](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | MPT | yes | GPT | no | no | N/A | -| [replit](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Replit | yes | GPT | no | no | N/A | -| [gptneox](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | GPT NeoX, RedPajama, StableLM | yes | GPT | no | no | N/A | -| [bloomz](https://github.com/NouamaneTazi/bloomz.cpp) ([binding](https://github.com/go-skynet/bloomz.cpp)) | Bloom | yes | GPT | no | no | N/A | -| [rwkv](https://github.com/saharNooby/rwkv.cpp) ([binding](https://github.com/donomii/go-rwkv.cpp)) | rwkv | yes | GPT | no | yes | N/A | -| [bert](https://github.com/skeskinen/bert.cpp) ([binding](https://github.com/go-skynet/go-bert.cpp)) | bert | no | Embeddings only | yes | no | N/A | +| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | +| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp)) | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | | [whisper](https://github.com/ggerganov/whisper.cpp) | whisper | no | Audio | no | no | N/A | | [stablediffusion](https://github.com/EdVince/Stable-Diffusion-NCNN) ([binding](https://github.com/mudler/go-stable-diffusion)) | stablediffusion | no | Image | no | no | N/A | | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A | @@ -40,11 +29,18 @@ LocalAI will 
attempt to automatically load models which are not explicitly confi | `diffusers` | SD,... | no | Image generation | no | no | N/A | | `vall-e-x` | Vall-E | no | Audio generation and Voice cloning | no | no | CPU/CUDA | | `vllm` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | +| `mamba` | Mamba models architecture | yes | GPT | no | no | CPU/CUDA | | `exllama2` | GPTQ | yes | GPT only | no | no | N/A | | `transformers-musicgen` | | no | Audio generation | no | no | N/A | | [tinydream](https://github.com/symisc/tiny-dream#tiny-dreaman-embedded-header-only-stable-diffusion-inference-c-librarypixlabiotiny-dream) | stablediffusion | no | Image | no | no | N/A | | `coqui` | Coqui | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `openvoice` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `parler-tts` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API | no | Reranking | no | no | CPU/CUDA | | `transformers` | Various GPTs and quantization formats | yes | GPT, embeddings | yes | yes**** | CPU/CUDA/XPU | +| [bark-cpp](https://github.com/PABannier/bark.cpp) | bark | no | Audio-Only | no | no | yes | +| [stablediffusion-cpp](https://github.com/leejet/stable-diffusion.cpp) | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker | no | Image | no | no | N/A | +| [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU | Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "docs/advanced" %}})). 
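As a rough illustration of the note that closes the table above, a standalone model configuration selecting one of the listed backends could look like the sketch below. It mirrors the `gallery/flux-ggml.yaml` configuration added earlier in this series together with the top-level `cfg_scale` and `options` fields introduced in `core/config/backend_config.go`; the model name, file paths and numeric values are placeholders, not a configuration shipped by the project.

```yaml
# Illustrative model config sketch (names and values are placeholders).
name: flux-dev-ggml
backend: stablediffusion-ggml   # any backend name from the table above
step: 25
cfg_scale: 3.5                  # top-level field after this series, no longer under `diffusers:`
parameters:
  model: flux1-dev-Q2_K.gguf    # placeholder weights file in the models directory
options:                        # backend-specific options parsed by the stablediffusion-ggml backend
  - "diffusion_model"
  - "clip_l_path:clip_l.safetensors"
  - "t5xxl_path:t5xxl_fp16.safetensors"
  - "vae_path:ae.safetensors"
  - "sampler:euler"
```
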
From ba225f660b532e9f51366660b17405913392466a Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 5 Dec 2024 22:54:00 +0100 Subject: [PATCH 38/89] docs: :arrow_up: update docs version mudler/LocalAI (#4327) :arrow_up: Update docs version mudler/LocalAI Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- docs/data/version.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/data/version.json b/docs/data/version.json index 20611657..bb7517a1 100644 --- a/docs/data/version.json +++ b/docs/data/version.json @@ -1,3 +1,3 @@ { - "version": "v2.23.0" + "version": "v2.24.0" } From 88737e1d760e5d6466f98d1d692d6589dcd1ca7a Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 6 Dec 2024 09:15:21 +0100 Subject: [PATCH 39/89] chore: :arrow_up: Update ggerganov/llama.cpp to `c9c6e01daedac542b174c235872569fce5385982` (#4328) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c1a6cbcd..225189ad 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=59f4db10883a4f3e855cffbf2c3ab68430e95272 +CPPLLAMA_VERSION?=c9c6e01daedac542b174c235872569fce5385982 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From d4c1746c7db3d13ba97bb9d8a8b698d8a366a0a7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 6 Dec 2024 10:23:59 +0100 Subject: [PATCH 40/89] feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329) Signed-off-by: Ettore Di Giacinto --- backend/backend.proto | 3 +++ backend/cpp/llama/grpc-server.cpp | 6 ++++++ core/backend/options.go | 2 ++ core/config/backend_config.go | 6 ++++-- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/backend/backend.proto b/backend/backend.proto index 48b0101b..0a341ca2 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -242,6 +242,9 @@ message ModelOptions { repeated float LoraScales = 61; repeated string Options = 62; + + string CacheTypeKey = 63; + string CacheTypeValue = 64; } message Result { diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 0fde74cb..ea5c4e34 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2241,6 +2241,12 @@ static void params_parse(const backend::ModelOptions* request, } // params.model_alias ?? 
params.model_alias = request->modelfile(); + if (!request->cachetypekey().empty()) { + params.cache_type_k = request->cachetypekey(); + } + if (!request->cachetypevalue().empty()) { + params.cache_type_v = request->cachetypevalue(); + } params.n_ctx = request->contextsize(); //params.memory_f16 = request->f16memory(); params.cpuparams.n_threads = request->threads(); diff --git a/core/backend/options.go b/core/backend/options.go index 1f88122f..f6247c60 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { TensorParallelSize: int32(c.TensorParallelSize), MMProj: c.MMProj, FlashAttention: c.FlashAttention, + CacheTypeKey: c.CacheTypeK, + CacheTypeValue: c.CacheTypeV, NoKVOffload: c.NoKVOffloading, YarnExtFactor: c.YarnExtFactor, YarnAttnFactor: c.YarnAttnFactor, diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 1de540f9..0ff34769 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -155,8 +155,10 @@ type LLMConfig struct { TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM MMProj string `yaml:"mmproj"` - FlashAttention bool `yaml:"flash_attention"` - NoKVOffloading bool `yaml:"no_kv_offloading"` + FlashAttention bool `yaml:"flash_attention"` + NoKVOffloading bool `yaml:"no_kv_offloading"` + CacheTypeK string `yaml:"cache_type_k"` + CacheTypeV string `yaml:"cache_type_v"` RopeScaling string `yaml:"rope_scaling"` ModelType string `yaml:"type"` From 5592f5e8206ba0bc8c2a00f760cde1f7b1da2c08 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 6 Dec 2024 22:46:51 +0100 Subject: [PATCH 41/89] chore: :arrow_up: Update ggerganov/llama.cpp to `c5ede3849fc021174862f9c0bf8273808d8f0d39` (#4330) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 225189ad..1ab621cd 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=c9c6e01daedac542b174c235872569fce5385982 +CPPLLAMA_VERSION?=c5ede3849fc021174862f9c0bf8273808d8f0d39 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 7184ca546fc553874441e789ff466de69b4e2b93 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 7 Dec 2024 10:39:20 +0100 Subject: [PATCH 42/89] chore(model gallery): add llama-3.3-70b-instruct (#4333) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index e8e9c9fc..c94358b6 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,27 @@ --- +- &llama33 + url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png + license: llama3.3 + description: | + The Meta Llama 3.3 multilingual large language model (LLM) is a pretrained and instruction tuned generative model in 70B (text in/text out). 
The Llama 3.3 instruction tuned text only model is optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. + tags: + - llm + - gguf + - gpu + - cpu + - llama3.3 + name: "llama-3.3-70b-instruct" + urls: + - https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + - https://huggingface.co/MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF + overrides: + parameters: + model: Llama-3.3-70B-Instruct.Q4_K_M.gguf + files: + - filename: Llama-3.3-70B-Instruct.Q4_K_M.gguf + sha256: 4f3b04ecae278bdb0fd545b47c210bc5edf823e5ebf7d41e0b526c81d54b1ff3 + uri: huggingface://MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct.Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From f5e1527a5accbab3af6a69a0cbf085ff5e61a8c6 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 7 Dec 2024 22:51:45 +0100 Subject: [PATCH 43/89] chore: :arrow_up: Update ggerganov/llama.cpp to `3573fa8e7b7f0865638b52b4e9b4d2006f0558a2` (#4335) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1ab621cd..786de811 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=c5ede3849fc021174862f9c0bf8273808d8f0d39 +CPPLLAMA_VERSION?=3573fa8e7b7f0865638b52b4e9b4d2006f0558a2 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From cea5a0ea42348f64b982ef7fb64796a86d2bd70e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 8 Dec 2024 13:50:33 +0100 Subject: [PATCH 44/89] feat(template): read jinja templates from gguf files (#4332) * Read jinja templates as fallback Signed-off-by: Ettore Di Giacinto * Move templating out of model loader Signed-off-by: Ettore Di Giacinto * Test TemplateMessages Signed-off-by: Ettore Di Giacinto * Set role and content from transformers Signed-off-by: Ettore Di Giacinto * Tests: be more flexible Signed-off-by: Ettore Di Giacinto * More jinja Signed-off-by: Ettore Di Giacinto * Small refactoring and adaptations Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- core/application.go | 38 --- core/application/application.go | 39 +++ .../config_file_watcher.go | 4 +- core/{startup => application}/startup.go | 77 ++--- core/cli/run.go | 8 +- core/config/backend_config.go | 2 + core/config/guesser.go | 16 +- core/http/app.go | 73 +++-- core/http/app_test.go | 24 +- core/http/endpoints/openai/chat.go | 146 +-------- core/http/endpoints/openai/completion.go | 47 +-- core/http/endpoints/openai/edit.go | 33 +- core/http/routes/localai.go | 48 +-- core/http/routes/openai.go | 154 ++++++--- go.mod | 5 + go.sum | 12 + pkg/model/loader.go | 4 - pkg/model/template.go | 52 --- pkg/model/template_test.go | 197 ------------ pkg/templates/cache.go | 156 ++++++--- pkg/templates/cache_test.go | 73 ----- pkg/templates/evaluator.go | 295 ++++++++++++++++++ pkg/templates/evaluator_test.go | 253 +++++++++++++++ 23 files changed, 971 insertions(+), 785 deletions(-) delete mode 100644 core/application.go create mode 100644 core/application/application.go rename 
core/{startup => application}/config_file_watcher.go (96%) rename core/{startup => application}/startup.go (62%) delete mode 100644 pkg/model/template.go delete mode 100644 pkg/model/template_test.go delete mode 100644 pkg/templates/cache_test.go create mode 100644 pkg/templates/evaluator.go create mode 100644 pkg/templates/evaluator_test.go diff --git a/core/application.go b/core/application.go deleted file mode 100644 index e4efbdd0..00000000 --- a/core/application.go +++ /dev/null @@ -1,38 +0,0 @@ -package core - -import ( - "github.com/mudler/LocalAI/core/config" - "github.com/mudler/LocalAI/core/services" - "github.com/mudler/LocalAI/pkg/model" -) - -// The purpose of this structure is to hold pointers to all initialized services, to make plumbing easy -// Perhaps a proper DI system is worth it in the future, but for now keep things simple. -type Application struct { - - // Application-Level Config - ApplicationConfig *config.ApplicationConfig - // ApplicationState *ApplicationState - - // Core Low-Level Services - BackendConfigLoader *config.BackendConfigLoader - ModelLoader *model.ModelLoader - - // Backend Services - // EmbeddingsBackendService *backend.EmbeddingsBackendService - // ImageGenerationBackendService *backend.ImageGenerationBackendService - // LLMBackendService *backend.LLMBackendService - // TranscriptionBackendService *backend.TranscriptionBackendService - // TextToSpeechBackendService *backend.TextToSpeechBackendService - - // LocalAI System Services - BackendMonitorService *services.BackendMonitorService - GalleryService *services.GalleryService - LocalAIMetricsService *services.LocalAIMetricsService - // OpenAIService *services.OpenAIService -} - -// TODO [NEXT PR?]: Break up ApplicationConfig. -// Migrate over stuff that is not set via config at all - especially runtime stuff -type ApplicationState struct { -} diff --git a/core/application/application.go b/core/application/application.go new file mode 100644 index 00000000..6e8d6204 --- /dev/null +++ b/core/application/application.go @@ -0,0 +1,39 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" +) + +type Application struct { + backendLoader *config.BackendConfigLoader + modelLoader *model.ModelLoader + applicationConfig *config.ApplicationConfig + templatesEvaluator *templates.Evaluator +} + +func newApplication(appConfig *config.ApplicationConfig) *Application { + return &Application{ + backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath), + modelLoader: model.NewModelLoader(appConfig.ModelPath), + applicationConfig: appConfig, + templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath), + } +} + +func (a *Application) BackendLoader() *config.BackendConfigLoader { + return a.backendLoader +} + +func (a *Application) ModelLoader() *model.ModelLoader { + return a.modelLoader +} + +func (a *Application) ApplicationConfig() *config.ApplicationConfig { + return a.applicationConfig +} + +func (a *Application) TemplatesEvaluator() *templates.Evaluator { + return a.templatesEvaluator +} diff --git a/core/startup/config_file_watcher.go b/core/application/config_file_watcher.go similarity index 96% rename from core/startup/config_file_watcher.go rename to core/application/config_file_watcher.go index df72483f..46f29b10 100644 --- a/core/startup/config_file_watcher.go +++ b/core/application/config_file_watcher.go @@ -1,4 +1,4 @@ -package startup +package application import ( "encoding/json" 
@@ -8,8 +8,8 @@ import ( "path/filepath" "time" - "github.com/fsnotify/fsnotify" "dario.cat/mergo" + "github.com/fsnotify/fsnotify" "github.com/mudler/LocalAI/core/config" "github.com/rs/zerolog/log" ) diff --git a/core/startup/startup.go b/core/application/startup.go similarity index 62% rename from core/startup/startup.go rename to core/application/startup.go index 0eb5fa58..cd52d37a 100644 --- a/core/startup/startup.go +++ b/core/application/startup.go @@ -1,15 +1,15 @@ -package startup +package application import ( "fmt" "os" - "github.com/mudler/LocalAI/core" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/services" "github.com/mudler/LocalAI/internal" "github.com/mudler/LocalAI/pkg/assets" + "github.com/mudler/LocalAI/pkg/library" "github.com/mudler/LocalAI/pkg/model" pkgStartup "github.com/mudler/LocalAI/pkg/startup" @@ -17,8 +17,9 @@ import ( "github.com/rs/zerolog/log" ) -func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) { +func New(opts ...config.AppOption) (*Application, error) { options := config.NewApplicationConfig(opts...) + application := newApplication(options) log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath) log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion()) @@ -36,28 +37,28 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode // Make sure directories exists if options.ModelPath == "" { - return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty") + return nil, fmt.Errorf("options.ModelPath cannot be empty") } err = os.MkdirAll(options.ModelPath, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err) + return nil, fmt.Errorf("unable to create ModelPath: %q", err) } if options.ImageDir != "" { err := os.MkdirAll(options.ImageDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err) + return nil, fmt.Errorf("unable to create ImageDir: %q", err) } } if options.AudioDir != "" { err := os.MkdirAll(options.AudioDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err) + return nil, fmt.Errorf("unable to create AudioDir: %q", err) } } if options.UploadDir != "" { err := os.MkdirAll(options.UploadDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err) + return nil, fmt.Errorf("unable to create UploadDir: %q", err) } } @@ -65,39 +66,36 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode log.Error().Err(err).Msg("error installing models") } - cl := config.NewBackendConfigLoader(options.ModelPath) - ml := model.NewModelLoader(options.ModelPath) - configLoaderOpts := options.ToConfigLoaderOptions() - if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil { + if err := application.BackendLoader().LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil { log.Error().Err(err).Msg("error loading config files") } if options.ConfigFile != "" { - if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil { + if err := application.BackendLoader().LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil { log.Error().Err(err).Msg("error loading config file") } } - if err := cl.Preload(options.ModelPath); 
err != nil { + if err := application.BackendLoader().Preload(options.ModelPath); err != nil { log.Error().Err(err).Msg("error downloading models") } if options.PreloadJSONModels != "" { if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil { - return nil, nil, nil, err + return nil, err } } if options.PreloadModelsFromPath != "" { if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil { - return nil, nil, nil, err + return nil, err } } if options.Debug { - for _, v := range cl.GetAllBackendConfigs() { + for _, v := range application.BackendLoader().GetAllBackendConfigs() { log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v) } } @@ -123,7 +121,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode go func() { <-options.Context.Done() log.Debug().Msgf("Context canceled, shutting down") - err := ml.StopAllGRPC() + err := application.ModelLoader().StopAllGRPC() if err != nil { log.Error().Err(err).Msg("error while stopping all grpc backends") } @@ -131,12 +129,12 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode if options.WatchDog { wd := model.NewWatchDog( - ml, + application.ModelLoader(), options.WatchDogBusyTimeout, options.WatchDogIdleTimeout, options.WatchDogBusy, options.WatchDogIdle) - ml.SetWatchDog(wd) + application.ModelLoader().SetWatchDog(wd) go wd.Run() go func() { <-options.Context.Done() @@ -147,7 +145,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode if options.LoadToMemory != nil { for _, m := range options.LoadToMemory { - cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath, + cfg, err := application.BackendLoader().LoadBackendConfigFileByName(m, options.ModelPath, config.LoadOptionDebug(options.Debug), config.LoadOptionThreads(options.Threads), config.LoadOptionContextSize(options.ContextSize), @@ -155,7 +153,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode config.ModelPath(options.ModelPath), ) if err != nil { - return nil, nil, nil, err + return nil, err } log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model) @@ -163,9 +161,9 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode o := backend.ModelOptions(*cfg, options) var backendErr error - _, backendErr = ml.Load(o...) + _, backendErr = application.ModelLoader().Load(o...) if backendErr != nil { - return nil, nil, nil, err + return nil, err } } } @@ -174,7 +172,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode startWatcher(options) log.Info().Msg("core/startup process completed!") - return cl, ml, options, nil + return application, nil } func startWatcher(options *config.ApplicationConfig) { @@ -201,32 +199,3 @@ func startWatcher(options *config.ApplicationConfig) { log.Error().Err(err).Msg("failed creating watcher") } } - -// In Lieu of a proper DI framework, this function wires up the Application manually. -// This is in core/startup rather than core/state.go to keep package references clean! 
-func createApplication(appConfig *config.ApplicationConfig) *core.Application { - app := &core.Application{ - ApplicationConfig: appConfig, - BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath), - ModelLoader: model.NewModelLoader(appConfig.ModelPath), - } - - var err error - - // app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - - app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - app.GalleryService = services.NewGalleryService(app.ApplicationConfig) - // app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService) - - app.LocalAIMetricsService, err = services.NewLocalAIMetricsService() - if err != nil { - log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.") - } - - return app -} diff --git a/core/cli/run.go b/core/cli/run.go index b2d439a0..a0e16155 100644 --- a/core/cli/run.go +++ b/core/cli/run.go @@ -6,12 +6,12 @@ import ( "strings" "time" + "github.com/mudler/LocalAI/core/application" cli_api "github.com/mudler/LocalAI/core/cli/api" cliContext "github.com/mudler/LocalAI/core/cli/context" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http" "github.com/mudler/LocalAI/core/p2p" - "github.com/mudler/LocalAI/core/startup" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -186,16 +186,16 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error { } if r.PreloadBackendOnly { - _, _, _, err := startup.Startup(opts...) + _, err := application.New(opts...) return err } - cl, ml, options, err := startup.Startup(opts...) + app, err := application.New(opts...) 
if err != nil { return fmt.Errorf("failed basic startup tasks with error %s", err.Error()) } - appHTTP, err := http.App(cl, ml, options) + appHTTP, err := http.API(app) if err != nil { log.Error().Err(err).Msg("error during HTTP App construction") return err diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 0ff34769..f07ec3d3 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -206,6 +206,8 @@ type TemplateConfig struct { JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"` Multimodal string `yaml:"multimodal"` + + JinjaTemplate bool `yaml:"jinja_template"` } func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error { diff --git a/core/config/guesser.go b/core/config/guesser.go index b63dd051..f5627461 100644 --- a/core/config/guesser.go +++ b/core/config/guesser.go @@ -26,14 +26,14 @@ const ( type settingsConfig struct { StopWords []string TemplateConfig TemplateConfig - RepeatPenalty float64 + RepeatPenalty float64 } // default settings to adopt with a given model family var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ Gemma: { RepeatPenalty: 1.0, - StopWords: []string{"<|im_end|>", "", ""}, + StopWords: []string{"<|im_end|>", "", ""}, TemplateConfig: TemplateConfig{ Chat: "{{.Input }}\nmodel\n", ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", @@ -200,6 +200,18 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) { } else { log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family") } + + if cfg.HasTemplate() { + return + } + + // identify from well known templates first, otherwise use the raw jinja template + chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") + if found { + // try to use the jinja template + cfg.TemplateConfig.JinjaTemplate = true + cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() + } } func identifyFamily(f *gguf.GGUFFile) familyType { diff --git a/core/http/app.go b/core/http/app.go index 2ba2c2b9..a2d8b87a 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -14,10 +14,9 @@ import ( "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/http/routes" - "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/services" - "github.com/mudler/LocalAI/pkg/model" "github.com/gofiber/contrib/fiberzerolog" "github.com/gofiber/fiber/v2" @@ -49,18 +48,18 @@ var embedDirStatic embed.FS // @in header // @name Authorization -func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) { +func API(application *application.Application) (*fiber.App, error) { fiberCfg := fiber.Config{ Views: renderEngine(), - BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB + BodyLimit: application.ApplicationConfig().UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB // We disable the Fiber startup message as it does not conform to structured logging. 
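A minimal sketch of what the new GGUF-template fallback in guesser.go produces (hypothetical model name and Jinja snippet, not part of the patch): when no known family template matches, the tokenizer.chat_template metadata is stored as the chat message template and jinja_template is switched on, roughly equivalent to this config:

# hypothetical resulting model config
name: my-gguf-model
parameters:
  model: my-model.gguf       # hypothetical GGUF file
template:
  jinja_template: true       # new TemplateConfig field (yaml: jinja_template)
  chat_message: |            # filled from the GGUF tokenizer.chat_template metadata
    {%- for message in messages -%}
    <|{{ message.role }}|>{{ message.content }}
    {%- endfor -%}
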
// We register a startup log line with connection information in the OnListen hook to keep things user friendly though DisableStartupMessage: true, // Override default error handler } - if !appConfig.OpaqueErrors { + if !application.ApplicationConfig().OpaqueErrors { // Normally, return errors as JSON responses fiberCfg.ErrorHandler = func(ctx *fiber.Ctx, err error) error { // Status code defaults to 500 @@ -86,9 +85,9 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi } } - app := fiber.New(fiberCfg) + router := fiber.New(fiberCfg) - app.Hooks().OnListen(func(listenData fiber.ListenData) error { + router.Hooks().OnListen(func(listenData fiber.ListenData) error { scheme := "http" if listenData.TLS { scheme = "https" @@ -99,82 +98,82 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi // Have Fiber use zerolog like the rest of the application rather than it's built-in logger logger := log.Logger - app.Use(fiberzerolog.New(fiberzerolog.Config{ + router.Use(fiberzerolog.New(fiberzerolog.Config{ Logger: &logger, })) // Default middleware config - if !appConfig.Debug { - app.Use(recover.New()) + if !application.ApplicationConfig().Debug { + router.Use(recover.New()) } - if !appConfig.DisableMetrics { + if !application.ApplicationConfig().DisableMetrics { metricsService, err := services.NewLocalAIMetricsService() if err != nil { return nil, err } if metricsService != nil { - app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService)) - app.Hooks().OnShutdown(func() error { + router.Use(localai.LocalAIMetricsAPIMiddleware(metricsService)) + router.Hooks().OnShutdown(func() error { return metricsService.Shutdown() }) } } // Health Checks should always be exempt from auth, so register these first - routes.HealthRoutes(app) + routes.HealthRoutes(router) - kaConfig, err := middleware.GetKeyAuthConfig(appConfig) + kaConfig, err := middleware.GetKeyAuthConfig(application.ApplicationConfig()) if err != nil || kaConfig == nil { return nil, fmt.Errorf("failed to create key auth config: %w", err) } // Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration - app.Use(v2keyauth.New(*kaConfig)) + router.Use(v2keyauth.New(*kaConfig)) - if appConfig.CORS { + if application.ApplicationConfig().CORS { var c func(ctx *fiber.Ctx) error - if appConfig.CORSAllowOrigins == "" { + if application.ApplicationConfig().CORSAllowOrigins == "" { c = cors.New() } else { - c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins}) + c = cors.New(cors.Config{AllowOrigins: application.ApplicationConfig().CORSAllowOrigins}) } - app.Use(c) + router.Use(c) } - if appConfig.CSRF { + if application.ApplicationConfig().CSRF { log.Debug().Msg("Enabling CSRF middleware. 
Tokens are now required for state-modifying requests") - app.Use(csrf.New()) + router.Use(csrf.New()) } // Load config jsons - utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles) - utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants) - utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles) + utils.LoadConfig(application.ApplicationConfig().UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles) + utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants) + utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles) - galleryService := services.NewGalleryService(appConfig) - galleryService.Start(appConfig.Context, cl) + galleryService := services.NewGalleryService(application.ApplicationConfig()) + galleryService.Start(application.ApplicationConfig().Context, application.BackendLoader()) - routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig) - routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService) - routes.RegisterOpenAIRoutes(app, cl, ml, appConfig) - if !appConfig.DisableWebUI { - routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService) + routes.RegisterElevenLabsRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()) + routes.RegisterLocalAIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService) + routes.RegisterOpenAIRoutes(router, application) + if !application.ApplicationConfig().DisableWebUI { + routes.RegisterUIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService) } - routes.RegisterJINARoutes(app, cl, ml, appConfig) + routes.RegisterJINARoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()) httpFS := http.FS(embedDirStatic) - app.Use(favicon.New(favicon.Config{ + router.Use(favicon.New(favicon.Config{ URL: "/favicon.ico", FileSystem: httpFS, File: "static/favicon.ico", })) - app.Use("/static", filesystem.New(filesystem.Config{ + router.Use("/static", filesystem.New(filesystem.Config{ Root: httpFS, PathPrefix: "static", Browse: true, @@ -182,7 +181,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi // Define a custom 404 handler // Note: keep this at the bottom! - app.Use(notFoundHandler) + router.Use(notFoundHandler) - return app, nil + return router, nil } diff --git a/core/http/app_test.go b/core/http/app_test.go index 83fb0e73..34ebacf7 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -12,15 +12,14 @@ import ( "path/filepath" "runtime" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/config" . "github.com/mudler/LocalAI/core/http" "github.com/mudler/LocalAI/core/schema" - "github.com/mudler/LocalAI/core/startup" "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/pkg/downloader" - "github.com/mudler/LocalAI/pkg/model" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "gopkg.in/yaml.v3" @@ -252,9 +251,6 @@ var _ = Describe("API test", func() { var cancel context.CancelFunc var tmpdir string var modelDir string - var bcl *config.BackendConfigLoader - var ml *model.ModelLoader - var applicationConfig *config.ApplicationConfig commonOpts := []config.AppOption{ config.WithDebug(true), @@ -300,7 +296,7 @@ var _ = Describe("API test", func() { }, } - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithGalleries(galleries), @@ -310,7 +306,7 @@ var _ = Describe("API test", func() { config.WithBackendAssetsOutput(backendAssetsDir))...) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -539,7 +535,7 @@ var _ = Describe("API test", func() { var res map[string]string err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res) Expect(err).ToNot(HaveOccurred()) - Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res)) + Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res)) Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res)) Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason)) @@ -641,7 +637,7 @@ var _ = Describe("API test", func() { }, } - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithAudioDir(tmpdir), @@ -652,7 +648,7 @@ var _ = Describe("API test", func() { config.WithBackendAssetsOutput(tmpdir))..., ) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -772,14 +768,14 @@ var _ = Describe("API test", func() { var err error - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")), config.WithContext(c), config.WithModelPath(modelPath), )...) 
Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -990,14 +986,14 @@ var _ = Describe("API test", func() { c, cancel = context.WithCancel(context.Background()) var err error - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithModelPath(modelPath), config.WithConfigFile(os.Getenv("CONFIG_FILE")))..., ) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index b03b18bd..21e71d35 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -14,6 +14,8 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" + "github.com/mudler/LocalAI/pkg/templates" + model "github.com/mudler/LocalAI/pkg/model" "github.com/rs/zerolog/log" "github.com/valyala/fasthttp" @@ -24,7 +26,7 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/chat/completions [post] -func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error { +func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error { var id, textContentToReturn string var created int @@ -298,148 +300,10 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup // If we are using the tokenizer template, we don't need to process the messages // unless we are processing functions if !config.TemplateConfig.UseTokenizerTemplate || shouldUseFn { - suppressConfigSystemPrompt := false - mess := []string{} - for messageIndex, i := range input.Messages { - var content string - role := i.Role - - // if function call, we might want to customize the role so we can display better that the "assistant called a json action" - // if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request - if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" { - roleFn := "assistant_function_call" - r := config.Roles[roleFn] - if r != "" { - role = roleFn - } - } - r := config.Roles[role] - contentExists := i.Content != nil && i.StringContent != "" - - fcall := i.FunctionCall - if len(i.ToolCalls) > 0 { - fcall = i.ToolCalls - } - - // First attempt to populate content via a chat message specific template - if config.TemplateConfig.ChatMessage != "" { - chatMessageData := model.ChatMessageTemplateData{ - SystemPrompt: config.SystemPrompt, - Role: r, - RoleName: role, - Content: i.StringContent, - FunctionCall: fcall, - FunctionName: i.Name, - LastMessage: messageIndex == (len(input.Messages) - 1), - Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)), - MessageIndex: messageIndex, - } - templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData) - if err != nil { - log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing 
message with template, skipping") - } else { - if templatedChatMessage == "" { - log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData) - continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf - } - log.Debug().Msgf("templated message for chat: %s", templatedChatMessage) - content = templatedChatMessage - } - } - - marshalAnyRole := func(f any) { - j, err := json.Marshal(f) - if err == nil { - if contentExists { - content += "\n" + fmt.Sprint(r, " ", string(j)) - } else { - content = fmt.Sprint(r, " ", string(j)) - } - } - } - marshalAny := func(f any) { - j, err := json.Marshal(f) - if err == nil { - if contentExists { - content += "\n" + string(j) - } else { - content = string(j) - } - } - } - // If this model doesn't have such a template, or if that template fails to return a value, template at the message level. - if content == "" { - if r != "" { - if contentExists { - content = fmt.Sprint(r, i.StringContent) - } - - if i.FunctionCall != nil { - marshalAnyRole(i.FunctionCall) - } - if i.ToolCalls != nil { - marshalAnyRole(i.ToolCalls) - } - } else { - if contentExists { - content = fmt.Sprint(i.StringContent) - } - if i.FunctionCall != nil { - marshalAny(i.FunctionCall) - } - if i.ToolCalls != nil { - marshalAny(i.ToolCalls) - } - } - // Special Handling: System. We care if it was printed at all, not the r branch, so check seperately - if contentExists && role == "system" { - suppressConfigSystemPrompt = true - } - } - - mess = append(mess, content) - } - - joinCharacter := "\n" - if config.TemplateConfig.JoinChatMessagesByCharacter != nil { - joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter - } - - predInput = strings.Join(mess, joinCharacter) - log.Debug().Msgf("Prompt (before templating): %s", predInput) - - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Chat != "" && !shouldUseFn { - templateFile = config.TemplateConfig.Chat - } - - if config.TemplateConfig.Functions != "" && shouldUseFn { - templateFile = config.TemplateConfig.Functions - } - - if templateFile != "" { - templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{ - SystemPrompt: config.SystemPrompt, - SuppressSystemPrompt: suppressConfigSystemPrompt, - Input: predInput, - Functions: funcs, - }) - if err == nil { - predInput = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", predInput) - } else { - log.Debug().Msgf("Template failed loading: %s", err.Error()) - } - } + predInput = evaluator.TemplateMessages(input.Messages, config, funcs, shouldUseFn) log.Debug().Msgf("Prompt (after templating): %s", predInput) - if shouldUseFn && config.Grammar != "" { + if config.Grammar != "" { log.Debug().Msgf("Grammar: %+v", config.Grammar) } } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index e5de1b3f..04ebc847 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -16,6 +16,7 @@ import ( "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" "github.com/rs/zerolog/log" 
"github.com/valyala/fasthttp" ) @@ -25,7 +26,7 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/completions [post] -func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { +func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { id := uuid.New().String() created := int(time.Now().Unix()) @@ -94,17 +95,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a c.Set("Transfer-Encoding", "chunked") } - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Completion != "" { - templateFile = config.TemplateConfig.Completion - } - if input.Stream { if len(config.PromptStrings) > 1 { return errors.New("cannot handle more than 1 `PromptStrings` when Streaming") @@ -112,15 +102,13 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a predInput := config.PromptStrings[0] - if templateFile != "" { - templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{ - Input: predInput, - SystemPrompt: config.SystemPrompt, - }) - if err == nil { - predInput = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", predInput) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{ + Input: predInput, + SystemPrompt: config.SystemPrompt, + }) + if err == nil { + predInput = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", predInput) } responses := make(chan schema.OpenAIResponse) @@ -165,16 +153,13 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a totalTokenUsage := backend.TokenUsage{} for k, i := range config.PromptStrings { - if templateFile != "" { - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{ - SystemPrompt: config.SystemPrompt, - Input: i, - }) - if err == nil { - i = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", i) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{ + SystemPrompt: config.SystemPrompt, + Input: i, + }) + if err == nil { + i = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", i) } r, tokenUsage, err := ComputeChoices( diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go index 12fb4035..a6d609fb 100644 --- a/core/http/endpoints/openai/edit.go +++ b/core/http/endpoints/openai/edit.go @@ -12,6 +12,7 @@ import ( "github.com/google/uuid" "github.com/mudler/LocalAI/core/schema" model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" "github.com/rs/zerolog/log" ) @@ -21,7 +22,8 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/edits [post] -func EditEndpoint(cl 
*config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { +func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { + return func(c *fiber.Ctx) error { modelFile, input, err := readRequest(c, cl, ml, appConfig, true) if err != nil { @@ -35,31 +37,18 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConf log.Debug().Msgf("Parameter Config: %+v", config) - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Edit != "" { - templateFile = config.TemplateConfig.Edit - } - var result []schema.Choice totalTokenUsage := backend.TokenUsage{} for _, i := range config.InputStrings { - if templateFile != "" { - templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{ - Input: i, - Instruction: input.Instruction, - SystemPrompt: config.SystemPrompt, - }) - if err == nil { - i = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", i) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.EditPromptTemplate, *config, templates.PromptTemplateData{ + Input: i, + Instruction: input.Instruction, + SystemPrompt: config.SystemPrompt, + }) + if err == nil { + i = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", i) } r, tokenUsage, err := ComputeChoices(input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) { diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index e7097741..2ea9896a 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -11,62 +11,62 @@ import ( "github.com/mudler/LocalAI/pkg/model" ) -func RegisterLocalAIRoutes(app *fiber.App, +func RegisterLocalAIRoutes(router *fiber.App, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, galleryService *services.GalleryService) { - app.Get("/swagger/*", swagger.HandlerDefault) // default + router.Get("/swagger/*", swagger.HandlerDefault) // default // LocalAI API endpoints if !appConfig.DisableGalleryEndpoint { modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService) - app.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint()) - app.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint()) + router.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint()) + router.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint()) - app.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint()) - app.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint()) - app.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint()) - app.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint()) - app.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint()) - app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint()) + router.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint()) + router.Get("/models/galleries", 
modelGalleryEndpointService.ListModelGalleriesEndpoint()) + router.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint()) + router.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint()) + router.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint()) + router.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint()) } - app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig)) - app.Post("/vad", localai.VADEndpoint(cl, ml, appConfig)) + router.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig)) + router.Post("/vad", localai.VADEndpoint(cl, ml, appConfig)) // Stores sl := model.NewModelLoader("") - app.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig)) - app.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig)) - app.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig)) - app.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig)) + router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig)) + router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig)) + router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig)) + router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig)) if !appConfig.DisableMetrics { - app.Get("/metrics", localai.LocalAIMetricsEndpoint()) + router.Get("/metrics", localai.LocalAIMetricsEndpoint()) } // Experimental Backend Statistics Module backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now - app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService)) - app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService)) + router.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService)) + router.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService)) // p2p if p2p.IsP2PEnabled() { - app.Get("/api/p2p", localai.ShowP2PNodes(appConfig)) - app.Get("/api/p2p/token", localai.ShowP2PToken(appConfig)) + router.Get("/api/p2p", localai.ShowP2PNodes(appConfig)) + router.Get("/api/p2p/token", localai.ShowP2PToken(appConfig)) } - app.Get("/version", func(c *fiber.Ctx) error { + router.Get("/version", func(c *fiber.Ctx) error { return c.JSON(struct { Version string `json:"version"` }{Version: internal.PrintableVersion()}) }) - app.Get("/system", localai.SystemInformations(ml, appConfig)) + router.Get("/system", localai.SystemInformations(ml, appConfig)) // misc - app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig)) + router.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig)) } diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go index 081daf70..5ff301b6 100644 --- a/core/http/routes/openai.go +++ b/core/http/routes/openai.go @@ -2,84 +2,134 @@ package routes import ( "github.com/gofiber/fiber/v2" - "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/http/endpoints/localai" "github.com/mudler/LocalAI/core/http/endpoints/openai" - "github.com/mudler/LocalAI/pkg/model" ) func RegisterOpenAIRoutes(app *fiber.App, - cl *config.BackendConfigLoader, - ml *model.ModelLoader, - appConfig *config.ApplicationConfig) { + application *application.Application) { // openAI compatible API endpoint // chat - app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) - app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) + 
app.Post("/v1/chat/completions", + openai.ChatEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/chat/completions", + openai.ChatEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // edit - app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig)) - app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig)) + app.Post("/v1/edits", + openai.EditEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/edits", + openai.EditEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // assistant - app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig)) - app.Get("/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig)) - app.Post("/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig)) - app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig)) - app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig)) - app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig)) - app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig)) - app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig)) - app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig)) + app.Get("/v1/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + 
app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // files - app.Post("/v1/files", openai.UploadFilesEndpoint(cl, appConfig)) - app.Post("/files", openai.UploadFilesEndpoint(cl, appConfig)) - app.Get("/v1/files", openai.ListFilesEndpoint(cl, appConfig)) - app.Get("/files", openai.ListFilesEndpoint(cl, appConfig)) - app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(cl, appConfig)) - app.Get("/files/:file_id", openai.GetFilesEndpoint(cl, appConfig)) - app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig)) - app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig)) - app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) - app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) + app.Post("/v1/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Post("/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), 
application.ApplicationConfig())) + app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig())) // completion - app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig)) + app.Post("/v1/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/v1/engines/:model/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // embeddings - app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) + app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // audio - app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig)) - app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig)) + app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/audio/speech", localai.TTSEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // images - app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig)) + app.Post("/v1/images/generations", openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) - if appConfig.ImageDir != "" { - app.Static("/generated-images", appConfig.ImageDir) + if application.ApplicationConfig().ImageDir != "" { + app.Static("/generated-images", application.ApplicationConfig().ImageDir) } - if appConfig.AudioDir != "" { - app.Static("/generated-audio", appConfig.AudioDir) + if application.ApplicationConfig().AudioDir != "" { + app.Static("/generated-audio", application.ApplicationConfig().AudioDir) } // List models - app.Get("/v1/models", openai.ListModelsEndpoint(cl, ml)) - app.Get("/models", openai.ListModelsEndpoint(cl, ml)) + app.Get("/v1/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader())) + app.Get("/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader())) } diff --git a/go.mod b/go.mod index 3bc625ac..e9bcf3ec 100644 --- a/go.mod +++ b/go.mod @@ 
-76,6 +76,7 @@ require ( cloud.google.com/go/auth v0.4.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect cloud.google.com/go/compute/metadata v0.3.0 // indirect + github.com/dustin/go-humanize v1.0.1 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect github.com/fasthttp/websocket v1.5.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect @@ -84,8 +85,12 @@ require ( github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.4 // indirect + github.com/json-iterator/go v1.1.12 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/nikolalohinski/gonja/v2 v2.3.2 // indirect github.com/pion/datachannel v1.5.8 // indirect github.com/pion/dtls/v2 v2.2.12 // indirect github.com/pion/ice/v2 v2.3.34 // indirect diff --git a/go.sum b/go.sum index 11b87fa9..f1628f7a 100644 --- a/go.sum +++ b/go.sum @@ -140,6 +140,8 @@ github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elastic/gosigar v0.12.0/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= github.com/elastic/gosigar v0.14.3 h1:xwkKwPia+hSfg9GqrCUKYdId102m9qTJIIr7egmK/uo= github.com/elastic/gosigar v0.14.3/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= @@ -268,6 +270,7 @@ github.com/google/go-containerregistry v0.19.2 h1:TannFKE1QSajsP6hPWb5oJNgKe1IKj github.com/google/go-containerregistry v0.19.2/go.mod h1:YCMFNQeeXeLF+dnhhWkqDItx/JSkH01j1Kis4PsjzFI= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= @@ -353,6 +356,8 @@ github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwA github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jtolds/gls v4.20.0+incompatible 
h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= @@ -474,8 +479,12 @@ github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5 github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/mr-tron/base58 v1.1.2/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o= github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= @@ -519,6 +528,9 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= +github.com/nikolalohinski/gonja v1.5.3 h1:GsA+EEaZDZPGJ8JtpeGN78jidhOlxeJROpqMT9fTj9c= +github.com/nikolalohinski/gonja/v2 v2.3.2 h1:UgLFfqi7L9XfX0PEcE4eUpvGojVQL5KhBfJJaBp7ZxY= +github.com/nikolalohinski/gonja/v2 v2.3.2/go.mod h1:1Wcc/5huTu6y36e0sOFR1XQoFlylw3c3H3L5WOz0RDg= github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ= github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= diff --git a/pkg/model/loader.go b/pkg/model/loader.go index b32e3745..d62f52b2 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -9,8 +9,6 @@ import ( "sync" "time" - "github.com/mudler/LocalAI/pkg/templates" - "github.com/mudler/LocalAI/pkg/utils" "github.com/rs/zerolog/log" @@ -23,7 +21,6 @@ type ModelLoader struct { ModelPath string mu sync.Mutex models map[string]*Model - templates *templates.TemplateCache wd *WatchDog } @@ -31,7 +28,6 @@ func NewModelLoader(modelPath string) *ModelLoader { nml := &ModelLoader{ ModelPath: modelPath, models: make(map[string]*Model), - templates: templates.NewTemplateCache(modelPath), } return nml diff --git a/pkg/model/template.go b/pkg/model/template.go deleted file mode 100644 index 3dc850cf..00000000 --- a/pkg/model/template.go +++ /dev/null @@ -1,52 +0,0 @@ -package model - -import ( - "fmt" - - "github.com/mudler/LocalAI/pkg/functions" - "github.com/mudler/LocalAI/pkg/templates" -) - -// Rather than pass an interface{} to the prompt template: -// These are the definitions of all possible variables LocalAI will currently populate for use in a prompt template file -// Please note: Not all of these are 
populated on every endpoint - your template should either be tested for each endpoint you map it to, or tolerant of zero values. -type PromptTemplateData struct { - SystemPrompt string - SuppressSystemPrompt bool // used by chat specifically to indicate that SystemPrompt above should be _ignored_ - Input string - Instruction string - Functions []functions.Function - MessageIndex int -} - -type ChatMessageTemplateData struct { - SystemPrompt string - Role string - RoleName string - FunctionName string - Content string - MessageIndex int - Function bool - FunctionCall interface{} - LastMessage bool -} - -const ( - ChatPromptTemplate templates.TemplateType = iota - ChatMessageTemplate - CompletionPromptTemplate - EditPromptTemplate - FunctionsPromptTemplate -) - -func (ml *ModelLoader) EvaluateTemplateForPrompt(templateType templates.TemplateType, templateName string, in PromptTemplateData) (string, error) { - // TODO: should this check be improved? - if templateType == ChatMessageTemplate { - return "", fmt.Errorf("invalid templateType: ChatMessage") - } - return ml.templates.EvaluateTemplate(templateType, templateName, in) -} - -func (ml *ModelLoader) EvaluateTemplateForChatMessage(templateName string, messageData ChatMessageTemplateData) (string, error) { - return ml.templates.EvaluateTemplate(ChatMessageTemplate, templateName, messageData) -} diff --git a/pkg/model/template_test.go b/pkg/model/template_test.go deleted file mode 100644 index 1142ed0c..00000000 --- a/pkg/model/template_test.go +++ /dev/null @@ -1,197 +0,0 @@ -package model_test - -import ( - . "github.com/mudler/LocalAI/pkg/model" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -const chatML = `<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}} -{{- if .FunctionCall }} - -{{- else if eq .RoleName "tool" }} - -{{- end }} -{{- if .Content}} -{{.Content }} -{{- end }} -{{- if .FunctionCall}} -{{toJson .FunctionCall}} -{{- end }} -{{- if .FunctionCall }} - -{{- else if eq .RoleName "tool" }} - -{{- end }}<|im_end|>` - -const llama3 = `<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> - -{{ if .FunctionCall -}} -Function call: -{{ else if eq .RoleName "tool" -}} -Function response: -{{ end -}} -{{ if .Content -}} -{{.Content -}} -{{ else if .FunctionCall -}} -{{ toJson .FunctionCall -}} -{{ end -}} -<|eot_id|>` - -var llama3TestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ - "user": { - "template": llama3, - "expected": "<|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "user", - RoleName: "user", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "assistant": { - "template": llama3, - "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_call": { - 
"template": llama3, - "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nFunction call:\n{\"function\":\"test\"}<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "", - FunctionCall: map[string]string{"function": "test"}, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_response": { - "template": llama3, - "expected": "<|start_header_id|>tool<|end_header_id|>\n\nFunction response:\nResponse from tool<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "tool", - RoleName: "tool", - Content: "Response from tool", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, -} - -var chatMLTestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ - "user": { - "template": chatML, - "expected": "<|im_start|>user\nA long time ago in a galaxy far, far away...<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "user", - RoleName: "user", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "assistant": { - "template": chatML, - "expected": "<|im_start|>assistant\nA long time ago in a galaxy far, far away...<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_call": { - "template": chatML, - "expected": "<|im_start|>assistant\n\n{\"function\":\"test\"}\n<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "", - FunctionCall: map[string]string{"function": "test"}, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_response": { - "template": chatML, - "expected": "<|im_start|>tool\n\nResponse from tool\n<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "tool", - RoleName: "tool", - Content: "Response from tool", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, -} - -var _ = Describe("Templates", func() { - Context("chat message ChatML", func() { - var modelLoader *ModelLoader - BeforeEach(func() { - modelLoader = NewModelLoader("") - }) - for key := range chatMLTestMatch { - foo := chatMLTestMatch[key] - It("renders correctly `"+key+"`", func() { - templated, err := modelLoader.EvaluateTemplateForChatMessage(foo["template"].(string), foo["data"].(ChatMessageTemplateData)) - Expect(err).ToNot(HaveOccurred()) - Expect(templated).To(Equal(foo["expected"]), templated) - }) - } - }) - Context("chat message llama3", func() { - var modelLoader *ModelLoader - BeforeEach(func() { - modelLoader = NewModelLoader("") - }) - for key := range llama3TestMatch { - foo := llama3TestMatch[key] - It("renders correctly `"+key+"`", func() { - templated, err := modelLoader.EvaluateTemplateForChatMessage(foo["template"].(string), foo["data"].(ChatMessageTemplateData)) - Expect(err).ToNot(HaveOccurred()) - Expect(templated).To(Equal(foo["expected"]), templated) - }) - } - }) -}) diff --git a/pkg/templates/cache.go b/pkg/templates/cache.go index e4801946..1efce660 100644 --- a/pkg/templates/cache.go +++ 
b/pkg/templates/cache.go @@ -11,59 +11,41 @@ import ( "github.com/mudler/LocalAI/pkg/utils" "github.com/Masterminds/sprig/v3" + + "github.com/nikolalohinski/gonja/v2" + "github.com/nikolalohinski/gonja/v2/exec" ) // Keep this in sync with config.TemplateConfig. Is there a more idiomatic way to accomplish this in go? // Technically, order doesn't _really_ matter, but the count must stay in sync, see tests/integration/reflect_test.go type TemplateType int -type TemplateCache struct { - mu sync.Mutex - templatesPath string - templates map[TemplateType]map[string]*template.Template +type templateCache struct { + mu sync.Mutex + templatesPath string + templates map[TemplateType]map[string]*template.Template + jinjaTemplates map[TemplateType]map[string]*exec.Template } -func NewTemplateCache(templatesPath string) *TemplateCache { - tc := &TemplateCache{ - templatesPath: templatesPath, - templates: make(map[TemplateType]map[string]*template.Template), +func newTemplateCache(templatesPath string) *templateCache { + tc := &templateCache{ + templatesPath: templatesPath, + templates: make(map[TemplateType]map[string]*template.Template), + jinjaTemplates: make(map[TemplateType]map[string]*exec.Template), } return tc } -func (tc *TemplateCache) initializeTemplateMapKey(tt TemplateType) { +func (tc *templateCache) initializeTemplateMapKey(tt TemplateType) { if _, ok := tc.templates[tt]; !ok { tc.templates[tt] = make(map[string]*template.Template) } } -func (tc *TemplateCache) EvaluateTemplate(templateType TemplateType, templateName string, in interface{}) (string, error) { - tc.mu.Lock() - defer tc.mu.Unlock() - - tc.initializeTemplateMapKey(templateType) - m, ok := tc.templates[templateType][templateName] - if !ok { - // return "", fmt.Errorf("template not loaded: %s", templateName) - loadErr := tc.loadTemplateIfExists(templateType, templateName) - if loadErr != nil { - return "", loadErr - } - m = tc.templates[templateType][templateName] // ok is not important since we check m on the next line, and wealready checked - } - if m == nil { - return "", fmt.Errorf("failed loading a template for %s", templateName) - } - - var buf bytes.Buffer - - if err := m.Execute(&buf, in); err != nil { - return "", err - } - return buf.String(), nil +func (tc *templateCache) existsInModelPath(s string) bool { + return utils.ExistsInPath(tc.templatesPath, s) } - -func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templateName string) error { +func (tc *templateCache) loadTemplateIfExists(templateType TemplateType, templateName string) error { // Check if the template was already loaded if _, ok := tc.templates[templateType][templateName]; ok { @@ -82,6 +64,51 @@ func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templat return fmt.Errorf("template file outside path: %s", file) } + // can either be a file in the system or a string with the template + if tc.existsInModelPath(modelTemplateFile) { + d, err := os.ReadFile(file) + if err != nil { + return err + } + dat = string(d) + } else { + dat = templateName + } + + // Parse the template + tmpl, err := template.New("prompt").Funcs(sprig.FuncMap()).Parse(dat) + if err != nil { + return err + } + tc.templates[templateType][templateName] = tmpl + + return nil +} + +func (tc *templateCache) initializeJinjaTemplateMapKey(tt TemplateType) { + if _, ok := tc.jinjaTemplates[tt]; !ok { + tc.jinjaTemplates[tt] = make(map[string]*exec.Template) + } +} + +func (tc *templateCache) loadJinjaTemplateIfExists(templateType TemplateType, 
templateName string) error { + // Check if the template was already loaded + if _, ok := tc.jinjaTemplates[templateType][templateName]; ok { + return nil + } + + // Check if the model path exists + // skip any error here - we run anyway if a template does not exist + modelTemplateFile := fmt.Sprintf("%s.tmpl", templateName) + + dat := "" + file := filepath.Join(tc.templatesPath, modelTemplateFile) + + // Security check + if err := utils.VerifyPath(modelTemplateFile, tc.templatesPath); err != nil { + return fmt.Errorf("template file outside path: %s", file) + } + // can either be a file in the system or a string with the template if utils.ExistsInPath(tc.templatesPath, modelTemplateFile) { d, err := os.ReadFile(file) @@ -93,12 +120,65 @@ func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templat dat = templateName } - // Parse the template - tmpl, err := template.New("prompt").Funcs(sprig.FuncMap()).Parse(dat) + tmpl, err := gonja.FromString(dat) if err != nil { return err } - tc.templates[templateType][templateName] = tmpl + tc.jinjaTemplates[templateType][templateName] = tmpl return nil } + +func (tc *templateCache) evaluateJinjaTemplate(templateType TemplateType, templateNameOrContent string, in map[string]interface{}) (string, error) { + tc.mu.Lock() + defer tc.mu.Unlock() + + tc.initializeJinjaTemplateMapKey(templateType) + m, ok := tc.jinjaTemplates[templateType][templateNameOrContent] + if !ok { + // return "", fmt.Errorf("template not loaded: %s", templateName) + loadErr := tc.loadJinjaTemplateIfExists(templateType, templateNameOrContent) + if loadErr != nil { + return "", loadErr + } + m = tc.jinjaTemplates[templateType][templateNameOrContent] // ok is not important since we check m on the next line, and wealready checked + } + if m == nil { + return "", fmt.Errorf("failed loading a template for %s", templateNameOrContent) + } + + var buf bytes.Buffer + + data := exec.NewContext(in) + + if err := m.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + +func (tc *templateCache) evaluateTemplate(templateType TemplateType, templateNameOrContent string, in interface{}) (string, error) { + tc.mu.Lock() + defer tc.mu.Unlock() + + tc.initializeTemplateMapKey(templateType) + m, ok := tc.templates[templateType][templateNameOrContent] + if !ok { + // return "", fmt.Errorf("template not loaded: %s", templateName) + loadErr := tc.loadTemplateIfExists(templateType, templateNameOrContent) + if loadErr != nil { + return "", loadErr + } + m = tc.templates[templateType][templateNameOrContent] // ok is not important since we check m on the next line, and wealready checked + } + if m == nil { + return "", fmt.Errorf("failed loading a template for %s", templateNameOrContent) + } + + var buf bytes.Buffer + + if err := m.Execute(&buf, in); err != nil { + return "", err + } + return buf.String(), nil +} diff --git a/pkg/templates/cache_test.go b/pkg/templates/cache_test.go deleted file mode 100644 index 8bb50766..00000000 --- a/pkg/templates/cache_test.go +++ /dev/null @@ -1,73 +0,0 @@ -package templates_test - -import ( - "os" - "path/filepath" - - "github.com/mudler/LocalAI/pkg/templates" // Update with your module path - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" -) - -var _ = Describe("TemplateCache", func() { - var ( - templateCache *templates.TemplateCache - tempDir string - ) - - BeforeEach(func() { - var err error - tempDir, err = os.MkdirTemp("", "templates") - Expect(err).NotTo(HaveOccurred()) - - // Writing example template files - err = os.WriteFile(filepath.Join(tempDir, "example.tmpl"), []byte("Hello, {{.Name}}!"), 0600) - Expect(err).NotTo(HaveOccurred()) - err = os.WriteFile(filepath.Join(tempDir, "empty.tmpl"), []byte(""), 0600) - Expect(err).NotTo(HaveOccurred()) - - templateCache = templates.NewTemplateCache(tempDir) - }) - - AfterEach(func() { - os.RemoveAll(tempDir) // Clean up - }) - - Describe("EvaluateTemplate", func() { - Context("when template is loaded successfully", func() { - It("should evaluate the template correctly", func() { - result, err := templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - Expect(err).NotTo(HaveOccurred()) - Expect(result).To(Equal("Hello, Gopher!")) - }) - }) - - Context("when template isn't a file", func() { - It("should parse from string", func() { - result, err := templateCache.EvaluateTemplate(1, "{{.Name}}", map[string]string{"Name": "Gopher"}) - Expect(err).ToNot(HaveOccurred()) - Expect(result).To(Equal("Gopher")) - }) - }) - - Context("when template is empty", func() { - It("should return an empty string", func() { - result, err := templateCache.EvaluateTemplate(1, "empty", nil) - Expect(err).NotTo(HaveOccurred()) - Expect(result).To(Equal("")) - }) - }) - }) - - Describe("concurrency", func() { - It("should handle multiple concurrent accesses", func(done Done) { - go func() { - _, _ = templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - }() - go func() { - _, _ = templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - }() - close(done) - }, 0.1) // timeout in seconds - }) -}) diff --git a/pkg/templates/evaluator.go b/pkg/templates/evaluator.go new file mode 100644 index 00000000..aedf7b41 --- /dev/null +++ b/pkg/templates/evaluator.go @@ -0,0 +1,295 @@ +package templates + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/functions" + "github.com/rs/zerolog/log" +) + +// Rather than pass an interface{} to the prompt template: +// These are the definitions of all possible variables LocalAI will currently populate for use in a prompt template file +// Please note: Not all of these are populated on every endpoint - your template should either be tested for each endpoint you map it to, or tolerant of zero values. 
+type PromptTemplateData struct { + SystemPrompt string + SuppressSystemPrompt bool // used by chat specifically to indicate that SystemPrompt above should be _ignored_ + Input string + Instruction string + Functions []functions.Function + MessageIndex int +} + +type ChatMessageTemplateData struct { + SystemPrompt string + Role string + RoleName string + FunctionName string + Content string + MessageIndex int + Function bool + FunctionCall interface{} + LastMessage bool +} + +const ( + ChatPromptTemplate TemplateType = iota + ChatMessageTemplate + CompletionPromptTemplate + EditPromptTemplate + FunctionsPromptTemplate +) + +type Evaluator struct { + cache *templateCache +} + +func NewEvaluator(modelPath string) *Evaluator { + return &Evaluator{ + cache: newTemplateCache(modelPath), + } +} + +func (e *Evaluator) EvaluateTemplateForPrompt(templateType TemplateType, config config.BackendConfig, in PromptTemplateData) (string, error) { + template := "" + + // A model can have a "file.bin.tmpl" file associated with a prompt template prefix + if e.cache.existsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { + template = config.Model + } + + switch templateType { + case CompletionPromptTemplate: + if config.TemplateConfig.Completion != "" { + template = config.TemplateConfig.Completion + } + case EditPromptTemplate: + if config.TemplateConfig.Edit != "" { + template = config.TemplateConfig.Edit + } + case ChatPromptTemplate: + if config.TemplateConfig.Chat != "" { + template = config.TemplateConfig.Chat + } + case FunctionsPromptTemplate: + if config.TemplateConfig.Functions != "" { + template = config.TemplateConfig.Functions + } + } + + if template == "" { + return in.Input, nil + } + + if config.TemplateConfig.JinjaTemplate { + return e.evaluateJinjaTemplateForPrompt(templateType, template, in) + } + + return e.cache.evaluateTemplate(templateType, template, in) +} + +func (e *Evaluator) evaluateTemplateForChatMessage(templateName string, messageData ChatMessageTemplateData) (string, error) { + return e.cache.evaluateTemplate(ChatMessageTemplate, templateName, messageData) +} + +func (e *Evaluator) templateJinjaChat(templateName string, messageData []ChatMessageTemplateData, funcs []functions.Function) (string, error) { + + conversation := make(map[string]interface{}) + messages := make([]map[string]interface{}, len(messageData)) + + // convert from ChatMessageTemplateData to what the jinja template expects + + for _, message := range messageData { + // TODO: this seems to cover minimum text templates. 
Can be expanded to cover more complex interactions + var data []byte + data, _ = json.Marshal(message.FunctionCall) + messages = append(messages, map[string]interface{}{ + "role": message.RoleName, + "content": message.Content, + "tool_call": string(data), + }) + } + + conversation["messages"] = messages + + // if tools are detected, add these + if len(funcs) > 0 { + conversation["tools"] = funcs + } + + return e.cache.evaluateJinjaTemplate(ChatMessageTemplate, templateName, conversation) +} + +func (e *Evaluator) evaluateJinjaTemplateForPrompt(templateType TemplateType, templateName string, in PromptTemplateData) (string, error) { + + conversation := make(map[string]interface{}) + + conversation["system_prompt"] = in.SystemPrompt + conversation["content"] = in.Input + + return e.cache.evaluateJinjaTemplate(templateType, templateName, conversation) +} + +func (e *Evaluator) TemplateMessages(messages []schema.Message, config *config.BackendConfig, funcs []functions.Function, shouldUseFn bool) string { + + if config.TemplateConfig.JinjaTemplate { + var messageData []ChatMessageTemplateData + for messageIndex, i := range messages { + fcall := i.FunctionCall + if len(i.ToolCalls) > 0 { + fcall = i.ToolCalls + } + messageData = append(messageData, ChatMessageTemplateData{ + SystemPrompt: config.SystemPrompt, + Role: config.Roles[i.Role], + RoleName: i.Role, + Content: i.StringContent, + FunctionCall: fcall, + FunctionName: i.Name, + LastMessage: messageIndex == (len(messages) - 1), + Function: config.Grammar != "" && (messageIndex == (len(messages) - 1)), + MessageIndex: messageIndex, + }) + } + + templatedInput, err := e.templateJinjaChat(config.TemplateConfig.ChatMessage, messageData, funcs) + if err == nil { + return templatedInput + } + } + + var predInput string + suppressConfigSystemPrompt := false + mess := []string{} + for messageIndex, i := range messages { + var content string + role := i.Role + + // if function call, we might want to customize the role so we can display better that the "assistant called a json action" + // if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request + if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" { + roleFn := "assistant_function_call" + r := config.Roles[roleFn] + if r != "" { + role = roleFn + } + } + r := config.Roles[role] + contentExists := i.Content != nil && i.StringContent != "" + + fcall := i.FunctionCall + if len(i.ToolCalls) > 0 { + fcall = i.ToolCalls + } + + // First attempt to populate content via a chat message specific template + if config.TemplateConfig.ChatMessage != "" { + chatMessageData := ChatMessageTemplateData{ + SystemPrompt: config.SystemPrompt, + Role: r, + RoleName: role, + Content: i.StringContent, + FunctionCall: fcall, + FunctionName: i.Name, + LastMessage: messageIndex == (len(messages) - 1), + Function: config.Grammar != "" && (messageIndex == (len(messages) - 1)), + MessageIndex: messageIndex, + } + templatedChatMessage, err := e.evaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData) + if err != nil { + log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping") + } else { + if templatedChatMessage == "" { + log.Warn().Msgf("template \"%s\" produced blank output for %+v. 
Skipping!", config.TemplateConfig.ChatMessage, chatMessageData) + continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf + } + log.Debug().Msgf("templated message for chat: %s", templatedChatMessage) + content = templatedChatMessage + } + } + + marshalAnyRole := func(f any) { + j, err := json.Marshal(f) + if err == nil { + if contentExists { + content += "\n" + fmt.Sprint(r, " ", string(j)) + } else { + content = fmt.Sprint(r, " ", string(j)) + } + } + } + marshalAny := func(f any) { + j, err := json.Marshal(f) + if err == nil { + if contentExists { + content += "\n" + string(j) + } else { + content = string(j) + } + } + } + // If this model doesn't have such a template, or if that template fails to return a value, template at the message level. + if content == "" { + if r != "" { + if contentExists { + content = fmt.Sprint(r, i.StringContent) + } + + if i.FunctionCall != nil { + marshalAnyRole(i.FunctionCall) + } + if i.ToolCalls != nil { + marshalAnyRole(i.ToolCalls) + } + } else { + if contentExists { + content = fmt.Sprint(i.StringContent) + } + if i.FunctionCall != nil { + marshalAny(i.FunctionCall) + } + if i.ToolCalls != nil { + marshalAny(i.ToolCalls) + } + } + // Special Handling: System. We care if it was printed at all, not the r branch, so check seperately + if contentExists && role == "system" { + suppressConfigSystemPrompt = true + } + } + + mess = append(mess, content) + } + + joinCharacter := "\n" + if config.TemplateConfig.JoinChatMessagesByCharacter != nil { + joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter + } + + predInput = strings.Join(mess, joinCharacter) + log.Debug().Msgf("Prompt (before templating): %s", predInput) + + promptTemplate := ChatPromptTemplate + + if config.TemplateConfig.Functions != "" && shouldUseFn { + promptTemplate = FunctionsPromptTemplate + } + + templatedInput, err := e.EvaluateTemplateForPrompt(promptTemplate, *config, PromptTemplateData{ + SystemPrompt: config.SystemPrompt, + SuppressSystemPrompt: suppressConfigSystemPrompt, + Input: predInput, + Functions: funcs, + }) + if err == nil { + predInput = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", predInput) + } else { + log.Debug().Msgf("Template failed loading: %s", err.Error()) + } + + return predInput +} diff --git a/pkg/templates/evaluator_test.go b/pkg/templates/evaluator_test.go new file mode 100644 index 00000000..b58dd40b --- /dev/null +++ b/pkg/templates/evaluator_test.go @@ -0,0 +1,253 @@ +package templates_test + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/functions" + . "github.com/mudler/LocalAI/pkg/templates" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +const toolCallJinja = `{{ '<|begin_of_text|>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|> + +' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|> + +' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}` + +const chatML = `<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}} +{{- if .FunctionCall }} + +{{- else if eq .RoleName "tool" }} + +{{- end }} +{{- if .Content}} +{{.Content }} +{{- end }} +{{- if .FunctionCall}} +{{toJson .FunctionCall}} +{{- end }} +{{- if .FunctionCall }} + +{{- else if eq .RoleName "tool" }} + +{{- end }}<|im_end|>` + +const llama3 = `<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> + +{{ if .FunctionCall -}} +Function call: +{{ else if eq .RoleName "tool" -}} +Function response: +{{ end -}} +{{ if .Content -}} +{{.Content -}} +{{ else if .FunctionCall -}} +{{ toJson .FunctionCall -}} +{{ end -}} +<|eot_id|>` + +var llama3TestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, + "assistant": { + "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + "shouldUseFn": false, + }, + "function_call": { + + "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nFunction call:\n{\"function\":\"test\"}<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + FunctionCall: map[string]string{"function": "test"}, + }, + }, + "shouldUseFn": false, + }, + "function_response": { + "expected": "<|start_header_id|>tool<|end_header_id|>\n\nFunction response:\nResponse from tool<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "tool", + StringContent: "Response from tool", + }, + }, + "shouldUseFn": false, + }, +} + +var chatMLTestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|im_start|>user\nA long time ago in a galaxy far, far 
away...<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, + "assistant": { + "expected": "<|im_start|>assistant\nA long time ago in a galaxy far, far away...<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + "shouldUseFn": false, + }, + "function_call": { + "expected": "<|im_start|>assistant\n\n{\"function\":\"test\"}\n<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{ + { + Name: "test", + Description: "test", + Parameters: nil, + }, + }, + "shouldUseFn": true, + "messages": []schema.Message{ + { + Role: "assistant", + FunctionCall: map[string]string{"function": "test"}, + }, + }, + }, + "function_response": { + "expected": "<|im_start|>tool\n\nResponse from tool\n<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "tool", + StringContent: "Response from tool", + }, + }, + }, +} + +var jinjaTest map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: toolCallJinja, + JinjaTemplate: true, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, +} +var _ = Describe("Templates", func() { + Context("chat message ChatML", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range chatMLTestMatch { + foo := chatMLTestMatch[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) + Context("chat message llama3", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range llama3TestMatch { + foo := llama3TestMatch[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) + Context("chat message jinja", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range jinjaTest { + foo := jinjaTest[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), 
foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) +}) From f943c4b803b99efc587ff126d3766a6ada19db20 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 8 Dec 2024 17:53:36 +0100 Subject: [PATCH 45/89] Revert "feat: include tokens usage for streamed output" (#4336) Revert "feat: include tokens usage for streamed output (#4282)" This reverts commit 0d6c3a7d57101428aec4100d0f7bca765ee684a7. --- core/backend/llm.go | 12 ++---------- core/http/endpoints/openai/chat.go | 9 +-------- pkg/grpc/backend.go | 2 +- pkg/grpc/client.go | 6 +++--- pkg/grpc/embed.go | 6 +++--- 5 files changed, 10 insertions(+), 25 deletions(-) diff --git a/core/backend/llm.go b/core/backend/llm.go index 9e121f79..4491a191 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -117,12 +117,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im ss := "" var partialRune []byte - err := inferenceModel.PredictStream(ctx, opts, func(reply *proto.Reply) { - msg := reply.GetMessage() - partialRune = append(partialRune, msg...) - - tokenUsage.Prompt = int(reply.PromptTokens) - tokenUsage.Completion = int(reply.Tokens) + err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) { + partialRune = append(partialRune, chars...) for len(partialRune) > 0 { r, size := utf8.DecodeRune(partialRune) @@ -136,10 +132,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im partialRune = partialRune[size:] } - - if len(msg) == 0 { - tokenCallback("", tokenUsage) - } }) return LLMResponse{ Response: ss, diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 21e71d35..c2b201bd 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -41,15 +41,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat responses <- initialMessage ComputeChoices(req, s, config, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool { - choices := []schema.Choice{} - if s != "" { - choices = append(choices, schema.Choice{Delta: &schema.Message{Content: &s}, Index: 0}) - } resp := schema.OpenAIResponse{ ID: id, Created: created, Model: req.Model, // we have to return what the user sent here, due to OpenAI spec. 
- Choices: choices, + Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}}, Object: "chat.completion.chunk", Usage: schema.OpenAIUsage{ PromptTokens: usage.Prompt, @@ -333,9 +329,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat toolsCalled := false for ev := range responses { usage = &ev.Usage // Copy a pointer to the latest usage chunk so that the stop message can reference it - if len(ev.Choices) == 0 { - break - } if len(ev.Choices[0].Delta.ToolCalls) > 0 { toolsCalled = true } diff --git a/pkg/grpc/backend.go b/pkg/grpc/backend.go index fabc0268..21435891 100644 --- a/pkg/grpc/backend.go +++ b/pkg/grpc/backend.go @@ -37,7 +37,7 @@ type Backend interface { Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.Reply, error) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grpc.CallOption) (*pb.Result, error) - PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error + PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error) diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index ca207c3f..9c8b302e 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -136,7 +136,7 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp return client.LoadModel(ctx, in, opts...) 
} -func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error { +func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error { if !c.parallel { c.opMutex.Lock() defer c.opMutex.Unlock() @@ -158,7 +158,7 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun } for { - reply, err := stream.Recv() + feature, err := stream.Recv() if err == io.EOF { break } @@ -167,7 +167,7 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun return err } - f(reply) + f(feature.GetMessage()) } return nil diff --git a/pkg/grpc/embed.go b/pkg/grpc/embed.go index 79648c5a..a5828a5f 100644 --- a/pkg/grpc/embed.go +++ b/pkg/grpc/embed.go @@ -35,7 +35,7 @@ func (e *embedBackend) LoadModel(ctx context.Context, in *pb.ModelOptions, opts return e.s.LoadModel(ctx, in) } -func (e *embedBackend) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(reply *pb.Reply), opts ...grpc.CallOption) error { +func (e *embedBackend) PredictStream(ctx context.Context, in *pb.PredictOptions, f func(s []byte), opts ...grpc.CallOption) error { bs := &embedBackendServerStream{ ctx: ctx, fn: f, @@ -97,11 +97,11 @@ func (e *embedBackend) GetTokenMetrics(ctx context.Context, in *pb.MetricsReques type embedBackendServerStream struct { ctx context.Context - fn func(reply *pb.Reply) + fn func(s []byte) } func (e *embedBackendServerStream) Send(reply *pb.Reply) error { - e.fn(reply) + e.fn(reply.GetMessage()) return nil } From a0fe05005586353844e7704c2b87c6f55a7240c8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 8 Dec 2024 18:01:16 +0100 Subject: [PATCH 46/89] chore(model gallery): add mn-chunky-lotus-12b (#4337) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index c94358b6..43f34430 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4410,6 +4410,30 @@ - filename: Chatty-Harry_V3.0.Q4_K_M.gguf sha256: 54b63bb74498576ca77b801ed096657a93cc2f6b71d707c3605fdb394bd3e622 uri: huggingface://QuantFactory/Chatty-Harry_V3.0-GGUF/Chatty-Harry_V3.0.Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "mn-chunky-lotus-12b" + icon: https://huggingface.co/FallenMerick/MN-Chunky-Lotus-12B/resolve/main/chunky-lotus.jpg + urls: + - https://huggingface.co/QuantFactory/MN-Chunky-Lotus-12B-GGUF + description: | + I had originally planned to use this model for future/further merges, but decided to go ahead and release it since it scored rather high on my local EQ Bench testing (79.58 w/ 100% parsed @ 8-bit). + Bear in mind that most models tend to score a bit higher on my own local tests as compared to their posted scores. Still, it's the highest score I've personally seen from all the models I've tested. + It's a decent model, with great emotional intelligence and acceptable adherence to various character personalities. It does a good job at roleplaying despite being a bit bland at times. + + Overall, I like the way it writes, but it has a few formatting issues that show up from time to time, and it has an uncommon tendency to paste walls of character feelings/intentions at the end of some outputs without any prompting. This is something I hope to correct with future iterations. + This is a merge of pre-trained language models created using mergekit.
+ The following models were included in the merge: + Epiculous/Violet_Twilight-v0.2 + nbeerbower/mistral-nemo-gutenberg-12B-v4 + flammenai/Mahou-1.5-mistral-nemo-12B + overrides: + parameters: + model: MN-Chunky-Lotus-12B.Q4_K_M.gguf + files: + - filename: MN-Chunky-Lotus-12B.Q4_K_M.gguf + sha256: 363defe0a769fdb715dab75517966a0a80bcdd981a610d4c759099b6c8ff143a + uri: huggingface://QuantFactory/MN-Chunky-Lotus-12B-GGUF/MN-Chunky-Lotus-12B.Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From 61839efed2d15c3c223f5b7a5802f55a28ced45b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 8 Dec 2024 18:01:25 +0100 Subject: [PATCH 47/89] chore(model gallery): add virtuoso-small (#4338) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 43f34430..4a307b88 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1838,6 +1838,20 @@ - filename: Math-IIO-7B-Instruct.Q4_K_M.gguf sha256: 8ffda0b6a43eb9997dfd7db48fe3bd0970fd1b9b86fb68f082c38622a48b58f4 uri: huggingface://QuantFactory/Math-IIO-7B-Instruct-GGUF/Math-IIO-7B-Instruct.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "virtuoso-small" + icon: https://i.ibb.co/pXD6Bcv/SW2-U-g-QQLSH1-ZAbxhs-Iu-A.webp + urls: + - https://huggingface.co/arcee-ai/Virtuoso-Small-GGUF + description: | + Virtuoso-Small is the debut public release of the Virtuoso series of models by Arcee.ai, designed to bring cutting-edge generative AI capabilities to organizations and developers in a compact, efficient form. With 14 billion parameters, Virtuoso-Small is an accessible entry point for high-quality instruction-following, complex reasoning, and business-oriented generative AI tasks. + overrides: + parameters: + model: Virtuoso-Small-Q4_K_M.gguf + files: + - filename: Virtuoso-Small-Q4_K_M.gguf + sha256: 07db215cdfcb05036567017fe20e50e60cb2da28d1f9a8251cc4f18c8caa247f + uri: huggingface://arcee-ai/Virtuoso-Small-GGUF/Virtuoso-Small-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From e147f1bd3eddbe4f8a24fec1a0b293fff5db2ad4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 8 Dec 2024 18:43:26 +0100 Subject: [PATCH 48/89] chore(model gallery): add bio-medical-llama-3-8b (#4339) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 4a307b88..b3ed3f90 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -8129,6 +8129,23 @@ - filename: RP-Naughty-v1.0c-8b.Q4_K_M.gguf sha256: c344564d26d0c3d244d31cfeb103666eab37f9dee6678a2dbaf5bfcf4109d789 uri: huggingface://QuantFactory/RP-Naughty-v1.0c-8b-GGUF/RP-Naughty-v1.0c-8b.Q4_K_M.gguf +- !!merge <<: *llama3 + name: "bio-medical-llama-3-8b" + icon: https://cdn-uploads.huggingface.co/production/uploads/653f5b93cd52f288490edc83/zPMUugzfOiwTiRw88jm7T.jpeg + urls: + - https://huggingface.co/ContactDoctor/Bio-Medical-Llama-3-8B + - https://huggingface.co/QuantFactory/Bio-Medical-Llama-3-8B-GGUF + description: | + Bio-Medical-Llama-3-8B model is a specialized large language model designed for biomedical applications. It is finetuned from the meta-llama/Meta-Llama-3-8B-Instruct model using a custom dataset containing over 500,000 diverse entries. These entries include a mix of synthetic and manually curated data, ensuring high quality and broad coverage of biomedical topics. 
+ + The model is trained to understand and generate text related to various biomedical fields, making it a valuable tool for researchers, clinicians, and other professionals in the biomedical domain. + overrides: + parameters: + model: Bio-Medical-Llama-3-8B.Q4_K_M.gguf + files: + - filename: Bio-Medical-Llama-3-8B.Q4_K_M.gguf + sha256: 672939e0487d02c55734132c25a59f26e4deaac7cd49445a7028f2291139edcc + uri: huggingface://QuantFactory/Bio-Medical-Llama-3-8B-GGUF/Bio-Medical-Llama-3-8B.Q4_K_M.gguf - &command-R ### START Command-r url: "github:mudler/LocalAI/gallery/command-r.yaml@master" From b5a21202ed81cf90dd59c7fee18b656173557148 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 8 Dec 2024 23:54:06 +0100 Subject: [PATCH 49/89] chore: :arrow_up: Update ggerganov/llama.cpp to `e52522b8694ae73abf12feb18d29168674aa1c1b` (#4342) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 786de811..c499119a 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=3573fa8e7b7f0865638b52b4e9b4d2006f0558a2 +CPPLLAMA_VERSION?=e52522b8694ae73abf12feb18d29168674aa1c1b # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From fb17e737f0dc4d176a1d7e6845453cb6ecd4e95c Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:19:25 +0100 Subject: [PATCH 50/89] docs: :arrow_up: update docs version mudler/LocalAI (#4341) :arrow_up: Update docs version mudler/LocalAI Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- docs/data/version.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/data/version.json b/docs/data/version.json index bb7517a1..f6462f81 100644 --- a/docs/data/version.json +++ b/docs/data/version.json @@ -1,3 +1,3 @@ { - "version": "v2.24.0" + "version": "v2.24.1" } From a9c0dd3a1e12841ed08b722f1c7e739f967afffa Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 9 Dec 2024 10:24:15 +0100 Subject: [PATCH 51/89] chore(model gallery): add qwen2.5-7b-homeranvita-nerdmix (#4343) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b3ed3f90..e163f72d 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1852,6 +1852,20 @@ - filename: Virtuoso-Small-Q4_K_M.gguf sha256: 07db215cdfcb05036567017fe20e50e60cb2da28d1f9a8251cc4f18c8caa247f uri: huggingface://arcee-ai/Virtuoso-Small-GGUF/Virtuoso-Small-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-7b-homeranvita-nerdmix" + urls: + - https://huggingface.co/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix + - https://huggingface.co/QuantFactory/Qwen2.5-7B-HomerAnvita-NerdMix-GGUF + description: | + ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix is an advanced language model meticulously crafted by merging five pre-trained models using the powerful mergekit framework. 
This fusion leverages the Model Stock merge method to combine the creative prowess of Qandora, the instructive capabilities of Qwen-Instruct-Fusion, the sophisticated blending of HomerSlerp1, the mathematical precision of Cybertron-MGS, and the uncensored expertise of Qwen-Nerd. The resulting model excels in creative text generation, contextual understanding, technical reasoning, and dynamic conversational interactions. + overrides: + parameters: + model: Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf + files: + - filename: Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf + sha256: 73db2ca3ab50e8627352078988cd173e7447c5e8199a7db9e554602da1362e5f + uri: huggingface://QuantFactory/Qwen2.5-7B-HomerAnvita-NerdMix-GGUF/Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 5eceb5f67ced52e51dd485c72c21eeb8cc8b43b6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 9 Dec 2024 10:24:30 +0100 Subject: [PATCH 52/89] chore(model gallery): add impish_mind_8b (#4344) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index e163f72d..2f2f4c1b 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3628,6 +3628,25 @@ - filename: Loki-v2.6-8b-1024k.Q4_K_M.gguf sha256: 9b15c1fee0a0e6d6ed97df3d1b6fc8f774e6e1bd388328599e731c62e0f19d81 uri: huggingface://QuantFactory/Loki-v2.6-8b-1024k-GGUF/Loki-v2.6-8b-1024k.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "impish_mind_8b" + icon: https://huggingface.co/SicariusSicariiStuff/Impish_Mind_8B/resolve/main/Images/Impish_Mind.png + urls: + - https://huggingface.co/SicariusSicariiStuff/Impish_Mind_8B + - https://huggingface.co/bartowski/Impish_Mind_8B-GGUF + description: | + This model was trained with new data and a new approach (compared to my other models). While it may be a bit more censored, it is expected to be significantly smarter. The data used is quite unique, and also features long and complex markdown datasets. + + Regarding censorship: Whether uncensoring or enforcing strict censorship, the model tends to lose some of its intelligence. The use of toxic data was kept to a minimum with this model. + + Consequently, the model is likely to refuse some requests; this is easily avoidable with a basic system prompt, or assistant impersonation ("Sure thing!..."). Unlike many RP models, this one is designed to excel at general assistant tasks as well.
+ overrides: + parameters: + model: Impish_Mind_8B-Q4_K_M.gguf + files: + - filename: Impish_Mind_8B-Q4_K_M.gguf + sha256: 918f82bcb893c75fa2e846156df7bd3ce359464b960e32ae9171035ee14e7c51 + uri: huggingface://bartowski/Impish_Mind_8B-GGUF/Impish_Mind_8B-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From f45d6c746ad5012cb5406c977608d9ad081c35b4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 9 Dec 2024 15:58:29 +0100 Subject: [PATCH 53/89] chore(model gallery): add tulu-3.1-8b-supernova-smart (#4347) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 2f2f4c1b..f57b80c6 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3647,6 +3647,20 @@ - filename: Impish_Mind_8B-Q4_K_M.gguf sha256: 918f82bcb893c75fa2e846156df7bd3ce359464b960e32ae9171035ee14e7c51 uri: huggingface://bartowski/Impish_Mind_8B-GGUF/Impish_Mind_8B-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "tulu-3.1-8b-supernova-smart" + urls: + - https://huggingface.co/bunnycore/Tulu-3.1-8B-SuperNova-Smart + - https://huggingface.co/QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF + description: | + This model was merged using the passthrough merge method using bunnycore/Tulu-3.1-8B-SuperNova + bunnycore/Llama-3.1-8b-smart-lora as a base. + overrides: + parameters: + model: Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf + files: + - filename: Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf + sha256: 4b8ba9e64f0667199eee2dcc769f1a90aa9c7730165d42f440fdf107c7585c63 + uri: huggingface://QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF/Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From a03a9b9e51ef95e598b9108bc6da593d1619ab5f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 20:09:26 +0000 Subject: [PATCH 54/89] chore(deps): Bump docs/themes/hugo-theme-relearn from `be85052` to `bd1f3d3` (#4348) chore(deps): Bump docs/themes/hugo-theme-relearn Bumps [docs/themes/hugo-theme-relearn](https://github.com/McShelby/hugo-theme-relearn) from `be85052` to `bd1f3d3`. - [Release notes](https://github.com/McShelby/hugo-theme-relearn/releases) - [Commits](https://github.com/McShelby/hugo-theme-relearn/compare/be85052efea3a0aaef45ecb0126d390c1bbac760...bd1f3d3432632c61bb12e7ec0f7673fed0289f19) --- updated-dependencies: - dependency-name: docs/themes/hugo-theme-relearn dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/themes/hugo-theme-relearn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/themes/hugo-theme-relearn b/docs/themes/hugo-theme-relearn index be85052e..bd1f3d34 160000 --- a/docs/themes/hugo-theme-relearn +++ b/docs/themes/hugo-theme-relearn @@ -1 +1 @@ -Subproject commit be85052efea3a0aaef45ecb0126d390c1bbac760 +Subproject commit bd1f3d3432632c61bb12e7ec0f7673fed0289f19 From 885118e863e24253b88bb3751f1963e8c34043de Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:10:58 +0100 Subject: [PATCH 55/89] chore: :arrow_up: Update ggerganov/llama.cpp to `26a8406ba9198eb6fdd8329fa717555b4f77f05f` (#4353) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c499119a..f9a1a2db 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=e52522b8694ae73abf12feb18d29168674aa1c1b +CPPLLAMA_VERSION?=26a8406ba9198eb6fdd8329fa717555b4f77f05f # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 3aff87a5cfae23f1c1f40b162f1745fe018b98b1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 10 Dec 2024 09:42:24 +0100 Subject: [PATCH 56/89] chore(model gallery): add qwen2.5-math-14b-instruct (#4355) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index f57b80c6..08ef8bcb 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1866,6 +1866,21 @@ - filename: Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf sha256: 73db2ca3ab50e8627352078988cd173e7447c5e8199a7db9e554602da1362e5f uri: huggingface://QuantFactory/Qwen2.5-7B-HomerAnvita-NerdMix-GGUF/Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-math-14b-instruct" + urls: + - https://huggingface.co/qingy2024/Qwen2.5-Math-14B-Instruct-Preview + - https://huggingface.co/QuantFactory/Qwen2.5-Math-14B-Instruct-GGUF + description: | + This Qwen 2.5 model was trained 2x faster with Unsloth and Huggingface's TRL library. + Fine-tuned it for 400 steps on garage-bAInd/Open-Platypus with a batch size of 3. 
+  overrides:
+    parameters:
+      model: Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf
+  files:
+    - filename: Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf
+      sha256: 14e672394738a7d9f14a6cb16fd9a649b113a19a8b4934f9c18299fc4e286ab6
+      uri: huggingface://QuantFactory/Qwen2.5-Math-14B-Instruct-GGUF/Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf
 - &archfunct
   license: apache-2.0
   tags:

From 272763f625a6db7e064504074677d77676d7e941 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 10 Dec 2024 09:42:37 +0100
Subject: [PATCH 57/89] chore(model gallery): add intellect-1-instruct (#4356)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 08ef8bcb..a73500d7 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,29 @@
 ---
+- &intellect1
+  name: "intellect-1-instruct"
+  url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
+  icon: https://huggingface.co/PrimeIntellect/INTELLECT-1-Instruct/resolve/main/intellect-1-map.png
+  urls:
+    - https://huggingface.co/PrimeIntellect/INTELLECT-1-Instruct
+    - https://huggingface.co/bartowski/INTELLECT-1-Instruct-GGUF
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - cpu
+    - intellect
+  license: apache-2.0
+  description: |
+    INTELLECT-1 is the first collaboratively trained 10 billion parameter language model trained from scratch on 1 trillion tokens of English text and code.
+    This is an instruct model. The base model associated with it is INTELLECT-1.
+    INTELLECT-1 was trained on up to 14 concurrent nodes distributed across 3 continents, with contributions from 30 independent community contributors providing compute. The training code utilizes the prime framework, a scalable distributed training framework designed for fault-tolerant, dynamically scaling, high-performance training on unreliable, globally distributed workers. The key abstraction that allows dynamic scaling is the ElasticDeviceMesh, which manages dynamic global process groups for fault-tolerant communication across the internet and local process groups for communication within a node. The model was trained using the DiLoCo algorithms with 100 inner steps. The global all-reduce was done with custom int8 all-reduce kernels to reduce the communication payload required, greatly reducing the communication overhead by a factor of 400x.
+ overrides: + parameters: + model: INTELLECT-1-Instruct-Q4_K_M.gguf + files: + - filename: INTELLECT-1-Instruct-Q4_K_M.gguf + sha256: 5df236fe570e5998d07fb3207788eac811ef3b77dd2a0ad04a2ef5c6361f3030 + uri: huggingface://bartowski/INTELLECT-1-Instruct-GGUF/INTELLECT-1-Instruct-Q4_K_M.gguf - &llama33 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From de1ddb8ba69ed6c55ba01d06f15572e1423dd8f7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 10 Dec 2024 09:42:47 +0100 Subject: [PATCH 58/89] chore(model gallery): add b-nimita-l3-8b-v0.02 (#4357) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index a73500d7..b8eedc53 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3701,6 +3701,20 @@ - filename: Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf sha256: 4b8ba9e64f0667199eee2dcc769f1a90aa9c7730165d42f440fdf107c7585c63 uri: huggingface://QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF/Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "b-nimita-l3-8b-v0.02" + urls: + - https://huggingface.co/Arkana08/B-NIMITA-L3-8B-v0.02 + - https://huggingface.co/QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF + description: | + B-NIMITA is an AI model designed to bring role-playing scenarios to life with emotional depth and rich storytelling. At its core is NIHAPPY, providing a solid narrative foundation and contextual consistency. This is enhanced by Mythorica, which adds vivid emotional arcs and expressive dialogue, and V-Blackroot, ensuring character consistency and subtle adaptability. This combination allows B-NIMITA to deliver dynamic, engaging interactions that feel natural and immersive. 
+ overrides: + parameters: + model: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf + files: + - filename: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf + sha256: 625a54848dcd3f23bc06b639a7dfecae14142b5d177dd45acfe7724816bab4cd + uri: huggingface://QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF/B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From b74a936178a97d6944d5fff73cd193b691b6c06e Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Tue, 10 Dec 2024 22:45:42 +0100 Subject: [PATCH 59/89] chore: :arrow_up: Update ggerganov/llama.cpp to `dafae66cc242eb766797194d3c85c5e502625623` (#4360) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f9a1a2db..36c7be21 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=26a8406ba9198eb6fdd8329fa717555b4f77f05f +CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From ec239a0cd0cd5ce321d8e49c28a2bf1a46597331 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:04:16 +0100 Subject: [PATCH 60/89] docs: :arrow_up: update docs version mudler/LocalAI (#4359) :arrow_up: Update docs version mudler/LocalAI Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- docs/data/version.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/data/version.json b/docs/data/version.json index f6462f81..bf065426 100644 --- a/docs/data/version.json +++ b/docs/data/version.json @@ -1,3 +1,3 @@ { - "version": "v2.24.1" + "version": "v2.24.2" } From 1918efdfdd08d96f732a8f7e7d42060b56d8c2e5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 11 Dec 2024 10:32:18 +0100 Subject: [PATCH 61/89] chore(model gallery): add sailor2-1b-chat (#4363) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b8eedc53..b63520c6 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1906,6 +1906,22 @@ - filename: Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf sha256: 14e672394738a7d9f14a6cb16fd9a649b113a19a8b4934f9c18299fc4e286ab6 uri: huggingface://QuantFactory/Qwen2.5-Math-14B-Instruct-GGUF/Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "sailor2-1b-chat" + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + urls: + - https://huggingface.co/sail/Sailor2-1B-Chat + - https://huggingface.co/bartowski/Sailor2-1B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. 
These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. + overrides: + parameters: + model: Sailor2-1B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-1B-Chat-Q4_K_M.gguf + sha256: 782e8abed13d51a2083eadfb2f6d94c2cd77940532f612a99e6f6bec9b3501d4 + uri: huggingface://bartowski/Sailor2-1B-Chat-GGUF/Sailor2-1B-Chat-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 75b283d83c3acdda5156e90281e5cfadabb1b39c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 11 Dec 2024 10:51:39 +0100 Subject: [PATCH 62/89] chore(model gallery): add sailor2-8b-chat (#4364) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b63520c6..40394f6a 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1922,6 +1922,21 @@ - filename: Sailor2-1B-Chat-Q4_K_M.gguf sha256: 782e8abed13d51a2083eadfb2f6d94c2cd77940532f612a99e6f6bec9b3501d4 uri: huggingface://bartowski/Sailor2-1B-Chat-GGUF/Sailor2-1B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + name: "sailor2-8b-chat" + urls: + - https://huggingface.co/bartowski/Sailor2-8B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. 
+ overrides: + parameters: + model: Sailor2-8B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-8B-Chat-Q4_K_M.gguf + sha256: 1a6aaadd6f6ef9c2290d66b348ebcbd6fdec542834cde622498fbd467d966103 + uri: huggingface://bartowski/Sailor2-8B-Chat-GGUF/Sailor2-8B-Chat-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From c85f46a71dbf184c4a391456ab87af104ef5dab9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 11 Dec 2024 10:55:04 +0100 Subject: [PATCH 63/89] chore(model gallery): add sailor2-20b-chat (#4365) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 40394f6a..37664dd8 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1937,6 +1937,21 @@ - filename: Sailor2-8B-Chat-Q4_K_M.gguf sha256: 1a6aaadd6f6ef9c2290d66b348ebcbd6fdec542834cde622498fbd467d966103 uri: huggingface://bartowski/Sailor2-8B-Chat-GGUF/Sailor2-8B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "sailor2-20b-chat" + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + urls: + - https://huggingface.co/bartowski/Sailor2-20B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. 
+ overrides: + parameters: + model: Sailor2-20B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-20B-Chat-Q4_K_M.gguf + sha256: 0cf8fcd367accee19702ef15ee964bddd5035bde034afddd838f818e7655534a + uri: huggingface://bartowski/Sailor2-20B-Chat-GGUF/Sailor2-20B-Chat-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From f2cb261797d587bc63d33defd17a4394fa4a0361 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 12 Dec 2024 09:23:36 +0100 Subject: [PATCH 64/89] chore: :arrow_up: Update ggerganov/llama.cpp to `235f6e14bf0ed0211c51aeff14139038ae1000aa` (#4366) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 36c7be21..1dd5d18e 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623 +CPPLLAMA_VERSION?=235f6e14bf0ed0211c51aeff14139038ae1000aa # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 3ab83e91df2678cab49d254559fec41fc8794706 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Dec 2024 12:07:41 +0100 Subject: [PATCH 65/89] chore(model gallery): add 72b-qwen2.5-kunou-v1 (#4369) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 37664dd8..75c987c0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1952,6 +1952,27 @@ - filename: Sailor2-20B-Chat-Q4_K_M.gguf sha256: 0cf8fcd367accee19702ef15ee964bddd5035bde034afddd838f818e7655534a uri: huggingface://bartowski/Sailor2-20B-Chat-GGUF/Sailor2-20B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "72b-qwen2.5-kunou-v1" + icon: https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1/resolve/main/knn.png + urls: + - https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1 + - https://huggingface.co/bartowski/72B-Qwen2.5-Kunou-v1-GGUF + description: | + I do not really have anything planned for this model other than it being a generalist, and Roleplay Model? It was just something made and planned in minutes. + Same with the 14 and 32B version. + Kunou's the name of an OC I worked on for a couple of years, for a... fanfic. mmm... + + A kind-of successor to L3-70B-Euryale-v2.2 in all but name? I'm keeping Stheno/Euryale lineage to Llama series for now. + I had a version made on top of Nemotron, a supposed Euryale 2.4 but that flopped hard, it was not my cup of tea. + This version is basically a better, more cleaned up Dataset used on Euryale and Stheno. 
+ overrides: + parameters: + model: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + files: + - filename: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + sha256: 91907f29746625a62885793475956220b81d8a5a34b53686a1acd1d03fd403ea + uri: huggingface://bartowski/72B-Qwen2.5-Kunou-v1-GGUF/72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From b8824f2ad928ee518e35a633df9a085d2648d926 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Dec 2024 12:07:57 +0100 Subject: [PATCH 66/89] chore(model gallery): add deepthought-8b-llama-v0.01-alpha (#4370) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 75c987c0..cb104908 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3782,6 +3782,20 @@ - filename: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf sha256: 625a54848dcd3f23bc06b639a7dfecae14142b5d177dd45acfe7724816bab4cd uri: huggingface://QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF/B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "deepthought-8b-llama-v0.01-alpha" + urls: + - https://huggingface.co/ruliad/deepthought-8b-llama-v0.01-alpha + - https://huggingface.co/bartowski/deepthought-8b-llama-v0.01-alpha-GGUF + description: | + Deepthought-8B is a small and capable reasoning model built on LLaMA-3.1 8B, designed to make AI reasoning more transparent and controllable. Despite its relatively small size, it achieves sophisticated reasoning capabilities that rival much larger models. + overrides: + parameters: + model: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + files: + - filename: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + sha256: 33195ba7b898ef8b2997d095e8be42adf1d0e1f6e8291cf07e026fc8e45903fd + uri: huggingface://bartowski/deepthought-8b-llama-v0.01-alpha-GGUF/deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 1854b8c612bac4c2c04a64632158c66d3818945c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 12 Dec 2024 12:22:48 +0100 Subject: [PATCH 67/89] chore(model gallery): add l3.3-70b-euryale-v2.3 (#4371) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index cb104908..11408635 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -47,6 +47,21 @@ - filename: Llama-3.3-70B-Instruct.Q4_K_M.gguf sha256: 4f3b04ecae278bdb0fd545b47c210bc5edf823e5ebf7d41e0b526c81d54b1ff3 uri: huggingface://MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-70b-euryale-v2.3" + icon: https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3/resolve/main/Eury.png + urls: + - https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3 + - https://huggingface.co/bartowski/L3.3-70B-Euryale-v2.3-GGUF + description: | + A direct replacement / successor to Euryale v2.2, not Hanami-x1, though it is slightly better than them in my opinion. 
+ overrides: + parameters: + model: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + files: + - filename: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + sha256: 4e78bb0e65886bfcff89b829f6d38aa6f6846988bb8291857e387e3f60b3217b + uri: huggingface://bartowski/L3.3-70B-Euryale-v2.3-GGUF/L3.3-70B-Euryale-v2.3-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From 37527420de67ffdb23057e6735e351d4d72f06e1 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 12 Dec 2024 22:44:54 +0100 Subject: [PATCH 68/89] chore: :arrow_up: Update ggerganov/llama.cpp to `274ec65af6e54039eb95cb44904af5c945dca1fa` (#4372) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1dd5d18e..2c7d0259 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=235f6e14bf0ed0211c51aeff14139038ae1000aa +CPPLLAMA_VERSION?=274ec65af6e54039eb95cb44904af5c945dca1fa # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 044570fa85f99a21769de1f2ee9a56db0aa5ca53 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Dec 2024 09:50:41 +0100 Subject: [PATCH 69/89] chore(model gallery): add l3.3-ms-evayale-70b (#4374) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 11408635..cccf1138 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -62,6 +62,21 @@ - filename: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf sha256: 4e78bb0e65886bfcff89b829f6d38aa6f6846988bb8291857e387e3f60b3217b uri: huggingface://bartowski/L3.3-70B-Euryale-v2.3-GGUF/L3.3-70B-Euryale-v2.3-Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-ms-evayale-70b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/HFCaVzRpiE05Y46p41qRy.webp + urls: + - https://huggingface.co/Steelskull/L3.3-MS-Evayale-70B + - https://huggingface.co/bartowski/L3.3-MS-Evayale-70B-GGUF + description: | + This model was created as I liked the storytelling of EVA but the prose and details of scenes from EURYALE, my goal is to merge the robust storytelling of both models while attempting to maintain the positives of both models. 
+ overrides: + parameters: + model: L3.3-MS-Evayale-70B-Q4_K_M.gguf + files: + - filename: L3.3-MS-Evayale-70B-Q4_K_M.gguf + sha256: f941d88870fec8343946517a1802d159d23f3971eeea50b6cf12295330bd29cc + uri: huggingface://bartowski/L3.3-MS-Evayale-70B-GGUF/L3.3-MS-Evayale-70B-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From 73f1f25b9a4cdd19b107d892df84a7a24f4937f3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Dec 2024 09:51:13 +0100 Subject: [PATCH 70/89] chore(model gallery): add evathene-v1.3 (#4375) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index cccf1138..f3e428b7 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2003,6 +2003,21 @@ - filename: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf sha256: 91907f29746625a62885793475956220b81d8a5a34b53686a1acd1d03fd403ea uri: huggingface://bartowski/72B-Qwen2.5-Kunou-v1-GGUF/72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://i.imgur.com/OxX2Usi.png + name: "evathene-v1.3" + urls: + - https://huggingface.co/sophosympatheia/Evathene-v1.3 + - https://huggingface.co/bartowski/Evathene-v1.3-GGUF + description: | + This 72B parameter model is a merge of sophosympatheia/Evathene-v1.1 and sophosympatheia/Evathene-v1.2. See the merge recipe below for details. + overrides: + parameters: + model: Evathene-v1.3-Q4_K_M.gguf + files: + - filename: Evathene-v1.3-Q4_K_M.gguf + sha256: 0f54909b3ddca514994ee16417da8750f56e7bd59581b46ac47625c230e29d1f + uri: huggingface://bartowski/Evathene-v1.3-GGUF/Evathene-v1.3-Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 0429e007463b8ee37b7ac642a417b911c3365b71 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 13 Dec 2024 09:51:26 +0100 Subject: [PATCH 71/89] chore(model gallery): add hermes-3-llama-3.2-3b (#4376) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index f3e428b7..2997230e 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3074,6 +3074,22 @@ - filename: hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf sha256: 8cff9d399a0583616fe1f290da6daa091ab5c5493d0e173a8fffb45202d79417 uri: huggingface://mlabonne/Hermes-3-Llama-3.1-8B-lorablated-GGUF/hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "hermes-3-llama-3.2-3b" + icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg + urls: + - https://huggingface.co/NousResearch/Hermes-3-Llama-3.2-3B + - https://huggingface.co/bartowski/Hermes-3-Llama-3.2-3B-GGUF + description: | + Hermes 3 3B is a small but mighty new addition to the Hermes series of LLMs by Nous Research, and is Nous's first fine-tune in this parameter class. + Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. 
+ overrides: + parameters: + model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + files: + - filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 + uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf - !!merge <<: *llama31 name: "doctoraifinetune-3.1-8b-i1" urls: From fc4a714992e44b68c81b1270e8723a72de97f06e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 14 Dec 2024 00:30:52 +0100 Subject: [PATCH 72/89] feat(llama.cpp): bump and adapt to upstream changes (#4378) Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/grpc-server.cpp | 33 +++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 2c7d0259..2645ddd0 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=274ec65af6e54039eb95cb44904af5c945dca1fa +CPPLLAMA_VERSION?=c27ac678dd393af0da9b8acf10266e760c8a0912 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index ea5c4e34..d553d35d 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2228,6 +2228,35 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // } // } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + static void params_parse(const backend::ModelOptions* request, common_params & params) { @@ -2242,10 +2271,10 @@ static void params_parse(const backend::ModelOptions* request, // params.model_alias ?? 
params.model_alias = request->modelfile(); if (!request->cachetypekey().empty()) { - params.cache_type_k = request->cachetypekey(); + params.cache_type_k = kv_cache_type_from_str(request->cachetypekey()); } if (!request->cachetypevalue().empty()) { - params.cache_type_v = request->cachetypevalue(); + params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue()); } params.n_ctx = request->contextsize(); //params.memory_f16 = request->f16memory(); From 5051074845880c36bdaf1a5e3a6b9bb33223b6ae Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 14 Dec 2024 11:26:40 +0100 Subject: [PATCH 73/89] chore(model gallery): add fusechat-gemma-2-9b-instruct (#4379) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 2997230e..90564eae 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5479,6 +5479,21 @@ - filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319 uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "fusechat-gemma-2-9b-instruct" + icon: "https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct/resolve/main/FuseChat-3.0.png" + urls: + - https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct + - https://huggingface.co/bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + sha256: f5aef201be68f344bebff3433af87aac6428fd227adfd7e468c8bfbcf9660ece + uri: huggingface://bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF/FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png From af33483687affc05cfaea69db2b8846efd63f6bc Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 14 Dec 2024 11:27:11 +0100 Subject: [PATCH 74/89] chore(model gallery): add fusechat-qwen-2.5-7b-instruct (#4380) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 90564eae..9b1f5ea6 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2018,6 +2018,21 @@ - filename: Evathene-v1.3-Q4_K_M.gguf sha256: 0f54909b3ddca514994ee16417da8750f56e7bd59581b46ac47625c230e29d1f uri: huggingface://bartowski/Evathene-v1.3-GGUF/Evathene-v1.3-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "fusechat-qwen-2.5-7b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/resolve/main/FuseChat-3.0.png + urls: + -https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct + - https://huggingface.co/bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+  overrides:
+    parameters:
+      model: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf
+  files:
+    - filename: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf
+      sha256: 8cd8c317769f03125ac753c836ac92c5a76ee0b35502811d0e65bcbb8df9d55c
+      uri: huggingface://bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF/FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf
 - &archfunct
   license: apache-2.0
   tags:

From 432c31d90419230e2b80711a4a34a054a978fcca Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sat, 14 Dec 2024 11:27:25 +0100
Subject: [PATCH 75/89] chore(model gallery): add chronos-gold-12b-1.0 (#4381)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 9b1f5ea6..dcf8df58 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -4706,6 +4706,32 @@
   - filename: MN-Chunky-Lotus-12B.Q4_K_M.gguf
     sha256: 363defe0a769fdb715dab75517966a0a80bcdd981a610d4c759099b6c8ff143a
     uri: huggingface://QuantFactory/MN-Chunky-Lotus-12B-GGUF/MN-Chunky-Lotus-12B.Q4_K_M.gguf
+- !!merge <<: *mistral03
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  name: "chronos-gold-12b-1.0"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/630417380907b9a115c6aa9f/3hc8zt8fzKdO3qHK1p1mW.webp
+  urls:
+    - https://huggingface.co/elinas/Chronos-Gold-12B-1.0
+    - https://huggingface.co/mradermacher/Chronos-Gold-12B-1.0-GGUF
+  description: |
+    Chronos Gold 12B 1.0 is a unique model that applies to domain areas such as general chatbot functionality, roleplay, and storywriting. The model has been observed to write up to 2250 tokens in a single sequence. The model was trained at a sequence length of 16384 (16k) and will still retain the apparent 128k context length from Mistral-Nemo, though it deteriorates over time like regular Nemo does, based on the RULER Test.
+
+    As a result, it is recommended to keep your sequence length at a maximum of 16384, or you will experience performance degradation.
+
+    The base model is mistralai/Mistral-Nemo-Base-2407, which was heavily modified to produce a more coherent model, comparable to much larger models.
+
+    Chronos Gold 12B-1.0 re-creates the uniqueness of the original Chronos with significantly enhanced prompt adherence (following), coherence, and a modern dataset, as well as supporting a majority of "character card" formats in applications like SillyTavern.
+
+    It went through an iterative and objective merge process, as with my previous models, and was further finetuned on a dataset curated for it.
+
+    The specifics of the model will not be disclosed at this time due to dataset ownership.
+ overrides: + parameters: + model: Chronos-Gold-12B-1.0.Q4_K_M.gguf + files: + - filename: Chronos-Gold-12B-1.0.Q4_K_M.gguf + sha256: d75a6ed28781f0ea6fa6e58c0b25dfecdd160d4cab64aaf511ea156e99a1e1f3 + uri: huggingface://mradermacher/Chronos-Gold-12B-1.0-GGUF/Chronos-Gold-12B-1.0.Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From 59cbf38b4b52a807097d9b88f9e570706f487f07 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 14 Dec 2024 21:21:27 +0100 Subject: [PATCH 76/89] fix(gallery): correct syntax typo Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery/index.yaml b/gallery/index.yaml index dcf8df58..89569cc4 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2022,7 +2022,7 @@ name: "fusechat-qwen-2.5-7b-instruct" icon: https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/resolve/main/FuseChat-3.0.png urls: - -https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct + - https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct - https://huggingface.co/bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF description: | We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
From cca911f3e50ff36b0625cd43b6865198a3594a3d Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 15 Dec 2024 09:59:20 +0100 Subject: [PATCH 78/89] chore: :arrow_up: Update ggerganov/llama.cpp to `e52aba537a34d51a65cddec6bc6dafc9031edc63` (#4385) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2645ddd0..255d6071 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=c27ac678dd393af0da9b8acf10266e760c8a0912 +CPPLLAMA_VERSION?=e52aba537a34d51a65cddec6bc6dafc9031edc63 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 8f2be8266700788acc92c9e8dccb7acc45daebfc Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 15 Dec 2024 10:07:30 +0100 Subject: [PATCH 79/89] chore(model gallery): add fusechat-llama-3.2-3b-instruct (#4386) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 89569cc4..2df138ff 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -807,6 +807,20 @@ - filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "fusechat-llama-3.2-3b-instruct" + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.2-3B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + sha256: a4f0e9a905b74886b79b72622c06a3219d6812818a564a53c39fc49032d7f842 + uri: huggingface://bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF/FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" From 1d6d30137009cb4fb9c6d3971900c1919baea690 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 15 Dec 2024 10:07:42 +0100 Subject: [PATCH 80/89] chore(model gallery): add fusechat-llama-3.1-8b-instruct (#4387) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 2df138ff..b729dfe2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3886,6 +3886,21 @@ - filename: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf sha256: 33195ba7b898ef8b2997d095e8be42adf1d0e1f6e8291cf07e026fc8e45903fd uri: huggingface://bartowski/deepthought-8b-llama-v0.01-alpha-GGUF/deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "fusechat-llama-3.1-8b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + sha256: fe58c8c9b695e36e6b0ee5e4d81ff71ea0a4f1a11fa7bb16e8d6f1b35a58dff6 + uri: huggingface://bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF/FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" From 9429a53db7162e798795b1d73dfdf4e055a8c899 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 15 Dec 2024 10:07:56 +0100 Subject: [PATCH 81/89] chore(model gallery): add neumind-math-7b-instruct (#4388) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index b729dfe2..71a2d56f 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2047,6 +2047,20 @@ - filename: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf sha256: 8cd8c317769f03125ac753c836ac92c5a76ee0b35502811d0e65bcbb8df9d55c uri: huggingface://bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF/FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "neumind-math-7b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Neumind-Math-7B-Instruct + - https://huggingface.co/QuantFactory/Neumind-Math-7B-Instruct-GGUF + description: | + The Neumind-Math-7B-Instruct is a fine-tuned model based on Qwen2.5-7B-Instruct, optimized for mathematical reasoning, step-by-step problem-solving, and instruction-based tasks in the mathematics domain. The model is designed for applications requiring structured reasoning, numerical computations, and mathematical proof generation. + overrides: + parameters: + model: Neumind-Math-7B-Instruct.Q4_K_M.gguf + files: + - filename: Neumind-Math-7B-Instruct.Q4_K_M.gguf + sha256: 3250abadeae4234e06dfaf7cf86fe871fe021e6c2dfcb4542c2a4f412d71e28c + uri: huggingface://QuantFactory/Neumind-Math-7B-Instruct-GGUF/Neumind-Math-7B-Instruct.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: From 5d9c530eaa38a5c470f9a766e29e617b401432ce Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 15 Dec 2024 18:43:39 +0100 Subject: [PATCH 82/89] fix(gallery): disable default embeddings Do not always enable embeddings on llama32, but let specific models settings Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gallery/index.yaml b/gallery/index.yaml index 71a2d56f..35febd56 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -458,7 +458,6 @@ urls: - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF overrides: - embeddings: true parameters: model: llama-3.2-1b-instruct-q4_k_m.gguf files: @@ -9708,6 +9707,10 @@ llama3.2 embeddings model. 
Using as drop-in replacement for bert-embeddings tags: - embeddings + overrides: + embeddings: true + parameters: + model: llama-3.2-1b-instruct-q4_k_m.gguf ## Stable Diffusion - url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master license: "BSD-3" From 6938618e30fbbc3858ad951cc9afc5a04f3d5415 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 15 Dec 2024 23:01:44 +0100 Subject: [PATCH 83/89] chore: :arrow_up: Update ggerganov/llama.cpp to `a0974156f334acf8af5858d7ede5ab7d7490d415` (#4391) :arrow_up: Update ggerganov/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 255d6071..9310e264 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=e52aba537a34d51a65cddec6bc6dafc9031edc63 +CPPLLAMA_VERSION?=a0974156f334acf8af5858d7ede5ab7d7490d415 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From b40d5d12b720f0ee8b4f6aa391b9dfff4d46e3b1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 16 Dec 2024 09:47:49 +0100 Subject: [PATCH 84/89] chore(model gallery): add naturallm-7b-instruct (#4392) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 35febd56..fbd419b3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4774,6 +4774,22 @@ - filename: Chronos-Gold-12B-1.0.Q4_K_M.gguf sha256: d75a6ed28781f0ea6fa6e58c0b25dfecdd160d4cab64aaf511ea156e99a1e1f3 uri: huggingface://mradermacher/Chronos-Gold-12B-1.0-GGUF/Chronos-Gold-12B-1.0.Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "naturallm-7b-instruct" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/qingy2024/NaturalLM-7B-Instruct + - https://huggingface.co/bartowski/NaturalLM-7B-Instruct-GGUF + description: | + This Mistral 7B fine-tune is trained (for 150 steps) to talk like a human, not a "helpful assistant"! + It's also very beta right now. The dataset (qingy2024/Natural-Text-ShareGPT) can definitely be improved. 
+ overrides: + parameters: + model: NaturalLM-7B-Instruct-Q4_K_M.gguf + files: + - filename: NaturalLM-7B-Instruct-Q4_K_M.gguf + sha256: 15b2f34116f690fea35790a9392b8a2190fe25827e370d426e88a2a543f4dcee + uri: huggingface://bartowski/NaturalLM-7B-Instruct-GGUF/NaturalLM-7B-Instruct-Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" From 472d11f8844e8f04d2a8cb6f8b1ed580f0ca3bc1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 16 Dec 2024 09:48:23 +0100 Subject: [PATCH 85/89] chore(model gallery): add marco-o1-uncensored (#4393) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index fbd419b3..9d5696c2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -4299,6 +4299,20 @@ - filename: Marco-o1.Q4_K_M.gguf sha256: 54dd9554cb54609bf0bf4b367dfba192fc982a2fc6b87a0f56fba5ea82762d0d uri: huggingface://QuantFactory/Marco-o1-GGUF/Marco-o1.Q4_K_M.gguf +- !!merge <<: *qwen2 + name: "marco-o1-uncensored" + urls: + - https://huggingface.co/thirdeyeai/marco-o1-uncensored + - https://huggingface.co/QuantFactory/marco-o1-uncensored-GGUF + description: | + Uncensored version of marco-o1 + overrides: + parameters: + model: marco-o1-uncensored.Q4_K_M.gguf + files: + - filename: marco-o1-uncensored.Q4_K_M.gguf + sha256: ad0440270a7254098f90779744d3e5b34fe49b7baf97c819909ba9c5648cc0d9 + uri: huggingface://QuantFactory/marco-o1-uncensored-GGUF/marco-o1-uncensored.Q4_K_M.gguf - &mistral03 ## START Mistral url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master" From 037e8030bf8d2ee23e3f66072cdd93a884b8965b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 16 Dec 2024 09:48:33 +0100 Subject: [PATCH 86/89] chore(model gallery): add qwen2-7b-multilingual-rp (#4394) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 9d5696c2..99c0e9a3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2113,6 +2113,20 @@ - filename: Arch-Function-3B.Q4_K_M.gguf sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2-7b-multilingual-rp" + urls: + - https://huggingface.co/maywell/Qwen2-7B-Multilingual-RP + - https://huggingface.co/QuantFactory/Qwen2-7B-Multilingual-RP-GGUF + description: | + Multilingual Qwen2-7B model trained on Roleplaying. 
+ overrides: + parameters: + model: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + files: + - filename: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + sha256: 31756c58fd135f2deb59b2d9b142f39134dc8d1a6eaa02f388dda7491fc95ccc + uri: huggingface://QuantFactory/Qwen2-7B-Multilingual-RP-GGUF/Qwen2-7B-Multilingual-RP.Q4_K_M.gguf - &smollm ## SmolLM url: "github:mudler/LocalAI/gallery/chatml.yaml@master" From 7ca0e2d925b48eaedc945ef53f19de38fb43f049 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 16 Dec 2024 10:55:02 +0100 Subject: [PATCH 87/89] fix(python): remove pin to setuptools, pin python version (#4395) fix(setuptools): remove pin Signed-off-by: Ettore Di Giacinto --- backend/python/autogptq/requirements-intel.txt | 2 +- backend/python/bark/requirements-intel.txt | 2 +- backend/python/common/libbackend.sh | 5 ++++- backend/python/coqui/requirements-intel.txt | 2 +- backend/python/diffusers/requirements-intel.txt | 2 +- backend/python/openvoice/requirements.txt | 1 + backend/python/parler-tts/requirements-intel.txt | 1 - backend/python/parler-tts/requirements.txt | 1 + backend/python/rerankers/requirements-intel.txt | 2 +- backend/python/sentencetransformers/requirements-intel.txt | 2 +- backend/python/transformers-musicgen/requirements-intel.txt | 2 +- backend/python/transformers/requirements.txt | 2 +- backend/python/vall-e-x/requirements-intel.txt | 3 +-- backend/python/vall-e-x/requirements.txt | 3 ++- backend/python/vllm/requirements-intel.txt | 2 +- 15 files changed, 18 insertions(+), 14 deletions(-) diff --git a/backend/python/autogptq/requirements-intel.txt b/backend/python/autogptq/requirements-intel.txt index d5e0173e..cec8bff4 100644 --- a/backend/python/autogptq/requirements-intel.txt +++ b/backend/python/autogptq/requirements-intel.txt @@ -2,4 +2,4 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/bark/requirements-intel.txt b/backend/python/bark/requirements-intel.txt index c0e4dcaa..1f043bbf 100644 --- a/backend/python/bark/requirements-intel.txt +++ b/backend/python/bark/requirements-intel.txt @@ -3,6 +3,6 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate \ No newline at end of file diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 934b1fd3..6013cf76 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,6 +17,9 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # + +PYTHON_VERSION="3.10" + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -88,7 +91,7 @@ function getBuildProfile() { # always result in an activated virtual environment function ensureVenv() { if [ ! 
-d "${EDIR}/venv" ]; then - uv venv ${EDIR}/venv + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv echo "virtualenv created" fi diff --git a/backend/python/coqui/requirements-intel.txt b/backend/python/coqui/requirements-intel.txt index de3b4ee4..7ed2fb42 100644 --- a/backend/python/coqui/requirements-intel.txt +++ b/backend/python/coqui/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate coqui-tts \ No newline at end of file diff --git a/backend/python/diffusers/requirements-intel.txt b/backend/python/diffusers/requirements-intel.txt index 566278a8..bd6632bf 100644 --- a/backend/python/diffusers/requirements-intel.txt +++ b/backend/python/diffusers/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchvision optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools diffusers opencv-python transformers diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 6806d3e1..e6a1e5a5 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -18,3 +18,4 @@ jieba==0.42.1 gradio==3.48.0 langid==1.1.6 llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/parler-tts/requirements-intel.txt b/backend/python/parler-tts/requirements-intel.txt index c0e4dcaa..bcb8900e 100644 --- a/backend/python/parler-tts/requirements-intel.txt +++ b/backend/python/parler-tts/requirements-intel.txt @@ -3,6 +3,5 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 transformers accelerate \ No newline at end of file diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index 75ea8a59..faf4ea3d 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 certifi llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/rerankers/requirements-intel.txt b/backend/python/rerankers/requirements-intel.txt index e6bb4cc7..a3cc600c 100644 --- a/backend/python/rerankers/requirements-intel.txt +++ b/backend/python/rerankers/requirements-intel.txt @@ -5,4 +5,4 @@ accelerate torch rerankers[transformers] optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements-intel.txt b/backend/python/sentencetransformers/requirements-intel.txt index 56e17446..23e0d5f2 100644 --- a/backend/python/sentencetransformers/requirements-intel.txt +++ b/backend/python/sentencetransformers/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 +setuptools accelerate sentence-transformers==3.3.1 transformers \ No newline at end of file diff --git a/backend/python/transformers-musicgen/requirements-intel.txt b/backend/python/transformers-musicgen/requirements-intel.txt index 608d6939..bb191163 100644 --- a/backend/python/transformers-musicgen/requirements-intel.txt +++ b/backend/python/transformers-musicgen/requirements-intel.txt @@ -4,4 +4,4 @@ transformers accelerate torch optimum[openvino] -setuptools==75.1.0 # 
https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index b556b9f1..d981fd99 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ grpcio==1.68.1 protobuf certifi -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements-intel.txt b/backend/python/vall-e-x/requirements-intel.txt index adbabeac..284e7131 100644 --- a/backend/python/vall-e-x/requirements-intel.txt +++ b/backend/python/vall-e-x/requirements-intel.txt @@ -3,5 +3,4 @@ intel-extension-for-pytorch accelerate torch torchaudio -optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +optimum[openvino] \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 8e4eabf1..d981fd99 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 protobuf -certifi \ No newline at end of file +certifi +setuptools \ No newline at end of file diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt index 95443368..36326f95 100644 --- a/backend/python/vllm/requirements-intel.txt +++ b/backend/python/vllm/requirements-intel.txt @@ -4,5 +4,5 @@ accelerate torch transformers optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools bitsandbytes \ No newline at end of file From 24abf568cbedfd438f48c9b7c5af6479473a6d1f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 17 Dec 2024 00:46:48 +0100 Subject: [PATCH 88/89] chore(tests): stabilize tts test (#4417) chore(tests): stabilize test Signed-off-by: Ettore Di Giacinto --- core/http/app_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/http/app_test.go b/core/http/app_test.go index 34ebacf7..7c57ba21 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -704,7 +704,7 @@ var _ = Describe("API test", func() { Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp)) Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat))) - Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav")) + Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave"))) }) It("installs and is capable to generate images", Label("stablediffusion"), func() { if runtime.GOOS != "linux" { From 708cba0c1bf5b8068a0eb4a18994b2c187136a2f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 17 Dec 2024 00:47:52 +0100 Subject: [PATCH 89/89] chore(llama.cpp): bump, drop penalize_nl (#4418) deps(llama.cpp): bump, drop penalize_nl Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/grpc-server.cpp | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 9310e264..4226c5d7 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=a0974156f334acf8af5858d7ede5ab7d7490d415 +CPPLLAMA_VERSION?=08ea539df211e46bb4d0dd275e541cb591d5ebc8 # whisper.cpp version 
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index d553d35d..98dd8fde 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -681,7 +681,6 @@ struct llama_server_context slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -1213,13 +1212,12 @@ struct llama_server_context {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - // {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, @@ -2112,7 +2110,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); // slot->params.seed = json_value(data, "seed", default_params.seed); // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -2135,7 +2132,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama data["mirostat"] = predict->mirostat(); data["mirostat_tau"] = predict->mirostattau(); data["mirostat_eta"] = predict->mirostateta(); - data["penalize_nl"] = predict->penalizenl(); data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); data["grammar"] = predict->grammar(); @@ -2181,7 +2177,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // llama.params.sparams.mirostat = predict->mirostat(); // llama.params.sparams.mirostat_tau = predict->mirostattau(); // llama.params.sparams.mirostat_eta = predict->mirostateta(); -// llama.params.sparams.penalize_nl = predict->penalizenl(); // llama.params.n_keep = predict->nkeep(); // llama.params.seed = predict->seed(); // llama.params.sparams.grammar = predict->grammar();
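
Illustrative sketch (not part of the patches above): the libbackend.sh change in PATCH 87/89 pins the interpreter used for backend virtualenvs via uv venv --python ${PYTHON_VERSION}. A minimal shell example of the resulting behaviour, assuming uv is installed and using a hypothetical backend directory name chosen only for illustration:

    # Create a backend venv with the pinned interpreter (the same invocation ensureVenv() now uses),
    # then activate it and confirm which interpreter version was selected.
    PYTHON_VERSION="3.10"        # value introduced in libbackend.sh by this series
    EDIR="./example-backend"     # hypothetical backend directory, for illustration only

    uv venv --python "${PYTHON_VERSION}" "${EDIR}/venv"
    source "${EDIR}/venv/bin/activate"
    python --version             # expected to report Python 3.10.x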