diff --git a/.github/labeler.yml b/.github/labeler.yml index 687a90d1..23e64566 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,6 +1,11 @@ enhancements: - head-branch: ['^feature', 'feature'] +dependencies: +- any: + - changed-files: + - any-glob-to-any-file: 'Makefile' + kind/documentation: - any: - changed-files: diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index c94a134d..092110df 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -12,23 +12,14 @@ jobs: - repository: "ggerganov/llama.cpp" variable: "CPPLLAMA_VERSION" branch: "master" - - repository: "go-skynet/go-ggml-transformers.cpp" - variable: "GOGGMLTRANSFORMERS_VERSION" - branch: "master" - - repository: "donomii/go-rwkv.cpp" - variable: "RWKV_VERSION" - branch: "main" - repository: "ggerganov/whisper.cpp" variable: "WHISPER_CPP_VERSION" branch: "master" - - repository: "go-skynet/go-bert.cpp" - variable: "BERT_VERSION" - branch: "master" - - repository: "go-skynet/bloomz.cpp" - variable: "BLOOMZ_VERSION" + - repository: "PABannier/bark.cpp" + variable: "BARKCPP_VERSION" branch: "main" - - repository: "mudler/go-ggllm.cpp" - variable: "GOGGLLM_VERSION" + - repository: "leejet/stable-diffusion.cpp" + variable: "STABLEDIFFUSION_GGML_VERSION" branch: "master" - repository: "mudler/go-stable-diffusion" variable: "STABLEDIFFUSION_VERSION" diff --git a/.gitignore b/.gitignore index 9f31131f..d821c435 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /sources/ __pycache__/ *.a +*.o get-sources prepare-sources /backend/cpp/llama/grpc-server diff --git a/Makefile b/Makefile index d94b6bad..4226c5d7 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=3ad5451f3b75809e3033e4e577b9f60bcaf6676a +CPPLLAMA_VERSION?=08ea539df211e46bb4d0dd275e541cb591d5ebc8 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp @@ -26,6 +26,14 @@ STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057 +# bark.cpp +BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git +BARKCPP_VERSION?=v1.0.0 + +# stablediffusion.cpp (ggml) +STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp +STABLEDIFFUSION_GGML_VERSION?=9578fdcc4632dc3de5565f28e2fb16b7c18f8d48 + ONNX_VERSION?=1.20.0 ONNX_ARCH?=x64 ONNX_OS?=linux @@ -201,6 +209,14 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper + +ifeq ($(ONNX_OS),linux) +ifeq ($(ONNX_ARCH),x64) + ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp + ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml +endif +endif + ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC) @@ -236,6 +252,23 @@ sources/go-llama.cpp: sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a +## bark.cpp +sources/bark.cpp: + git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \ + cd sources/bark.cpp && \ + git checkout $(BARKCPP_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch 
+ +sources/bark.cpp/build/libbark.a: sources/bark.cpp + cd sources/bark.cpp && \ + mkdir -p build && \ + cd build && \ + cmake $(CMAKE_ARGS) .. && \ + cmake --build . --config Release + +backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a + $(MAKE) -C backend/go/bark libbark.a + ## go-piper sources/go-piper: mkdir -p sources/go-piper @@ -249,7 +282,7 @@ sources/go-piper: sources/go-piper/libpiper_binding.a: sources/go-piper $(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o -## stable diffusion +## stable diffusion (onnx) sources/go-stable-diffusion: mkdir -p sources/go-stable-diffusion cd sources/go-stable-diffusion && \ @@ -262,6 +295,30 @@ sources/go-stable-diffusion: sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a +## stablediffusion (ggml) +sources/stablediffusion-ggml.cpp: + git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \ + cd sources/stablediffusion-ggml.cpp && \ + git checkout $(STABLEDIFFUSION_GGML_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp + cd sources/stablediffusion-ggml.cpp && \ + mkdir -p build && \ + cd build && \ + cmake $(CMAKE_ARGS) .. && \ + cmake --build . --config Release + +backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a + $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a + +backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc + CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \ + $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/ +ifneq ($(UPX),) + $(UPX) backend-assets/grpc/stablediffusion-ggml +endif + sources/onnxruntime: mkdir -p sources/onnxruntime curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz @@ -302,7 +359,7 @@ sources/whisper.cpp: sources/whisper.cpp/libwhisper.a: sources/whisper.cpp cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a -get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp +get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp replace: $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp @@ -343,7 +400,9 @@ clean: ## Remove build related file rm -rf release/ rm -rf backend-assets/* $(MAKE) -C backend/cpp/grpc clean + $(MAKE) -C backend/go/bark clean $(MAKE) -C backend/cpp/llama clean + $(MAKE) -C backend/go/image/stablediffusion-ggml clean rm -rf backend/cpp/llama-* || true $(MAKE) dropreplace $(MAKE) protogen-clean @@ -792,6 +851,13 @@ ifneq ($(UPX),) $(UPX) backend-assets/grpc/llama-ggml endif +backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc + CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ 
LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \ + $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/ +ifneq ($(UPX),) + $(UPX) backend-assets/grpc/bark-cpp +endif + backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/ diff --git a/README.md b/README.md index 2fd89863..ef950bf1 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,8 @@ local-ai run oci://localai/phi-2:latest ## 📰 Latest project news +- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 ) +- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 ) - Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204 - Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples) - Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io) diff --git a/backend/backend.proto b/backend/backend.proto index d6e8f236..0a341ca2 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -240,6 +240,11 @@ message ModelOptions { repeated string LoraAdapters = 60; repeated float LoraScales = 61; + + repeated string Options = 62; + + string CacheTypeKey = 63; + string CacheTypeValue = 64; } message Result { diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 0fde74cb..98dd8fde 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -681,7 +681,6 @@ struct llama_server_context slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -1213,13 +1212,12 @@ struct llama_server_context {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - // {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, @@ -2112,7 +2110,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); // 
slot->params.seed = json_value(data, "seed", default_params.seed);
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
@@ -2135,7 +2132,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["mirostat"] = predict->mirostat();
    data["mirostat_tau"] = predict->mirostattau();
    data["mirostat_eta"] = predict->mirostateta();
-    data["penalize_nl"] = predict->penalizenl();
    data["n_keep"] = predict->nkeep();
    data["seed"] = predict->seed();
    data["grammar"] = predict->grammar();
@@ -2181,7 +2177,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// llama.params.sparams.mirostat = predict->mirostat();
// llama.params.sparams.mirostat_tau = predict->mirostattau();
// llama.params.sparams.mirostat_eta = predict->mirostateta();
-// llama.params.sparams.penalize_nl = predict->penalizenl();
// llama.params.n_keep = predict->nkeep();
// llama.params.seed = predict->seed();
// llama.params.sparams.grammar = predict->grammar();
@@ -2228,6 +2223,35 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// }
// }
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
static void params_parse(const backend::ModelOptions* request,
                         common_params & params) {
@@ -2241,6 +2265,12 @@ static void params_parse(const backend::ModelOptions* request,
    }
    // params.model_alias ??
    params.model_alias = request->modelfile();
+    if (!request->cachetypekey().empty()) {
+        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
+    }
+    if (!request->cachetypevalue().empty()) {
+        params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
+    }
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
    params.cpuparams.n_threads = request->threads();
diff --git a/backend/go/bark/Makefile b/backend/go/bark/Makefile
new file mode 100644
index 00000000..e8902615
--- /dev/null
+++ b/backend/go/bark/Makefile
@@ -0,0 +1,25 @@
+INCLUDE_PATH := $(abspath ./)
+LIBRARY_PATH := $(abspath ./)
+
+AR?=ar
+
+BUILD_TYPE?=
+# keep standard at C11 and C++11
+CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
+LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
+
+# warnings
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
+gobark.o:
+	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
+
+libbark.a: gobark.o
+	cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
+	$(AR) rcs libbark.a gobark.o
+	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
+	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
+	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
+
+clean:
+	rm -f gobark.o libbark.a
\ No newline at end of file
diff --git a/backend/go/bark/gobark.cpp b/backend/go/bark/gobark.cpp
new file mode 100644
index 00000000..b5f414b8
--- /dev/null
+++ b/backend/go/bark/gobark.cpp
@@ -0,0 +1,85 @@
+#include <cstdio>
+#include <vector>
+
+#include "bark.h"
+#include "gobark.h"
+#include "common.h"
+#include "ggml.h"
+
+struct bark_context *c;
+
+void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
+    if (step == bark_encoding_step::SEMANTIC) {
+        printf("\rGenerating semantic tokens... %d%%", progress);
+    } else if (step == bark_encoding_step::COARSE) {
+        printf("\rGenerating coarse tokens... %d%%", progress);
+    } else if (step == bark_encoding_step::FINE) {
+        printf("\rGenerating fine tokens... %d%%", progress);
+    }
+    fflush(stdout);
+}
+
+int load_model(char *model) {
+    // initialize bark context
+    struct bark_context_params ctx_params = bark_context_default_params();
+    bark_params params;
+
+    params.model_path = model;
+
+    // ctx_params.verbosity = verbosity;
+    ctx_params.progress_callback = bark_print_progress_callback;
+    ctx_params.progress_callback_user_data = nullptr;
+
+    struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
+    if (!bctx) {
+        fprintf(stderr, "%s: Could not load model\n", __func__);
+        return 1;
+    }
+
+    c = bctx;
+
+    return 0;
+}
+
+int tts(char *text, int threads, char *dst) {
+
+    ggml_time_init();
+    const int64_t t_main_start_us = ggml_time_us();
+
+    // generate audio
+    if (!bark_generate_audio(c, text, threads)) {
+        fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
+        return 1;
+    }
+
+    const float *audio_data = bark_get_audio_data(c);
+    if (audio_data == NULL) {
+        fprintf(stderr, "%s: Could not get audio data\n", __func__);
+        return 1;
+    }
+
+    const int audio_arr_size = bark_get_audio_data_size(c);
+
+    std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
+
+    write_wav_on_disk(audio_arr, dst);
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_load_us = bark_get_load_time(c);
+        const int64_t t_eval_us = bark_get_eval_time(c);
+
+        printf("\n\n");
+        printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
+        printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
+    }
+
+    return 0;
+}
+
+int unload() {
+    bark_free(c);
+    return 0;
+}
+
diff --git a/backend/go/bark/gobark.go b/backend/go/bark/gobark.go
new file mode 100644
index 00000000..133a4a39
--- /dev/null
+++ b/backend/go/bark/gobark.go
@@ -0,0 +1,52 @@
+package main
+
+// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
+// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
+// #include <gobark.h>
+// #include <stdlib.h>
+import "C"
+
+import (
+	"fmt"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+type Bark struct {
+	base.SingleThread
+	threads int
+}
+
+func (sd *Bark) Load(opts *pb.ModelOptions) error {
+
+	sd.threads = int(opts.Threads)
+
+	modelFile := C.CString(opts.ModelFile)
+	defer C.free(unsafe.Pointer(modelFile))
+
+	ret := C.load_model(modelFile)
+	if ret != 0 {
+		return fmt.Errorf("could not load model")
+	}
+
+	return nil
+}
+
+func (sd *Bark) TTS(opts *pb.TTSRequest) error {
+	t := C.CString(opts.Text)
+	defer C.free(unsafe.Pointer(t))
+
+	dst := C.CString(opts.Dst)
+	defer C.free(unsafe.Pointer(dst))
+
+	threads := C.int(sd.threads)
+
+	ret := C.tts(t, threads, dst)
+	if ret != 0 {
+		return fmt.Errorf("inference failed")
+	}
+
+	return nil
+}
diff --git a/backend/go/bark/gobark.h b/backend/go/bark/gobark.h
new file mode 100644
index 00000000..06fb965d
--- /dev/null
+++ b/backend/go/bark/gobark.h
@@ -0,0 +1,8 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+int load_model(char *model);
+int tts(char *text, int threads, char *dst);
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/backend/go/bark/main.go b/backend/go/bark/main.go
new file mode 100644
index 00000000..840a687d
--- /dev/null
+++ b/backend/go/bark/main.go
@@ -0,0 +1,20 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Bark{}); err != nil {
+		panic(err)
+	}
+}
diff --git a/backend/go/image/stablediffusion-ggml/Makefile b/backend/go/image/stablediffusion-ggml/Makefile
new file mode 100644
index 00000000..cca9bf6e
--- /dev/null
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -0,0 +1,21 @@
+INCLUDE_PATH := $(abspath ./)
+LIBRARY_PATH := $(abspath ./)
+
+AR?=ar
+
+BUILD_TYPE?=
+# keep standard at C11 and C++11
+CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
+
+# warnings
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
+gosd.o:
+	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
+
+libsd.a: gosd.o
+	cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
+	$(AR) rcs libsd.a gosd.o
+
+clean:
+	rm -f gosd.o libsd.a
\ No newline at end of file
diff --git a/backend/go/image/stablediffusion-ggml/gosd.cpp b/backend/go/image/stablediffusion-ggml/gosd.cpp
new file mode 100644
index 00000000..8653aa1e
--- /dev/null
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -0,0 +1,228 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "gosd.h"
+
+// #include "preprocessing.hpp"
+#include "flux.hpp"
+#include "stable-diffusion.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_STATIC
+#include "stb_image_write.h"
+
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#define STB_IMAGE_RESIZE_STATIC
+#include "stb_image_resize.h"
+
+// Names of the sampler methods, same order as enum sample_method in stable-diffusion.h
+const char* sample_method_str[] = {
+    "euler_a",
+    "euler",
+    "heun",
+    "dpm2",
+    "dpm++2s_a",
+    "dpm++2m",
+    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
+    "lcm",
+};
+
+// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
+const char* schedule_str[] = {
+    "default",
+    "discrete",
+    "karras",
+    "exponential",
+    "ays",
+    "gits",
+};
+
+sd_ctx_t* sd_c;
+
+sample_method_t sample_method;
+
+int load_model(char *model, char* options[], int threads, int diff) {
+    fprintf (stderr, "Loading model!\n");
+
+    char *stableDiffusionModel = "";
+    if (diff == 1) {
+        stableDiffusionModel = model;
+        model = "";
+    }
+
+    // decode options. Options are in the form optname:optval, or just optname for booleans.
+    char *clip_l_path = "";
+    char *clip_g_path = "";
+    char *t5xxl_path = "";
+    char *vae_path = "";
+    char *scheduler = "";
+    char *sampler = "";
+
+    // If options is not NULL, parse options
+    for (int i = 0; options[i] != NULL; i++) {
+        char *optname = strtok(options[i], ":");
+        char *optval = strtok(NULL, ":");
+        if (optval == NULL) {
+            optval = "true";
+        }
+
+        if (!strcmp(optname, "clip_l_path")) {
+            clip_l_path = optval;
+        }
+        if (!strcmp(optname, "clip_g_path")) {
+            clip_g_path = optval;
+        }
+        if (!strcmp(optname, "t5xxl_path")) {
+            t5xxl_path = optval;
+        }
+        if (!strcmp(optname, "vae_path")) {
+            vae_path = optval;
+        }
+        if (!strcmp(optname, "scheduler")) {
+            scheduler = optval;
+        }
+        if (!strcmp(optname, "sampler")) {
+            sampler = optval;
+        }
+    }
+
+    int sample_method_found = -1;
+    for (int m = 0; m < N_SAMPLE_METHODS; m++) {
+        if (!strcmp(sampler, sample_method_str[m])) {
+            sample_method_found = m;
+        }
+    }
+    if (sample_method_found == -1) {
+        fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
+        sample_method_found = EULER_A;
+    }
+    sample_method = (sample_method_t)sample_method_found;
+
+    int schedule_found = -1;
+    for (int d = 0; d < N_SCHEDULES; d++) {
+        if (!strcmp(scheduler, schedule_str[d])) {
+            schedule_found = d;
+            fprintf (stderr, "Found scheduler: %s\n", scheduler);
+
+        }
+    }
+
+    if (schedule_found == -1) {
+        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
+        schedule_found = DEFAULT;
+    }
+
+    schedule_t schedule = (schedule_t)schedule_found;
+
+    fprintf (stderr, "Creating context\n");
+    sd_ctx_t* sd_ctx = new_sd_ctx(model,
+                                  clip_l_path,
+                                  clip_g_path,
+                                  t5xxl_path,
+                                  stableDiffusionModel,
+                                  vae_path,
+                                  "",
+                                  "",
+                                  "",
+                                  "",
+                                  "",
+                                  false,
+                                  false,
+                                  false,
+                                  threads,
+                                  SD_TYPE_COUNT,
+                                  STD_DEFAULT_RNG,
+                                  schedule,
+                                  false,
+                                  false,
+                                  false,
+                                  false);
+
+    if (sd_ctx == NULL) {
+        fprintf (stderr, "failed loading model (generic error)\n");
+        return 1;
+    }
+    fprintf (stderr, "Created context: OK\n");
+
+    sd_c = sd_ctx;
+
+    return 0;
+}
+
+int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale) {
+
+    sd_image_t* results;
+
+    std::vector<int> skip_layers = {7, 8, 9};
+
+    fprintf (stderr, "Generating image\n");
+
+    results = txt2img(sd_c,
+                      text,
+                      negativeText,
+                      -1, //clip_skip
+                      cfg_scale, // cfg_scale
+                      3.5f,
+                      width,
+                      height,
+                      sample_method,
+                      steps,
+                      seed,
+                      1,
+                      NULL,
+                      0.9f,
+                      20.f,
+                      false,
+                      "",
+                      skip_layers.data(),
+                      skip_layers.size(),
+                      0,
+                      0.01,
+                      0.2);
+
+    if (results == NULL) {
+        fprintf (stderr, "NO results\n");
+        return 1;
+    }
+
+    if (results[0].data == NULL) {
+        fprintf (stderr, "Results with no data\n");
+        return 1;
+    }
+
+    fprintf (stderr, "Writing PNG\n");
+
+    fprintf (stderr, "DST: %s\n", dst);
+    fprintf (stderr, "Width: %d\n", results[0].width);
+    fprintf (stderr, "Height: %d\n", results[0].height);
+    fprintf (stderr, "Channel: %d\n", results[0].channel);
+    fprintf (stderr, "Data: %p\n", results[0].data);
+
+    stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
+                   results[0].data, 0, NULL);
+    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
+
+    // TODO: free results. Why does it crash?
+
+    free(results[0].data);
+    results[0].data = NULL;
+    free(results);
+    fprintf (stderr, "gen_image is done: %s\n", dst);
+
+    return 0;
+}
+
+int unload() {
+    free_sd_ctx(sd_c);
+    return 0;
+}
+
diff --git a/backend/go/image/stablediffusion-ggml/gosd.go b/backend/go/image/stablediffusion-ggml/gosd.go
new file mode 100644
index 00000000..29d0033d
--- /dev/null
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -0,0 +1,96 @@
+package main
+
+// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
+// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
+// #include <gosd.h>
+// #include <stdlib.h>
+import "C"
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/utils"
+)
+
+type SDGGML struct {
+	base.SingleThread
+	threads      int
+	sampleMethod string
+	cfgScale     float32
+}
+
+func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
+
+	sd.threads = int(opts.Threads)
+
+	modelFile := C.CString(opts.ModelFile)
+	defer C.free(unsafe.Pointer(modelFile))
+
+	var options **C.char
+	// prepare the options array to pass to C (NULL-terminated, since the C side
+	// iterates until it finds a NULL entry)
+
+	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
+	length := C.size_t(len(opts.Options))
+	options = (**C.char)(C.malloc((length + 1) * size))
+	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0 : len(opts.Options)+1 : len(opts.Options)+1]
+
+	var diffusionModel int
+
+	var oo []string
+	for _, op := range opts.Options {
+		if op == "diffusion_model" {
+			diffusionModel = 1
+			continue
+		}
+
+		// If it's an option path, we resolve absolute path from the model path
+		if strings.Contains(op, ":") && strings.Contains(op, "path") {
+			data := strings.Split(op, ":")
+			data[1] = filepath.Join(opts.ModelPath, data[1])
+			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
+				oo = append(oo, strings.Join(data, ":"))
+			}
+		} else {
+			oo = append(oo, op)
+		}
+	}
+
+	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
+
+	for i, x := range oo {
+		view[i] = C.CString(x)
+	}
+	view[len(oo)] = nil
+
+	sd.cfgScale = opts.CFGScale
+
+	ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
+	if ret != 0 {
+		return fmt.Errorf("could not load model")
+	}
+
+	return nil
+}
+
+func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
+	t := C.CString(opts.PositivePrompt)
+	defer C.free(unsafe.Pointer(t))
+
+	dst := C.CString(opts.Dst)
+	defer C.free(unsafe.Pointer(dst))
+
+	negative := C.CString(opts.NegativePrompt)
+	defer C.free(unsafe.Pointer(negative))
+
+	ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
+	if ret != 0 {
+		return fmt.Errorf("inference failed")
+	}
+
+	return nil
+}
diff --git a/backend/go/image/stablediffusion-ggml/gosd.h b/backend/go/image/stablediffusion-ggml/gosd.h
new file mode 100644
index 00000000..5297e871
--- /dev/null
+++ b/backend/go/image/stablediffusion-ggml/gosd.h
@@ -0,0 +1,8 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+int load_model(char *model, char* options[], int threads, int diffusionModel);
+int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
+#ifdef 
__cplusplus +} +#endif \ No newline at end of file diff --git a/backend/go/image/stablediffusion-ggml/main.go b/backend/go/image/stablediffusion-ggml/main.go new file mode 100644 index 00000000..acee74fa --- /dev/null +++ b/backend/go/image/stablediffusion-ggml/main.go @@ -0,0 +1,20 @@ +package main + +// Note: this is started internally by LocalAI and a server is allocated for each model +import ( + "flag" + + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +func main() { + flag.Parse() + + if err := grpc.StartServer(*addr, &SDGGML{}); err != nil { + panic(err) + } +} diff --git a/backend/python/autogptq/requirements-intel.txt b/backend/python/autogptq/requirements-intel.txt index d5e0173e..cec8bff4 100644 --- a/backend/python/autogptq/requirements-intel.txt +++ b/backend/python/autogptq/requirements-intel.txt @@ -2,4 +2,4 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/autogptq/requirements.txt b/backend/python/autogptq/requirements.txt index 2e9d5ad6..22408f47 100644 --- a/backend/python/autogptq/requirements.txt +++ b/backend/python/autogptq/requirements.txt @@ -1,6 +1,6 @@ accelerate auto-gptq==0.7.1 -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi transformers \ No newline at end of file diff --git a/backend/python/bark/requirements-intel.txt b/backend/python/bark/requirements-intel.txt index c0e4dcaa..1f043bbf 100644 --- a/backend/python/bark/requirements-intel.txt +++ b/backend/python/bark/requirements-intel.txt @@ -3,6 +3,6 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate \ No newline at end of file diff --git a/backend/python/bark/requirements.txt b/backend/python/bark/requirements.txt index 1f9e2b35..3fca1de5 100644 --- a/backend/python/bark/requirements.txt +++ b/backend/python/bark/requirements.txt @@ -1,4 +1,4 @@ bark==0.1.5 -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 934b1fd3..6013cf76 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,6 +17,9 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # + +PYTHON_VERSION="3.10" + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -88,7 +91,7 @@ function getBuildProfile() { # always result in an activated virtual environment function ensureVenv() { if [ ! 
-d "${EDIR}/venv" ]; then - uv venv ${EDIR}/venv + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv echo "virtualenv created" fi diff --git a/backend/python/common/template/requirements.txt b/backend/python/common/template/requirements.txt index e4e07678..893dc812 100644 --- a/backend/python/common/template/requirements.txt +++ b/backend/python/common/template/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf grpcio-tools \ No newline at end of file diff --git a/backend/python/coqui/requirements-intel.txt b/backend/python/coqui/requirements-intel.txt index de3b4ee4..7ed2fb42 100644 --- a/backend/python/coqui/requirements-intel.txt +++ b/backend/python/coqui/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate coqui-tts \ No newline at end of file diff --git a/backend/python/coqui/requirements.txt b/backend/python/coqui/requirements.txt index d3313cc6..57638588 100644 --- a/backend/python/coqui/requirements.txt +++ b/backend/python/coqui/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi packaging==24.1 \ No newline at end of file diff --git a/backend/python/diffusers/requirements-intel.txt b/backend/python/diffusers/requirements-intel.txt index 566278a8..bd6632bf 100644 --- a/backend/python/diffusers/requirements-intel.txt +++ b/backend/python/diffusers/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchvision optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools diffusers opencv-python transformers diff --git a/backend/python/diffusers/requirements.txt b/backend/python/diffusers/requirements.txt index 4c3e703d..71832ead 100644 --- a/backend/python/diffusers/requirements.txt +++ b/backend/python/diffusers/requirements.txt @@ -1,5 +1,5 @@ setuptools -grpcio==1.68.0 +grpcio==1.68.1 pillow protobuf certifi diff --git a/backend/python/exllama2/requirements.txt b/backend/python/exllama2/requirements.txt index 3a74e6e8..408eb318 100644 --- a/backend/python/exllama2/requirements.txt +++ b/backend/python/exllama2/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi wheel diff --git a/backend/python/mamba/requirements.txt b/backend/python/mamba/requirements.txt index 99715d67..8e4eabf1 100644 --- a/backend/python/mamba/requirements.txt +++ b/backend/python/mamba/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/openvoice/requirements-intel.txt b/backend/python/openvoice/requirements-intel.txt index d38351b1..7908a889 100644 --- a/backend/python/openvoice/requirements-intel.txt +++ b/backend/python/openvoice/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -grpcio==1.68.0 +grpcio==1.68.1 protobuf librosa==0.9.1 faster-whisper==0.9.0 diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 57557c88..e6a1e5a5 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf librosa faster-whisper @@ -18,3 +18,4 @@ jieba==0.42.1 gradio==3.48.0 langid==1.1.6 llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/parler-tts/requirements-intel.txt b/backend/python/parler-tts/requirements-intel.txt index 
c0e4dcaa..bcb8900e 100644 --- a/backend/python/parler-tts/requirements-intel.txt +++ b/backend/python/parler-tts/requirements-intel.txt @@ -3,6 +3,5 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 transformers accelerate \ No newline at end of file diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index 1555a2cb..faf4ea3d 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,3 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 certifi llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/rerankers/requirements-intel.txt b/backend/python/rerankers/requirements-intel.txt index e6bb4cc7..a3cc600c 100644 --- a/backend/python/rerankers/requirements-intel.txt +++ b/backend/python/rerankers/requirements-intel.txt @@ -5,4 +5,4 @@ accelerate torch rerankers[transformers] optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/rerankers/requirements.txt b/backend/python/rerankers/requirements.txt index 99715d67..8e4eabf1 100644 --- a/backend/python/rerankers/requirements.txt +++ b/backend/python/rerankers/requirements.txt @@ -1,3 +1,3 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements-intel.txt b/backend/python/sentencetransformers/requirements-intel.txt index 56e17446..23e0d5f2 100644 --- a/backend/python/sentencetransformers/requirements-intel.txt +++ b/backend/python/sentencetransformers/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 +setuptools accelerate sentence-transformers==3.3.1 transformers \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements.txt b/backend/python/sentencetransformers/requirements.txt index b39ef126..b9dacf9b 100644 --- a/backend/python/sentencetransformers/requirements.txt +++ b/backend/python/sentencetransformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi datasets diff --git a/backend/python/transformers-musicgen/requirements-intel.txt b/backend/python/transformers-musicgen/requirements-intel.txt index 608d6939..bb191163 100644 --- a/backend/python/transformers-musicgen/requirements-intel.txt +++ b/backend/python/transformers-musicgen/requirements-intel.txt @@ -4,4 +4,4 @@ transformers accelerate torch optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/transformers-musicgen/requirements.txt b/backend/python/transformers-musicgen/requirements.txt index 0a234d10..2e46b08f 100644 --- a/backend/python/transformers-musicgen/requirements.txt +++ b/backend/python/transformers-musicgen/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf scipy==1.14.0 certifi \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index 450071cf..d981fd99 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi -setuptools==69.5.1 # 
https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements-intel.txt b/backend/python/vall-e-x/requirements-intel.txt index adbabeac..284e7131 100644 --- a/backend/python/vall-e-x/requirements-intel.txt +++ b/backend/python/vall-e-x/requirements-intel.txt @@ -3,5 +3,4 @@ intel-extension-for-pytorch accelerate torch torchaudio -optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +optimum[openvino] \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 99715d67..d981fd99 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf -certifi \ No newline at end of file +certifi +setuptools \ No newline at end of file diff --git a/backend/python/vllm/install.sh b/backend/python/vllm/install.sh index 98e2e57e..0183a928 100755 --- a/backend/python/vllm/install.sh +++ b/backend/python/vllm/install.sh @@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then git clone https://github.com/vllm-project/vllm fi pushd vllm - uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.0 protobuf bitsandbytes + uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu VLLM_TARGET_DEVICE=cpu python setup.py install popd diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt index 95443368..36326f95 100644 --- a/backend/python/vllm/requirements-intel.txt +++ b/backend/python/vllm/requirements-intel.txt @@ -4,5 +4,5 @@ accelerate torch transformers optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools bitsandbytes \ No newline at end of file diff --git a/backend/python/vllm/requirements.txt b/backend/python/vllm/requirements.txt index b5872310..d981fd99 100644 --- a/backend/python/vllm/requirements.txt +++ b/backend/python/vllm/requirements.txt @@ -1,4 +1,4 @@ -grpcio==1.68.0 +grpcio==1.68.1 protobuf certifi setuptools \ No newline at end of file diff --git a/core/application.go b/core/application.go deleted file mode 100644 index e4efbdd0..00000000 --- a/core/application.go +++ /dev/null @@ -1,38 +0,0 @@ -package core - -import ( - "github.com/mudler/LocalAI/core/config" - "github.com/mudler/LocalAI/core/services" - "github.com/mudler/LocalAI/pkg/model" -) - -// The purpose of this structure is to hold pointers to all initialized services, to make plumbing easy -// Perhaps a proper DI system is worth it in the future, but for now keep things simple. 
-type Application struct { - - // Application-Level Config - ApplicationConfig *config.ApplicationConfig - // ApplicationState *ApplicationState - - // Core Low-Level Services - BackendConfigLoader *config.BackendConfigLoader - ModelLoader *model.ModelLoader - - // Backend Services - // EmbeddingsBackendService *backend.EmbeddingsBackendService - // ImageGenerationBackendService *backend.ImageGenerationBackendService - // LLMBackendService *backend.LLMBackendService - // TranscriptionBackendService *backend.TranscriptionBackendService - // TextToSpeechBackendService *backend.TextToSpeechBackendService - - // LocalAI System Services - BackendMonitorService *services.BackendMonitorService - GalleryService *services.GalleryService - LocalAIMetricsService *services.LocalAIMetricsService - // OpenAIService *services.OpenAIService -} - -// TODO [NEXT PR?]: Break up ApplicationConfig. -// Migrate over stuff that is not set via config at all - especially runtime stuff -type ApplicationState struct { -} diff --git a/core/application/application.go b/core/application/application.go new file mode 100644 index 00000000..6e8d6204 --- /dev/null +++ b/core/application/application.go @@ -0,0 +1,39 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" +) + +type Application struct { + backendLoader *config.BackendConfigLoader + modelLoader *model.ModelLoader + applicationConfig *config.ApplicationConfig + templatesEvaluator *templates.Evaluator +} + +func newApplication(appConfig *config.ApplicationConfig) *Application { + return &Application{ + backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath), + modelLoader: model.NewModelLoader(appConfig.ModelPath), + applicationConfig: appConfig, + templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath), + } +} + +func (a *Application) BackendLoader() *config.BackendConfigLoader { + return a.backendLoader +} + +func (a *Application) ModelLoader() *model.ModelLoader { + return a.modelLoader +} + +func (a *Application) ApplicationConfig() *config.ApplicationConfig { + return a.applicationConfig +} + +func (a *Application) TemplatesEvaluator() *templates.Evaluator { + return a.templatesEvaluator +} diff --git a/core/startup/config_file_watcher.go b/core/application/config_file_watcher.go similarity index 96% rename from core/startup/config_file_watcher.go rename to core/application/config_file_watcher.go index df72483f..46f29b10 100644 --- a/core/startup/config_file_watcher.go +++ b/core/application/config_file_watcher.go @@ -1,4 +1,4 @@ -package startup +package application import ( "encoding/json" @@ -8,8 +8,8 @@ import ( "path/filepath" "time" - "github.com/fsnotify/fsnotify" "dario.cat/mergo" + "github.com/fsnotify/fsnotify" "github.com/mudler/LocalAI/core/config" "github.com/rs/zerolog/log" ) diff --git a/core/startup/startup.go b/core/application/startup.go similarity index 62% rename from core/startup/startup.go rename to core/application/startup.go index 0eb5fa58..cd52d37a 100644 --- a/core/startup/startup.go +++ b/core/application/startup.go @@ -1,15 +1,15 @@ -package startup +package application import ( "fmt" "os" - "github.com/mudler/LocalAI/core" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/services" "github.com/mudler/LocalAI/internal" "github.com/mudler/LocalAI/pkg/assets" + "github.com/mudler/LocalAI/pkg/library" "github.com/mudler/LocalAI/pkg/model" 
pkgStartup "github.com/mudler/LocalAI/pkg/startup" @@ -17,8 +17,9 @@ import ( "github.com/rs/zerolog/log" ) -func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.ModelLoader, *config.ApplicationConfig, error) { +func New(opts ...config.AppOption) (*Application, error) { options := config.NewApplicationConfig(opts...) + application := newApplication(options) log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.ModelPath) log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion()) @@ -36,28 +37,28 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode // Make sure directories exists if options.ModelPath == "" { - return nil, nil, nil, fmt.Errorf("options.ModelPath cannot be empty") + return nil, fmt.Errorf("options.ModelPath cannot be empty") } err = os.MkdirAll(options.ModelPath, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create ModelPath: %q", err) + return nil, fmt.Errorf("unable to create ModelPath: %q", err) } if options.ImageDir != "" { err := os.MkdirAll(options.ImageDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create ImageDir: %q", err) + return nil, fmt.Errorf("unable to create ImageDir: %q", err) } } if options.AudioDir != "" { err := os.MkdirAll(options.AudioDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create AudioDir: %q", err) + return nil, fmt.Errorf("unable to create AudioDir: %q", err) } } if options.UploadDir != "" { err := os.MkdirAll(options.UploadDir, 0750) if err != nil { - return nil, nil, nil, fmt.Errorf("unable to create UploadDir: %q", err) + return nil, fmt.Errorf("unable to create UploadDir: %q", err) } } @@ -65,39 +66,36 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode log.Error().Err(err).Msg("error installing models") } - cl := config.NewBackendConfigLoader(options.ModelPath) - ml := model.NewModelLoader(options.ModelPath) - configLoaderOpts := options.ToConfigLoaderOptions() - if err := cl.LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil { + if err := application.BackendLoader().LoadBackendConfigsFromPath(options.ModelPath, configLoaderOpts...); err != nil { log.Error().Err(err).Msg("error loading config files") } if options.ConfigFile != "" { - if err := cl.LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil { + if err := application.BackendLoader().LoadMultipleBackendConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil { log.Error().Err(err).Msg("error loading config file") } } - if err := cl.Preload(options.ModelPath); err != nil { + if err := application.BackendLoader().Preload(options.ModelPath); err != nil { log.Error().Err(err).Msg("error downloading models") } if options.PreloadJSONModels != "" { if err := services.ApplyGalleryFromString(options.ModelPath, options.PreloadJSONModels, options.EnforcePredownloadScans, options.Galleries); err != nil { - return nil, nil, nil, err + return nil, err } } if options.PreloadModelsFromPath != "" { if err := services.ApplyGalleryFromFile(options.ModelPath, options.PreloadModelsFromPath, options.EnforcePredownloadScans, options.Galleries); err != nil { - return nil, nil, nil, err + return nil, err } } if options.Debug { - for _, v := range cl.GetAllBackendConfigs() { + for _, v := range application.BackendLoader().GetAllBackendConfigs() { log.Debug().Msgf("Model: %s (config: %+v)", v.Name, v) } } @@ -123,7 +121,7 
@@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode go func() { <-options.Context.Done() log.Debug().Msgf("Context canceled, shutting down") - err := ml.StopAllGRPC() + err := application.ModelLoader().StopAllGRPC() if err != nil { log.Error().Err(err).Msg("error while stopping all grpc backends") } @@ -131,12 +129,12 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode if options.WatchDog { wd := model.NewWatchDog( - ml, + application.ModelLoader(), options.WatchDogBusyTimeout, options.WatchDogIdleTimeout, options.WatchDogBusy, options.WatchDogIdle) - ml.SetWatchDog(wd) + application.ModelLoader().SetWatchDog(wd) go wd.Run() go func() { <-options.Context.Done() @@ -147,7 +145,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode if options.LoadToMemory != nil { for _, m := range options.LoadToMemory { - cfg, err := cl.LoadBackendConfigFileByName(m, options.ModelPath, + cfg, err := application.BackendLoader().LoadBackendConfigFileByName(m, options.ModelPath, config.LoadOptionDebug(options.Debug), config.LoadOptionThreads(options.Threads), config.LoadOptionContextSize(options.ContextSize), @@ -155,7 +153,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode config.ModelPath(options.ModelPath), ) if err != nil { - return nil, nil, nil, err + return nil, err } log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model) @@ -163,9 +161,9 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode o := backend.ModelOptions(*cfg, options) var backendErr error - _, backendErr = ml.Load(o...) + _, backendErr = application.ModelLoader().Load(o...) if backendErr != nil { - return nil, nil, nil, err + return nil, err } } } @@ -174,7 +172,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode startWatcher(options) log.Info().Msg("core/startup process completed!") - return cl, ml, options, nil + return application, nil } func startWatcher(options *config.ApplicationConfig) { @@ -201,32 +199,3 @@ func startWatcher(options *config.ApplicationConfig) { log.Error().Err(err).Msg("failed creating watcher") } } - -// In Lieu of a proper DI framework, this function wires up the Application manually. -// This is in core/startup rather than core/state.go to keep package references clean! 
-func createApplication(appConfig *config.ApplicationConfig) *core.Application { - app := &core.Application{ - ApplicationConfig: appConfig, - BackendConfigLoader: config.NewBackendConfigLoader(appConfig.ModelPath), - ModelLoader: model.NewModelLoader(appConfig.ModelPath), - } - - var err error - - // app.EmbeddingsBackendService = backend.NewEmbeddingsBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.ImageGenerationBackendService = backend.NewImageGenerationBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.LLMBackendService = backend.NewLLMBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.TranscriptionBackendService = backend.NewTranscriptionBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - // app.TextToSpeechBackendService = backend.NewTextToSpeechBackendService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - - app.BackendMonitorService = services.NewBackendMonitorService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig) - app.GalleryService = services.NewGalleryService(app.ApplicationConfig) - // app.OpenAIService = services.NewOpenAIService(app.ModelLoader, app.BackendConfigLoader, app.ApplicationConfig, app.LLMBackendService) - - app.LocalAIMetricsService, err = services.NewLocalAIMetricsService() - if err != nil { - log.Error().Err(err).Msg("encountered an error initializing metrics service, startup will continue but metrics will not be tracked.") - } - - return app -} diff --git a/core/backend/options.go b/core/backend/options.go index c6591222..f6247c60 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -122,7 +122,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { CUDA: c.CUDA || c.Diffusers.CUDA, SchedulerType: c.Diffusers.SchedulerType, PipelineType: c.Diffusers.PipelineType, - CFGScale: c.Diffusers.CFGScale, + CFGScale: c.CFGScale, LoraAdapter: c.LoraAdapter, LoraScale: c.LoraScale, LoraAdapters: c.LoraAdapters, @@ -132,6 +132,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { IMG2IMG: c.Diffusers.IMG2IMG, CLIPModel: c.Diffusers.ClipModel, CLIPSubfolder: c.Diffusers.ClipSubFolder, + Options: c.Options, CLIPSkip: int32(c.Diffusers.ClipSkip), ControlNet: c.Diffusers.ControlNet, ContextSize: int32(ctxSize), @@ -150,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { TensorParallelSize: int32(c.TensorParallelSize), MMProj: c.MMProj, FlashAttention: c.FlashAttention, + CacheTypeKey: c.CacheTypeK, + CacheTypeValue: c.CacheTypeV, NoKVOffload: c.NoKVOffloading, YarnExtFactor: c.YarnExtFactor, YarnAttnFactor: c.YarnAttnFactor, diff --git a/core/cli/run.go b/core/cli/run.go index b2d439a0..a0e16155 100644 --- a/core/cli/run.go +++ b/core/cli/run.go @@ -6,12 +6,12 @@ import ( "strings" "time" + "github.com/mudler/LocalAI/core/application" cli_api "github.com/mudler/LocalAI/core/cli/api" cliContext "github.com/mudler/LocalAI/core/cli/context" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/http" "github.com/mudler/LocalAI/core/p2p" - "github.com/mudler/LocalAI/core/startup" "github.com/rs/zerolog" "github.com/rs/zerolog/log" ) @@ -186,16 +186,16 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error { } if r.PreloadBackendOnly { - _, _, _, err := startup.Startup(opts...) + _, err := application.New(opts...) return err } - cl, ml, options, err := startup.Startup(opts...) + app, err := application.New(opts...) 
if err != nil { return fmt.Errorf("failed basic startup tasks with error %s", err.Error()) } - appHTTP, err := http.App(cl, ml, options) + appHTTP, err := http.API(app) if err != nil { log.Error().Err(err).Msg("error during HTTP App construction") return err diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 998f22a3..f07ec3d3 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -72,6 +72,8 @@ type BackendConfig struct { Description string `yaml:"description"` Usage string `yaml:"usage"` + + Options []string `yaml:"options"` } type File struct { @@ -97,16 +99,15 @@ type GRPC struct { } type Diffusers struct { - CUDA bool `yaml:"cuda"` - PipelineType string `yaml:"pipeline_type"` - SchedulerType string `yaml:"scheduler_type"` - EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify - CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale - IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser - ClipSkip int `yaml:"clip_skip"` // Skip every N frames - ClipModel string `yaml:"clip_model"` // Clip model to use - ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model - ControlNet string `yaml:"control_net"` + CUDA bool `yaml:"cuda"` + PipelineType string `yaml:"pipeline_type"` + SchedulerType string `yaml:"scheduler_type"` + EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify + IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser + ClipSkip int `yaml:"clip_skip"` // Skip every N frames + ClipModel string `yaml:"clip_model"` // Clip model to use + ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model + ControlNet string `yaml:"control_net"` } // LLMConfig is a struct that holds the configuration that are @@ -154,8 +155,10 @@ type LLMConfig struct { TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM MMProj string `yaml:"mmproj"` - FlashAttention bool `yaml:"flash_attention"` - NoKVOffloading bool `yaml:"no_kv_offloading"` + FlashAttention bool `yaml:"flash_attention"` + NoKVOffloading bool `yaml:"no_kv_offloading"` + CacheTypeK string `yaml:"cache_type_k"` + CacheTypeV string `yaml:"cache_type_v"` RopeScaling string `yaml:"rope_scaling"` ModelType string `yaml:"type"` @@ -164,6 +167,8 @@ type LLMConfig struct { YarnAttnFactor float32 `yaml:"yarn_attn_factor"` YarnBetaFast float32 `yaml:"yarn_beta_fast"` YarnBetaSlow float32 `yaml:"yarn_beta_slow"` + + CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale } // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend @@ -201,6 +206,8 @@ type TemplateConfig struct { JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"` Multimodal string `yaml:"multimodal"` + + JinjaTemplate bool `yaml:"jinja_template"` } func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error { diff --git a/core/config/guesser.go b/core/config/guesser.go index b63dd051..f5627461 100644 --- a/core/config/guesser.go +++ b/core/config/guesser.go @@ -26,14 +26,14 @@ const ( type settingsConfig struct { StopWords []string TemplateConfig TemplateConfig - RepeatPenalty float64 + RepeatPenalty float64 } // default settings to adopt with a given model family var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ Gemma: { RepeatPenalty: 1.0, - StopWords: []string{"<|im_end|>", "", ""}, + StopWords: []string{"<|im_end|>", "", ""}, TemplateConfig: 
TemplateConfig{ Chat: "{{.Input }}\nmodel\n", ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", @@ -200,6 +200,18 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) { } else { log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family") } + + if cfg.HasTemplate() { + return + } + + // identify from well known templates first, otherwise use the raw jinja template + chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") + if found { + // try to use the jinja template + cfg.TemplateConfig.JinjaTemplate = true + cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() + } } func identifyFamily(f *gguf.GGUFFile) familyType { diff --git a/core/http/app.go b/core/http/app.go index 2ba2c2b9..a2d8b87a 100644 --- a/core/http/app.go +++ b/core/http/app.go @@ -14,10 +14,9 @@ import ( "github.com/mudler/LocalAI/core/http/middleware" "github.com/mudler/LocalAI/core/http/routes" - "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/services" - "github.com/mudler/LocalAI/pkg/model" "github.com/gofiber/contrib/fiberzerolog" "github.com/gofiber/fiber/v2" @@ -49,18 +48,18 @@ var embedDirStatic embed.FS // @in header // @name Authorization -func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) { +func API(application *application.Application) (*fiber.App, error) { fiberCfg := fiber.Config{ Views: renderEngine(), - BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB + BodyLimit: application.ApplicationConfig().UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB // We disable the Fiber startup message as it does not conform to structured logging. 
// We register a startup log line with connection information in the OnListen hook to keep things user friendly though DisableStartupMessage: true, // Override default error handler } - if !appConfig.OpaqueErrors { + if !application.ApplicationConfig().OpaqueErrors { // Normally, return errors as JSON responses fiberCfg.ErrorHandler = func(ctx *fiber.Ctx, err error) error { // Status code defaults to 500 @@ -86,9 +85,9 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi } } - app := fiber.New(fiberCfg) + router := fiber.New(fiberCfg) - app.Hooks().OnListen(func(listenData fiber.ListenData) error { + router.Hooks().OnListen(func(listenData fiber.ListenData) error { scheme := "http" if listenData.TLS { scheme = "https" @@ -99,82 +98,82 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi // Have Fiber use zerolog like the rest of the application rather than it's built-in logger logger := log.Logger - app.Use(fiberzerolog.New(fiberzerolog.Config{ + router.Use(fiberzerolog.New(fiberzerolog.Config{ Logger: &logger, })) // Default middleware config - if !appConfig.Debug { - app.Use(recover.New()) + if !application.ApplicationConfig().Debug { + router.Use(recover.New()) } - if !appConfig.DisableMetrics { + if !application.ApplicationConfig().DisableMetrics { metricsService, err := services.NewLocalAIMetricsService() if err != nil { return nil, err } if metricsService != nil { - app.Use(localai.LocalAIMetricsAPIMiddleware(metricsService)) - app.Hooks().OnShutdown(func() error { + router.Use(localai.LocalAIMetricsAPIMiddleware(metricsService)) + router.Hooks().OnShutdown(func() error { return metricsService.Shutdown() }) } } // Health Checks should always be exempt from auth, so register these first - routes.HealthRoutes(app) + routes.HealthRoutes(router) - kaConfig, err := middleware.GetKeyAuthConfig(appConfig) + kaConfig, err := middleware.GetKeyAuthConfig(application.ApplicationConfig()) if err != nil || kaConfig == nil { return nil, fmt.Errorf("failed to create key auth config: %w", err) } // Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration - app.Use(v2keyauth.New(*kaConfig)) + router.Use(v2keyauth.New(*kaConfig)) - if appConfig.CORS { + if application.ApplicationConfig().CORS { var c func(ctx *fiber.Ctx) error - if appConfig.CORSAllowOrigins == "" { + if application.ApplicationConfig().CORSAllowOrigins == "" { c = cors.New() } else { - c = cors.New(cors.Config{AllowOrigins: appConfig.CORSAllowOrigins}) + c = cors.New(cors.Config{AllowOrigins: application.ApplicationConfig().CORSAllowOrigins}) } - app.Use(c) + router.Use(c) } - if appConfig.CSRF { + if application.ApplicationConfig().CSRF { log.Debug().Msg("Enabling CSRF middleware. 
Tokens are now required for state-modifying requests") - app.Use(csrf.New()) + router.Use(csrf.New()) } // Load config jsons - utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles) - utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants) - utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles) + utils.LoadConfig(application.ApplicationConfig().UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles) + utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants) + utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles) - galleryService := services.NewGalleryService(appConfig) - galleryService.Start(appConfig.Context, cl) + galleryService := services.NewGalleryService(application.ApplicationConfig()) + galleryService.Start(application.ApplicationConfig().Context, application.BackendLoader()) - routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig) - routes.RegisterLocalAIRoutes(app, cl, ml, appConfig, galleryService) - routes.RegisterOpenAIRoutes(app, cl, ml, appConfig) - if !appConfig.DisableWebUI { - routes.RegisterUIRoutes(app, cl, ml, appConfig, galleryService) + routes.RegisterElevenLabsRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()) + routes.RegisterLocalAIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService) + routes.RegisterOpenAIRoutes(router, application) + if !application.ApplicationConfig().DisableWebUI { + routes.RegisterUIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService) } - routes.RegisterJINARoutes(app, cl, ml, appConfig) + routes.RegisterJINARoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()) httpFS := http.FS(embedDirStatic) - app.Use(favicon.New(favicon.Config{ + router.Use(favicon.New(favicon.Config{ URL: "/favicon.ico", FileSystem: httpFS, File: "static/favicon.ico", })) - app.Use("/static", filesystem.New(filesystem.Config{ + router.Use("/static", filesystem.New(filesystem.Config{ Root: httpFS, PathPrefix: "static", Browse: true, @@ -182,7 +181,7 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi // Define a custom 404 handler // Note: keep this at the bottom! - app.Use(notFoundHandler) + router.Use(notFoundHandler) - return app, nil + return router, nil } diff --git a/core/http/app_test.go b/core/http/app_test.go index 28ed0ab9..7c57ba21 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -5,24 +5,21 @@ import ( "context" "embed" "encoding/json" - "errors" "fmt" "io" "net/http" "os" "path/filepath" "runtime" - "strings" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/config" . "github.com/mudler/LocalAI/core/http" "github.com/mudler/LocalAI/core/schema" - "github.com/mudler/LocalAI/core/startup" "github.com/gofiber/fiber/v2" "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/pkg/downloader" - "github.com/mudler/LocalAI/pkg/model" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "gopkg.in/yaml.v3" @@ -254,9 +251,6 @@ var _ = Describe("API test", func() { var cancel context.CancelFunc var tmpdir string var modelDir string - var bcl *config.BackendConfigLoader - var ml *model.ModelLoader - var applicationConfig *config.ApplicationConfig commonOpts := []config.AppOption{ config.WithDebug(true), @@ -302,7 +296,7 @@ var _ = Describe("API test", func() { }, } - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithGalleries(galleries), @@ -312,7 +306,7 @@ var _ = Describe("API test", func() { config.WithBackendAssetsOutput(backendAssetsDir))...) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -541,7 +535,7 @@ var _ = Describe("API test", func() { var res map[string]string err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res) Expect(err).ToNot(HaveOccurred()) - Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res)) + Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res)) Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res)) Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason)) @@ -643,7 +637,7 @@ var _ = Describe("API test", func() { }, } - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithAudioDir(tmpdir), @@ -654,7 +648,7 @@ var _ = Describe("API test", func() { config.WithBackendAssetsOutput(tmpdir))..., ) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -710,7 +704,7 @@ var _ = Describe("API test", func() { Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp)) Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat))) - Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav")) + Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave"))) }) It("installs and is capable to generate images", Label("stablediffusion"), func() { if runtime.GOOS != "linux" { @@ -774,14 +768,14 @@ var _ = Describe("API test", func() { var err error - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")), config.WithContext(c), config.WithModelPath(modelPath), )...) 
Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") @@ -913,71 +907,6 @@ var _ = Describe("API test", func() { }) }) - Context("backends", func() { - It("runs rwkv completion", func() { - if runtime.GOOS != "linux" { - Skip("test supported only on linux") - } - resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(resp.Choices) > 0).To(BeTrue()) - Expect(resp.Choices[0].Text).To(ContainSubstring("five")) - - stream, err := client.CreateCompletionStream(context.TODO(), openai.CompletionRequest{ - Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,", Stream: true, - }) - Expect(err).ToNot(HaveOccurred()) - defer stream.Close() - - tokens := 0 - text := "" - for { - response, err := stream.Recv() - if errors.Is(err, io.EOF) { - break - } - - Expect(err).ToNot(HaveOccurred()) - text += response.Choices[0].Text - tokens++ - } - Expect(text).ToNot(BeEmpty()) - Expect(text).To(ContainSubstring("five")) - Expect(tokens).ToNot(Or(Equal(1), Equal(0))) - }) - It("runs rwkv chat completion", func() { - if runtime.GOOS != "linux" { - Skip("test supported only on linux") - } - resp, err := client.CreateChatCompletion(context.TODO(), - openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) - Expect(err).ToNot(HaveOccurred()) - Expect(len(resp.Choices) > 0).To(BeTrue()) - Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five"), ContainSubstring("5"))) - - stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) - Expect(err).ToNot(HaveOccurred()) - defer stream.Close() - - tokens := 0 - text := "" - for { - response, err := stream.Recv() - if errors.Is(err, io.EOF) { - break - } - - Expect(err).ToNot(HaveOccurred()) - text += response.Choices[0].Delta.Content - tokens++ - } - Expect(text).ToNot(BeEmpty()) - Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five"))) - - Expect(tokens).ToNot(Or(Equal(1), Equal(0))) - }) - }) - // See tests/integration/stores_test Context("Stores", Label("stores"), func() { @@ -1057,14 +986,14 @@ var _ = Describe("API test", func() { c, cancel = context.WithCancel(context.Background()) var err error - bcl, ml, applicationConfig, err = startup.Startup( + application, err := application.New( append(commonOpts, config.WithContext(c), config.WithModelPath(modelPath), config.WithConfigFile(os.Getenv("CONFIG_FILE")))..., ) Expect(err).ToNot(HaveOccurred()) - app, err = App(bcl, ml, applicationConfig) + app, err = API(application) Expect(err).ToNot(HaveOccurred()) go app.Listen("127.0.0.1:9090") diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 1ac1387e..c2b201bd 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -14,6 +14,8 @@ import ( "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" + "github.com/mudler/LocalAI/pkg/templates" + model "github.com/mudler/LocalAI/pkg/model" "github.com/rs/zerolog/log" 
"github.com/valyala/fasthttp" @@ -24,7 +26,7 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/chat/completions [post] -func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error { +func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error { var id, textContentToReturn string var created int @@ -294,148 +296,10 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup // If we are using the tokenizer template, we don't need to process the messages // unless we are processing functions if !config.TemplateConfig.UseTokenizerTemplate || shouldUseFn { - suppressConfigSystemPrompt := false - mess := []string{} - for messageIndex, i := range input.Messages { - var content string - role := i.Role - - // if function call, we might want to customize the role so we can display better that the "assistant called a json action" - // if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request - if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" { - roleFn := "assistant_function_call" - r := config.Roles[roleFn] - if r != "" { - role = roleFn - } - } - r := config.Roles[role] - contentExists := i.Content != nil && i.StringContent != "" - - fcall := i.FunctionCall - if len(i.ToolCalls) > 0 { - fcall = i.ToolCalls - } - - // First attempt to populate content via a chat message specific template - if config.TemplateConfig.ChatMessage != "" { - chatMessageData := model.ChatMessageTemplateData{ - SystemPrompt: config.SystemPrompt, - Role: r, - RoleName: role, - Content: i.StringContent, - FunctionCall: fcall, - FunctionName: i.Name, - LastMessage: messageIndex == (len(input.Messages) - 1), - Function: config.Grammar != "" && (messageIndex == (len(input.Messages) - 1)), - MessageIndex: messageIndex, - } - templatedChatMessage, err := ml.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData) - if err != nil { - log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping") - } else { - if templatedChatMessage == "" { - log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData) - continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf - } - log.Debug().Msgf("templated message for chat: %s", templatedChatMessage) - content = templatedChatMessage - } - } - - marshalAnyRole := func(f any) { - j, err := json.Marshal(f) - if err == nil { - if contentExists { - content += "\n" + fmt.Sprint(r, " ", string(j)) - } else { - content = fmt.Sprint(r, " ", string(j)) - } - } - } - marshalAny := func(f any) { - j, err := json.Marshal(f) - if err == nil { - if contentExists { - content += "\n" + string(j) - } else { - content = string(j) - } - } - } - // If this model doesn't have such a template, or if that template fails to return a value, template at the message level. 
- if content == "" { - if r != "" { - if contentExists { - content = fmt.Sprint(r, i.StringContent) - } - - if i.FunctionCall != nil { - marshalAnyRole(i.FunctionCall) - } - if i.ToolCalls != nil { - marshalAnyRole(i.ToolCalls) - } - } else { - if contentExists { - content = fmt.Sprint(i.StringContent) - } - if i.FunctionCall != nil { - marshalAny(i.FunctionCall) - } - if i.ToolCalls != nil { - marshalAny(i.ToolCalls) - } - } - // Special Handling: System. We care if it was printed at all, not the r branch, so check seperately - if contentExists && role == "system" { - suppressConfigSystemPrompt = true - } - } - - mess = append(mess, content) - } - - joinCharacter := "\n" - if config.TemplateConfig.JoinChatMessagesByCharacter != nil { - joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter - } - - predInput = strings.Join(mess, joinCharacter) - log.Debug().Msgf("Prompt (before templating): %s", predInput) - - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Chat != "" && !shouldUseFn { - templateFile = config.TemplateConfig.Chat - } - - if config.TemplateConfig.Functions != "" && shouldUseFn { - templateFile = config.TemplateConfig.Functions - } - - if templateFile != "" { - templatedInput, err := ml.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{ - SystemPrompt: config.SystemPrompt, - SuppressSystemPrompt: suppressConfigSystemPrompt, - Input: predInput, - Functions: funcs, - }) - if err == nil { - predInput = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", predInput) - } else { - log.Debug().Msgf("Template failed loading: %s", err.Error()) - } - } + predInput = evaluator.TemplateMessages(input.Messages, config, funcs, shouldUseFn) log.Debug().Msgf("Prompt (after templating): %s", predInput) - if shouldUseFn && config.Grammar != "" { + if config.Grammar != "" { log.Debug().Msgf("Grammar: %+v", config.Grammar) } } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index e5de1b3f..04ebc847 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -16,6 +16,7 @@ import ( "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/functions" model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" "github.com/rs/zerolog/log" "github.com/valyala/fasthttp" ) @@ -25,7 +26,7 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/completions [post] -func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { +func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { id := uuid.New().String() created := int(time.Now().Unix()) @@ -94,17 +95,6 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a c.Set("Transfer-Encoding", "chunked") } - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Completion != "" 
{ - templateFile = config.TemplateConfig.Completion - } - if input.Stream { if len(config.PromptStrings) > 1 { return errors.New("cannot handle more than 1 `PromptStrings` when Streaming") @@ -112,15 +102,13 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a predInput := config.PromptStrings[0] - if templateFile != "" { - templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{ - Input: predInput, - SystemPrompt: config.SystemPrompt, - }) - if err == nil { - predInput = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", predInput) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{ + Input: predInput, + SystemPrompt: config.SystemPrompt, + }) + if err == nil { + predInput = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", predInput) } responses := make(chan schema.OpenAIResponse) @@ -165,16 +153,13 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a totalTokenUsage := backend.TokenUsage{} for k, i := range config.PromptStrings { - if templateFile != "" { - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - templatedInput, err := ml.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{ - SystemPrompt: config.SystemPrompt, - Input: i, - }) - if err == nil { - i = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", i) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.CompletionPromptTemplate, *config, templates.PromptTemplateData{ + SystemPrompt: config.SystemPrompt, + Input: i, + }) + if err == nil { + i = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", i) } r, tokenUsage, err := ComputeChoices( diff --git a/core/http/endpoints/openai/edit.go b/core/http/endpoints/openai/edit.go index 12fb4035..a6d609fb 100644 --- a/core/http/endpoints/openai/edit.go +++ b/core/http/endpoints/openai/edit.go @@ -12,6 +12,7 @@ import ( "github.com/google/uuid" "github.com/mudler/LocalAI/core/schema" model "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/templates" "github.com/rs/zerolog/log" ) @@ -21,7 +22,8 @@ import ( // @Param request body schema.OpenAIRequest true "query params" // @Success 200 {object} schema.OpenAIResponse "Response" // @Router /v1/edits [post] -func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { +func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { + return func(c *fiber.Ctx) error { modelFile, input, err := readRequest(c, cl, ml, appConfig, true) if err != nil { @@ -35,31 +37,18 @@ func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConf log.Debug().Msgf("Parameter Config: %+v", config) - templateFile := "" - - // A model can have a "file.bin.tmpl" file associated with a prompt template prefix - if ml.ExistsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { - templateFile = config.Model - } - - if config.TemplateConfig.Edit != "" { - templateFile = config.TemplateConfig.Edit - } - var result []schema.Choice totalTokenUsage := backend.TokenUsage{} for _, i := range config.InputStrings { - if templateFile != "" { - 
templatedInput, err := ml.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{ - Input: i, - Instruction: input.Instruction, - SystemPrompt: config.SystemPrompt, - }) - if err == nil { - i = templatedInput - log.Debug().Msgf("Template found, input modified to: %s", i) - } + templatedInput, err := evaluator.EvaluateTemplateForPrompt(templates.EditPromptTemplate, *config, templates.PromptTemplateData{ + Input: i, + Instruction: input.Instruction, + SystemPrompt: config.SystemPrompt, + }) + if err == nil { + i = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", i) } r, tokenUsage, err := ComputeChoices(input, i, config, appConfig, ml, func(s string, c *[]schema.Choice) { diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index e7097741..2ea9896a 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -11,62 +11,62 @@ import ( "github.com/mudler/LocalAI/pkg/model" ) -func RegisterLocalAIRoutes(app *fiber.App, +func RegisterLocalAIRoutes(router *fiber.App, cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, galleryService *services.GalleryService) { - app.Get("/swagger/*", swagger.HandlerDefault) // default + router.Get("/swagger/*", swagger.HandlerDefault) // default // LocalAI API endpoints if !appConfig.DisableGalleryEndpoint { modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService) - app.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint()) - app.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint()) + router.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint()) + router.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint()) - app.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint()) - app.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint()) - app.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint()) - app.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint()) - app.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint()) - app.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint()) + router.Get("/models/available", modelGalleryEndpointService.ListModelFromGalleryEndpoint()) + router.Get("/models/galleries", modelGalleryEndpointService.ListModelGalleriesEndpoint()) + router.Post("/models/galleries", modelGalleryEndpointService.AddModelGalleryEndpoint()) + router.Delete("/models/galleries", modelGalleryEndpointService.RemoveModelGalleryEndpoint()) + router.Get("/models/jobs/:uuid", modelGalleryEndpointService.GetOpStatusEndpoint()) + router.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint()) } - app.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig)) - app.Post("/vad", localai.VADEndpoint(cl, ml, appConfig)) + router.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig)) + router.Post("/vad", localai.VADEndpoint(cl, ml, appConfig)) // Stores sl := model.NewModelLoader("") - app.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig)) - app.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig)) - app.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig)) - app.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig)) + router.Post("/stores/set", 
localai.StoresSetEndpoint(sl, appConfig)) + router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig)) + router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig)) + router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig)) if !appConfig.DisableMetrics { - app.Get("/metrics", localai.LocalAIMetricsEndpoint()) + router.Get("/metrics", localai.LocalAIMetricsEndpoint()) } // Experimental Backend Statistics Module backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now - app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService)) - app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService)) + router.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService)) + router.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService)) // p2p if p2p.IsP2PEnabled() { - app.Get("/api/p2p", localai.ShowP2PNodes(appConfig)) - app.Get("/api/p2p/token", localai.ShowP2PToken(appConfig)) + router.Get("/api/p2p", localai.ShowP2PNodes(appConfig)) + router.Get("/api/p2p/token", localai.ShowP2PToken(appConfig)) } - app.Get("/version", func(c *fiber.Ctx) error { + router.Get("/version", func(c *fiber.Ctx) error { return c.JSON(struct { Version string `json:"version"` }{Version: internal.PrintableVersion()}) }) - app.Get("/system", localai.SystemInformations(ml, appConfig)) + router.Get("/system", localai.SystemInformations(ml, appConfig)) // misc - app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig)) + router.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig)) } diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go index 081daf70..5ff301b6 100644 --- a/core/http/routes/openai.go +++ b/core/http/routes/openai.go @@ -2,84 +2,134 @@ package routes import ( "github.com/gofiber/fiber/v2" - "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/application" "github.com/mudler/LocalAI/core/http/endpoints/localai" "github.com/mudler/LocalAI/core/http/endpoints/openai" - "github.com/mudler/LocalAI/pkg/model" ) func RegisterOpenAIRoutes(app *fiber.App, - cl *config.BackendConfigLoader, - ml *model.ModelLoader, - appConfig *config.ApplicationConfig) { + application *application.Application) { // openAI compatible API endpoint // chat - app.Post("/v1/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) - app.Post("/chat/completions", openai.ChatEndpoint(cl, ml, appConfig)) + app.Post("/v1/chat/completions", + openai.ChatEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/chat/completions", + openai.ChatEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // edit - app.Post("/v1/edits", openai.EditEndpoint(cl, ml, appConfig)) - app.Post("/edits", openai.EditEndpoint(cl, ml, appConfig)) + app.Post("/v1/edits", + openai.EditEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/edits", + openai.EditEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // assistant - app.Get("/v1/assistants", openai.ListAssistantsEndpoint(cl, ml, appConfig)) - app.Get("/assistants", 
openai.ListAssistantsEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig)) - app.Post("/assistants", openai.CreateAssistantEndpoint(cl, ml, appConfig)) - app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig)) - app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig)) - app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(cl, ml, appConfig)) - app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig)) - app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(cl, ml, appConfig)) - app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig)) - app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(cl, ml, appConfig)) - app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig)) - app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(cl, ml, appConfig)) + app.Get("/v1/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + 
app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // files - app.Post("/v1/files", openai.UploadFilesEndpoint(cl, appConfig)) - app.Post("/files", openai.UploadFilesEndpoint(cl, appConfig)) - app.Get("/v1/files", openai.ListFilesEndpoint(cl, appConfig)) - app.Get("/files", openai.ListFilesEndpoint(cl, appConfig)) - app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(cl, appConfig)) - app.Get("/files/:file_id", openai.GetFilesEndpoint(cl, appConfig)) - app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig)) - app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(cl, appConfig)) - app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) - app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(cl, appConfig)) + app.Post("/v1/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Post("/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig())) + app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig())) // completion - app.Post("/v1/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/completions", openai.CompletionEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/completions", openai.CompletionEndpoint(cl, ml, appConfig)) + app.Post("/v1/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + 
application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) + + app.Post("/v1/engines/:model/completions", + openai.CompletionEndpoint( + application.BackendLoader(), + application.ModelLoader(), + application.TemplatesEvaluator(), + application.ApplicationConfig(), + ), + ) // embeddings - app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) - app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(cl, ml, appConfig)) + app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // audio - app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(cl, ml, appConfig)) - app.Post("/v1/audio/speech", localai.TTSEndpoint(cl, ml, appConfig)) + app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) + app.Post("/v1/audio/speech", localai.TTSEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) // images - app.Post("/v1/images/generations", openai.ImageEndpoint(cl, ml, appConfig)) + app.Post("/v1/images/generations", openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())) - if appConfig.ImageDir != "" { - app.Static("/generated-images", appConfig.ImageDir) + if application.ApplicationConfig().ImageDir != "" { + app.Static("/generated-images", application.ApplicationConfig().ImageDir) } - if appConfig.AudioDir != "" { - app.Static("/generated-audio", appConfig.AudioDir) + if application.ApplicationConfig().AudioDir != "" { + app.Static("/generated-audio", application.ApplicationConfig().AudioDir) } // List models - app.Get("/v1/models", openai.ListModelsEndpoint(cl, ml)) - app.Get("/models", openai.ListModelsEndpoint(cl, ml)) + app.Get("/v1/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader())) + app.Get("/models", openai.ListModelsEndpoint(application.BackendLoader(), application.ModelLoader())) } diff --git a/docs/content/docs/features/image-generation.md b/docs/content/docs/features/image-generation.md index 5bd12575..864ea040 100644 --- a/docs/content/docs/features/image-generation.md +++ b/docs/content/docs/features/image-generation.md @@ -194,8 +194,9 @@ diffusers: pipeline_type: StableDiffusionPipeline enable_parameters: "negative_prompt,num_inference_steps,clip_skip" scheduler_type: "k_dpmpp_sde" - cfg_scale: 8 clip_skip: 11 + +cfg_scale: 8 ``` #### Configuration parameters @@ -302,7 +303,8 @@ cuda: true diffusers: pipeline_type: StableDiffusionDepth2ImgPipeline enable_parameters: "negative_prompt,num_inference_steps,image" - cfg_scale: 6 + +cfg_scale: 6 ``` ```bash diff --git a/docs/content/docs/getting-started/kubernetes.md b/docs/content/docs/getting-started/kubernetes.md index fb08b046..aea28f3e 100644 --- a/docs/content/docs/getting-started/kubernetes.md +++ b/docs/content/docs/getting-started/kubernetes.md @@ -10,13 +10,13 @@ ico = "rocket_launch" For installing LocalAI in Kubernetes, the deployment file from the `examples` 
can be used and customized as prefered: ``` -kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI/master/examples/kubernetes/deployment.yaml +kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI-examples/refs/heads/main/kubernetes/deployment.yaml ``` For Nvidia GPUs: ``` -kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI/master/examples/kubernetes/deployment-nvidia.yaml +kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI-examples/refs/heads/main/kubernetes/deployment-nvidia.yaml ``` Alternatively, the [helm chart](https://github.com/go-skynet/helm-charts) can be used as well: diff --git a/docs/content/docs/reference/compatibility-table.md b/docs/content/docs/reference/compatibility-table.md index f76ad85d..c3bf2660 100644 --- a/docs/content/docs/reference/compatibility-table.md +++ b/docs/content/docs/reference/compatibility-table.md @@ -6,7 +6,7 @@ weight = 24 url = "/model-compatibility/" +++ -Besides llama based models, LocalAI is compatible also with other architectures. The table below lists all the compatible models families and the associated binding repository. +Besides llama based models, LocalAI is compatible also with other architectures. The table below lists all the backends, compatible models families and the associated repository. {{% alert note %}} @@ -16,19 +16,8 @@ LocalAI will attempt to automatically load models which are not explicitly confi | Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration | |----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------| -| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | Vicuna, Alpaca, LLaMa, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | -| [gpt4all-llama](https://github.com/nomic-ai/gpt4all) | Vicuna, Alpaca, LLaMa | yes | GPT | no | yes | N/A | -| [gpt4all-mpt](https://github.com/nomic-ai/gpt4all) | MPT | yes | GPT | no | yes | N/A | -| [gpt4all-j](https://github.com/nomic-ai/gpt4all) | GPT4ALL-J | yes | GPT | no | yes | N/A | -| [falcon-ggml](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Falcon (*) | yes | GPT | no | no | N/A | -| [dolly](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Dolly | yes | GPT | no | no | N/A | -| [gptj](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | GPTJ | yes | GPT | no | no | N/A | -| [mpt](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | MPT | yes | GPT | no | no | N/A | -| [replit](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | Replit | yes | GPT | no | no | N/A | -| [gptneox](https://github.com/ggerganov/ggml) ([binding](https://github.com/go-skynet/go-ggml-transformers.cpp)) | GPT NeoX, RedPajama, StableLM | yes | GPT | no | no | N/A | -| [bloomz](https://github.com/NouamaneTazi/bloomz.cpp) ([binding](https://github.com/go-skynet/bloomz.cpp)) | Bloom | yes | GPT | no | no | N/A | -| [rwkv](https://github.com/saharNooby/rwkv.cpp) 
([binding](https://github.com/donomii/go-rwkv.cpp)) | rwkv | yes | GPT | no | yes | N/A | -| [bert](https://github.com/skeskinen/bert.cpp) ([binding](https://github.com/go-skynet/go-bert.cpp)) | bert | no | Embeddings only | yes | no | N/A | +| [llama.cpp]({{%relref "docs/features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | +| [llama.cpp's ggml model (backward compatibility with old format, before GGUF)](https://github.com/ggerganov/llama.cpp) ([binding](https://github.com/go-skynet/go-llama.cpp)) | LLama, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes** | yes | CUDA, openCL, cuBLAS, Metal | | [whisper](https://github.com/ggerganov/whisper.cpp) | whisper | no | Audio | no | no | N/A | | [stablediffusion](https://github.com/EdVince/Stable-Diffusion-NCNN) ([binding](https://github.com/mudler/go-stable-diffusion)) | stablediffusion | no | Image | no | no | N/A | | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A | @@ -40,11 +29,18 @@ LocalAI will attempt to automatically load models which are not explicitly confi | `diffusers` | SD,... | no | Image generation | no | no | N/A | | `vall-e-x` | Vall-E | no | Audio generation and Voice cloning | no | no | CPU/CUDA | | `vllm` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | +| `mamba` | Mamba models architecture | yes | GPT | no | no | CPU/CUDA | | `exllama2` | GPTQ | yes | GPT only | no | no | N/A | | `transformers-musicgen` | | no | Audio generation | no | no | N/A | | [tinydream](https://github.com/symisc/tiny-dream#tiny-dreaman-embedded-header-only-stable-diffusion-inference-c-librarypixlabiotiny-dream) | stablediffusion | no | Image | no | no | N/A | | `coqui` | Coqui | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `openvoice` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `parler-tts` | Open voice | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API | no | Reranking | no | no | CPU/CUDA | | `transformers` | Various GPTs and quantization formats | yes | GPT, embeddings | yes | yes**** | CPU/CUDA/XPU | +| [bark-cpp](https://github.com/PABannier/bark.cpp) | bark | no | Audio-Only | no | no | yes | +| [stablediffusion-cpp](https://github.com/leejet/stable-diffusion.cpp) | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker | no | Image | no | no | N/A | +| [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU | Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "docs/advanced" %}})). 
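As a concrete illustration of the note above, and of the new `BackendConfig` fields this patch introduces in `core/config/backend_config.go` (`cache_type_k`/`cache_type_v` and `template.jinja_template`; the new `options` list and the relocated top-level `cfg_scale` are already shown elsewhere in this patch, in `gallery/flux-ggml.yaml` and the image-generation docs), a model YAML might look roughly like the sketch below. This is only a sketch: the model name, file name, backend string and cache-type values are hypothetical, and `backend` can be any backend name from the table above.

```yaml
# Hypothetical model definition; keys follow the yaml tags added in this patch.
name: my-model                  # hypothetical model name
backend: llama-cpp              # assumed llama.cpp backend name; any backend from the table above works here
parameters:
  model: my-model.Q4_K_M.gguf   # hypothetical GGUF file in the models directory

flash_attention: true
cache_type_k: q8_0              # new: KV-cache key quantization type (assumed llama.cpp type string)
cache_type_v: q8_0              # new: KV-cache value quantization type

template:
  jinja_template: true          # new: interpret the chat template as a raw Jinja (tokenizer.chat_template)
                                # template; normally auto-detected from GGUF metadata by guessDefaultsFromFile
```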
diff --git a/docs/data/version.json b/docs/data/version.json index 20611657..bf065426 100644 --- a/docs/data/version.json +++ b/docs/data/version.json @@ -1,3 +1,3 @@ { - "version": "v2.23.0" + "version": "v2.24.2" } diff --git a/docs/themes/hugo-theme-relearn b/docs/themes/hugo-theme-relearn index 28fce6b0..bd1f3d34 160000 --- a/docs/themes/hugo-theme-relearn +++ b/docs/themes/hugo-theme-relearn @@ -1 +1 @@ -Subproject commit 28fce6b04c414523280c53ee02f9f3a94d9d23da +Subproject commit bd1f3d3432632c61bb12e7ec0f7673fed0289f19 diff --git a/gallery/flux-ggml.yaml b/gallery/flux-ggml.yaml new file mode 100644 index 00000000..5738d584 --- /dev/null +++ b/gallery/flux-ggml.yaml @@ -0,0 +1,12 @@ +--- +name: "flux-ggml" + +config_file: | + backend: stablediffusion-ggml + step: 25 + options: + - "diffusion_model" + - "clip_l_path:clip_l.safetensors" + - "t5xxl_path:t5xxl_fp16.safetensors" + - "vae_path:ae.safetensors" + - "sampler:euler" diff --git a/gallery/flux.yaml b/gallery/flux.yaml index bb75b53b..a859d801 100644 --- a/gallery/flux.yaml +++ b/gallery/flux.yaml @@ -11,4 +11,5 @@ config_file: | cuda: true enable_parameters: num_inference_steps pipeline_type: FluxPipeline - cfg_scale: 0 + + cfg_scale: 0 diff --git a/gallery/index.yaml b/gallery/index.yaml index b4b73e4b..99c0e9a3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,82 @@ --- +- &intellect1 + name: "intellect-1-instruct" + url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" + icon: https://huggingface.co/PrimeIntellect/INTELLECT-1-Instruct/resolve/main/intellect-1-map.png + urls: + - https://huggingface.co/PrimeIntellect/INTELLECT-1-Instruct + - https://huggingface.co/bartowski/INTELLECT-1-Instruct-GGUF + tags: + - llm + - gguf + - gpu + - cpu + - intellect + license: apache-2.0 + description: | + INTELLECT-1 is the first collaboratively trained 10 billion parameter language model trained from scratch on 1 trillion tokens of English text and code. + This is an instruct model. The base model associated with it is INTELLECT-1. + INTELLECT-1 was trained on up to 14 concurrent nodes distributed across 3 continents, with contributions from 30 independent community contributors providing compute. The training code utilizes the prime framework, a scalable distributed training framework designed for fault-tolerant, dynamically scaling, high-perfomance training on unreliable, globally distributed workers. The key abstraction that allows dynamic scaling is the ElasticDeviceMesh which manages dynamic global process groups for fault-tolerant communication across the internet and local process groups for communication within a node. The model was trained using the DiLoCo algorithms with 100 inner steps. The global all-reduce was done with custom int8 all-reduce kernels to reduce the communication payload required, greatly reducing the communication overhead by a factor 400x. 
+ overrides: + parameters: + model: INTELLECT-1-Instruct-Q4_K_M.gguf + files: + - filename: INTELLECT-1-Instruct-Q4_K_M.gguf + sha256: 5df236fe570e5998d07fb3207788eac811ef3b77dd2a0ad04a2ef5c6361f3030 + uri: huggingface://bartowski/INTELLECT-1-Instruct-GGUF/INTELLECT-1-Instruct-Q4_K_M.gguf +- &llama33 + url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" + icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png + license: llama3.3 + description: | + The Meta Llama 3.3 multilingual large language model (LLM) is a pretrained and instruction tuned generative model in 70B (text in/text out). The Llama 3.3 instruction tuned text only model is optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks. + tags: + - llm + - gguf + - gpu + - cpu + - llama3.3 + name: "llama-3.3-70b-instruct" + urls: + - https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct + - https://huggingface.co/MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF + overrides: + parameters: + model: Llama-3.3-70B-Instruct.Q4_K_M.gguf + files: + - filename: Llama-3.3-70B-Instruct.Q4_K_M.gguf + sha256: 4f3b04ecae278bdb0fd545b47c210bc5edf823e5ebf7d41e0b526c81d54b1ff3 + uri: huggingface://MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-70b-euryale-v2.3" + icon: https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3/resolve/main/Eury.png + urls: + - https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3 + - https://huggingface.co/bartowski/L3.3-70B-Euryale-v2.3-GGUF + description: | + A direct replacement / successor to Euryale v2.2, not Hanami-x1, though it is slightly better than them in my opinion. + overrides: + parameters: + model: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + files: + - filename: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + sha256: 4e78bb0e65886bfcff89b829f6d38aa6f6846988bb8291857e387e3f60b3217b + uri: huggingface://bartowski/L3.3-70B-Euryale-v2.3-GGUF/L3.3-70B-Euryale-v2.3-Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-ms-evayale-70b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/HFCaVzRpiE05Y46p41qRy.webp + urls: + - https://huggingface.co/Steelskull/L3.3-MS-Evayale-70B + - https://huggingface.co/bartowski/L3.3-MS-Evayale-70B-GGUF + description: | + This model was created as I liked the storytelling of EVA but the prose and details of scenes from EURYALE, my goal is to merge the robust storytelling of both models while attempting to maintain the positives of both models. 
+ overrides: + parameters: + model: L3.3-MS-Evayale-70B-Q4_K_M.gguf + files: + - filename: L3.3-MS-Evayale-70B-Q4_K_M.gguf + sha256: f941d88870fec8343946517a1802d159d23f3971eeea50b6cf12295330bd29cc + uri: huggingface://bartowski/L3.3-MS-Evayale-70B-GGUF/L3.3-MS-Evayale-70B-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" @@ -380,7 +458,6 @@ urls: - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF overrides: - embeddings: true parameters: model: llama-3.2-1b-instruct-q4_k_m.gguf files: @@ -704,6 +781,45 @@ - filename: Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf uri: huggingface://QuantFactory/Llama-Sentient-3.2-3B-Instruct-GGUF/Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf sha256: 3f855ce0522bfdc39fc826162ba6d89f15cc3740c5207da10e70baa3348b7812 +- !!merge <<: *llama32 + name: "llama-smoltalk-3.2-1b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Llama-SmolTalk-3.2-1B-Instruct + - https://huggingface.co/mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF + description: | + The Llama-SmolTalk-3.2-1B-Instruct model is a lightweight, instruction-tuned model designed for efficient text generation and conversational AI tasks. With a 1B parameter architecture, this model strikes a balance between performance and resource efficiency, making it ideal for applications requiring concise, contextually relevant outputs. The model has been fine-tuned to deliver robust instruction-following capabilities, catering to both structured and open-ended queries. + Key Features: + + Instruction-Tuned Performance: Optimized to understand and execute user-provided instructions across diverse domains. + Lightweight Architecture: With just 1 billion parameters, the model provides efficient computation and storage without compromising output quality. + Versatile Use Cases: Suitable for tasks like content generation, conversational interfaces, and basic problem-solving. + + Intended Applications: + + Conversational AI: Engage users with dynamic and contextually aware dialogue. + Content Generation: Produce summaries, explanations, or other creative text outputs efficiently. + Instruction Execution: Follow user commands to generate precise and relevant responses. + overrides: + parameters: + model: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf + files: + - filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf + sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e + uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "fusechat-llama-3.2-3b-instruct" + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.2-3B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. 
The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. + overrides: + parameters: + model: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + sha256: a4f0e9a905b74886b79b72622c06a3219d6812818a564a53c39fc49032d7f842 + uri: huggingface://bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF/FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" @@ -1631,6 +1747,319 @@ - filename: EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf sha256: 03ea0ecac3ee24a332ca43cf925b669c58714b9754be0f4bc232bd996681ef4b uri: huggingface://bartowski/EVA-Qwen2.5-72B-v0.2-GGUF/EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwq-32b-preview" + urls: + - https://huggingface.co/Qwen/QwQ-32B-Preview + - https://huggingface.co/bartowski/QwQ-32B-Preview-GGUF + description: | + QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. As a preview release, it demonstrates promising analytical abilities while having several important limitations: + + Language Mixing and Code-Switching: The model may mix languages or switch between them unexpectedly, affecting response clarity. + Recursive Reasoning Loops: The model may enter circular reasoning patterns, leading to lengthy responses without a conclusive answer. + Safety and Ethical Considerations: The model requires enhanced safety measures to ensure reliable and secure performance, and users should exercise caution when deploying it. + Performance and Benchmark Limitations: The model excels in math and coding but has room for improvement in other areas, such as common sense reasoning and nuanced language understanding. + overrides: + parameters: + model: QwQ-32B-Preview-Q4_K_M.gguf + files: + - filename: QwQ-32B-Preview-Q4_K_M.gguf + sha256: c499801e682e2379528090c50e106837ca1d69dc3bf3ff3a9af830a0eb49cdf6 + uri: huggingface://bartowski/QwQ-32B-Preview-GGUF/QwQ-32B-Preview-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "q2.5-32b-slush-i1" + urls: + - https://huggingface.co/crestf411/Q2.5-32B-Slush + - https://huggingface.co/mradermacher/Q2.5-32B-Slush-i1-GGUF + description: | + Slush is a two-stage model trained with high LoRA dropout, where stage 1 is a pretraining continuation on the base model, aimed at boosting the model's creativity and writing capabilities. This is then merged into the instruction tune model, and stage 2 is a fine tuning step on top of this to further enhance its roleplaying capabilities and/or to repair any damage caused in the stage 1 merge. + This is still early stage. As always, feedback is welcome, and begone if you demand perfection. 
+ The second stage, like the Sunfall series, follows the Silly Tavern preset (ChatML), so ymmv in particular if you use some other tool and/or preset. + overrides: + parameters: + model: Q2.5-32B-Slush.i1-Q4_K_M.gguf + files: + - filename: Q2.5-32B-Slush.i1-Q4_K_M.gguf + sha256: 95aecaf43077dabc72d3b556923ede2563325e1c89863800229cfa8b7f1c9659 + uri: huggingface://mradermacher/Q2.5-32B-Slush-i1-GGUF/Q2.5-32B-Slush.i1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwestion-24b" + urls: + - https://huggingface.co/CultriX/Qwestion-14B + - https://huggingface.co/mradermacher/Qwestion-24B-GGUF + description: | + This model was merged using the DARE TIES merge method using Qwen/Qwen2.5-14B as a base. + The following models were included in the merge: + allknowingroger/Qwenslerp2-14B + rombodawg/Rombos-LLM-V2.6-Qwen-14b + VAGOsolutions/SauerkrautLM-v2-14b-DPO + CultriX/Qwen2.5-14B-Wernicke + overrides: + parameters: + model: Qwestion-24B.Q4_K_M.gguf + files: + - filename: Qwestion-24B.Q4_K_M.gguf + sha256: 5d493bd81cfeef66d80101260145ab1d1d0428ef2191edce62b58391bd0fff0e + uri: huggingface://mradermacher/Qwestion-24B-GGUF/Qwestion-24B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "teleut-7b" + icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/UqIi8eztdptvt52Mak_1K.png + urls: + - https://huggingface.co/allura-org/Teleut-7b + - https://huggingface.co/QuantFactory/Teleut-7b-GGUF + description: | + A replication attempt of Tulu 3 on the Qwen 2.5 base models. + overrides: + parameters: + model: Teleut-7b.Q4_K_M.gguf + files: + - filename: Teleut-7b.Q4_K_M.gguf + sha256: 844a633ea01d793c638e99f2e07413606b3812b759e9264fbaf69c8d94eaa093 + uri: huggingface://QuantFactory/Teleut-7b-GGUF/Teleut-7b.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-7b-homercreative-mix" + urls: + - https://huggingface.co/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix + - https://huggingface.co/QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF + description: | + ZeroXClem/Qwen2.5-7B-HomerCreative-Mix is an advanced language model meticulously crafted by merging four pre-trained models using the powerful mergekit framework. This fusion leverages the Model Stock merge method to combine the creative prowess of Qandora, the instructive capabilities of Qwen-Instruct-Fusion, the sophisticated blending of HomerSlerp1, and the foundational conversational strengths of Homer-v0.5-Qwen2.5-7B. The resulting model excels in creative text generation, contextual understanding, and dynamic conversational interactions. + 🚀 Merged Models + + This model merge incorporates the following: + + bunnycore/Qandora-2.5-7B-Creative: Specializes in creative text generation, enhancing the model's ability to produce imaginative and diverse content. + + bunnycore/Qwen2.5-7B-Instruct-Fusion: Focuses on instruction-following capabilities, improving the model's performance in understanding and executing user commands. + + allknowingroger/HomerSlerp1-7B: Utilizes spherical linear interpolation (SLERP) to blend model weights smoothly, ensuring a harmonious integration of different model attributes. + + newsbang/Homer-v0.5-Qwen2.5-7B: Acts as the foundational conversational model, providing robust language comprehension and generation capabilities. 
+ overrides: + parameters: + model: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf + files: + - filename: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf + sha256: fc3fdb41e068646592f89a8ae62d7b330f2bd4e97bf615aef2977930977c8ba5 + uri: huggingface://QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF/Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "cybercore-qwen-2.1-7b" + urls: + - https://huggingface.co/bunnycore/CyberCore-Qwen-2.1-7B + - https://huggingface.co/QuantFactory/CyberCore-Qwen-2.1-7B-GGUF + description: | + This model was merged using the TIES merge method using rombodawg/Rombos-LLM-V2.5-Qwen-7b as a base. + Models Merged + fblgit/cybertron-v4-qw7B-UNAMGS + bunnycore/Qwen-2.1-7b-Persona-lora_model + fblgit/cybertron-v4-qw7B-MGS + bunnycore/Qwen-2.1-7b-Persona-lora_model + overrides: + parameters: + model: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf + files: + - filename: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf + sha256: 726042707a4cec29ca0355b4dc7c53a807b307d08aa8a3d4a9e76aefbbbcaadf + uri: huggingface://QuantFactory/CyberCore-Qwen-2.1-7B-GGUF/CyberCore-Qwen-2.1-7B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "homercreativeanvita-mix-qw7b" + icon: https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B/resolve/main/HomerCreativeAnvita.jpeg + urls: + - https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B + - https://huggingface.co/QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF + description: | + This model is currently ranked #1 on the Open LLM Leaderboard among models up to 13B parameters! + Merge Method + + This model was merged using the SLERP merge method. + Models Merged + + The following models were included in the merge: + + ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix + ZeroXClem/Qwen2.5-7B-HomerCreative-Mix + overrides: + parameters: + model: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf + files: + - filename: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf + sha256: a356f279a104bff0bbc2ef7ec136c1e774153de8893bf988083e96fb7f4bc053 + uri: huggingface://QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF/HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "math-iio-7b-instruct" + icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/faLfR-doaWP_BLUkOQrbq.png + urls: + - https://huggingface.co/prithivMLmods/Math-IIO-7B-Instruct + - https://huggingface.co/QuantFactory/Math-IIO-7B-Instruct-GGUF + description: | + The Math IIO 7B Instruct is a fine-tuned language model based on the robust Qwen2.5-7B-Instruct architecture. This model has been specifically trained to excel in single-shot mathematical reasoning and instruction-based tasks, making it a reliable choice for educational, analytical, and problem-solving applications. + Key Features: + Math-Optimized Capabilities: + The model is designed to handle complex mathematical problems, step-by-step calculations, and reasoning tasks. + + Instruction-Tuned: + Fine-tuned for better adherence to structured queries and task-oriented prompts, enabling clear and concise outputs. + + Large Vocabulary: + Equipped with an extensive tokenizer configuration and custom tokens to ensure precise mathematical notation support. 
+ overrides: + parameters: + model: Math-IIO-7B-Instruct.Q4_K_M.gguf + files: + - filename: Math-IIO-7B-Instruct.Q4_K_M.gguf + sha256: 8ffda0b6a43eb9997dfd7db48fe3bd0970fd1b9b86fb68f082c38622a48b58f4 + uri: huggingface://QuantFactory/Math-IIO-7B-Instruct-GGUF/Math-IIO-7B-Instruct.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "virtuoso-small" + icon: https://i.ibb.co/pXD6Bcv/SW2-U-g-QQLSH1-ZAbxhs-Iu-A.webp + urls: + - https://huggingface.co/arcee-ai/Virtuoso-Small-GGUF + description: | + Virtuoso-Small is the debut public release of the Virtuoso series of models by Arcee.ai, designed to bring cutting-edge generative AI capabilities to organizations and developers in a compact, efficient form. With 14 billion parameters, Virtuoso-Small is an accessible entry point for high-quality instruction-following, complex reasoning, and business-oriented generative AI tasks. + overrides: + parameters: + model: Virtuoso-Small-Q4_K_M.gguf + files: + - filename: Virtuoso-Small-Q4_K_M.gguf + sha256: 07db215cdfcb05036567017fe20e50e60cb2da28d1f9a8251cc4f18c8caa247f + uri: huggingface://arcee-ai/Virtuoso-Small-GGUF/Virtuoso-Small-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-7b-homeranvita-nerdmix" + urls: + - https://huggingface.co/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix + - https://huggingface.co/QuantFactory/Qwen2.5-7B-HomerAnvita-NerdMix-GGUF + description: | + ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix is an advanced language model meticulously crafted by merging five pre-trained models using the powerful mergekit framework. This fusion leverages the Model Stock merge method to combine the creative prowess of Qandora, the instructive capabilities of Qwen-Instruct-Fusion, the sophisticated blending of HomerSlerp1, the mathematical precision of Cybertron-MGS, and the uncensored expertise of Qwen-Nerd. The resulting model excels in creative text generation, contextual understanding, technical reasoning, and dynamic conversational interactions. + overrides: + parameters: + model: Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf + files: + - filename: Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf + sha256: 73db2ca3ab50e8627352078988cd173e7447c5e8199a7db9e554602da1362e5f + uri: huggingface://QuantFactory/Qwen2.5-7B-HomerAnvita-NerdMix-GGUF/Qwen2.5-7B-HomerAnvita-NerdMix.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2.5-math-14b-instruct" + urls: + - https://huggingface.co/qingy2024/Qwen2.5-Math-14B-Instruct-Preview + - https://huggingface.co/QuantFactory/Qwen2.5-Math-14B-Instruct-GGUF + description: | + This Qwen 2.5 model was trained 2x faster with Unsloth and Huggingface's TRL library. + Fine-tuned it for 400 steps on garage-bAInd/Open-Platypus with a batch size of 3. + overrides: + parameters: + model: Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf + files: + - filename: Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf + sha256: 14e672394738a7d9f14a6cb16fd9a649b113a19a8b4934f9c18299fc4e286ab6 + uri: huggingface://QuantFactory/Qwen2.5-Math-14B-Instruct-GGUF/Qwen2.5-Math-14B-Instruct.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "sailor2-1b-chat" + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + urls: + - https://huggingface.co/sail/Sailor2-1B-Chat + - https://huggingface.co/bartowski/Sailor2-1B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). 
Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. + overrides: + parameters: + model: Sailor2-1B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-1B-Chat-Q4_K_M.gguf + sha256: 782e8abed13d51a2083eadfb2f6d94c2cd77940532f612a99e6f6bec9b3501d4 + uri: huggingface://bartowski/Sailor2-1B-Chat-GGUF/Sailor2-1B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + name: "sailor2-8b-chat" + urls: + - https://huggingface.co/bartowski/Sailor2-8B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. + overrides: + parameters: + model: Sailor2-8B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-8B-Chat-Q4_K_M.gguf + sha256: 1a6aaadd6f6ef9c2290d66b348ebcbd6fdec542834cde622498fbd467d966103 + uri: huggingface://bartowski/Sailor2-8B-Chat-GGUF/Sailor2-8B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "sailor2-20b-chat" + icon: https://huggingface.co/sail/Sailor2-1B-Chat/resolve/main/sailor2_banner.jpg + urls: + - https://huggingface.co/bartowski/Sailor2-20B-Chat-GGUF + description: | + Sailor2 is a community-driven initiative that brings cutting-edge multilingual language models to South-East Asia (SEA). Our research highlights a strong demand for models in the 8B and 20B parameter range for production use, alongside 1B models for specialized applications, such as speculative decoding and research purposes. 
These models, released under the Apache 2.0 license, provide enhanced accessibility to advanced language technologies across the region. + Sailor2 builds upon the foundation of the awesome multilingual model Qwen 2.5 and is continuously pre-trained on 500B tokens to support 15 languages better with a unified model. These languages include English, Chinese, Burmese, Cebuano, Ilocano, Indonesian, Javanese, Khmer, Lao, Malay, Sundanese, Tagalog, Thai, Vietnamese, and Waray. By addressing the growing demand for diverse, robust, and accessible language models, Sailor2 seeks to serve the underserved in SEA areas with open, inclusive, and accessible multilingual LLMs. The Sailor2 model comes in three sizes, 1B, 8B, and 20B, which are expanded from the Qwen2.5 base models of 0.5B, 7B, and 14B, respectively. + overrides: + parameters: + model: Sailor2-20B-Chat-Q4_K_M.gguf + files: + - filename: Sailor2-20B-Chat-Q4_K_M.gguf + sha256: 0cf8fcd367accee19702ef15ee964bddd5035bde034afddd838f818e7655534a + uri: huggingface://bartowski/Sailor2-20B-Chat-GGUF/Sailor2-20B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "72b-qwen2.5-kunou-v1" + icon: https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1/resolve/main/knn.png + urls: + - https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1 + - https://huggingface.co/bartowski/72B-Qwen2.5-Kunou-v1-GGUF + description: | + I do not really have anything planned for this model other than it being a generalist, and Roleplay Model? It was just something made and planned in minutes. + Same with the 14 and 32B version. + Kunou's the name of an OC I worked on for a couple of years, for a... fanfic. mmm... + + A kind-of successor to L3-70B-Euryale-v2.2 in all but name? I'm keeping Stheno/Euryale lineage to Llama series for now. + I had a version made on top of Nemotron, a supposed Euryale 2.4 but that flopped hard, it was not my cup of tea. + This version is basically a better, more cleaned up Dataset used on Euryale and Stheno. + overrides: + parameters: + model: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + files: + - filename: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + sha256: 91907f29746625a62885793475956220b81d8a5a34b53686a1acd1d03fd403ea + uri: huggingface://bartowski/72B-Qwen2.5-Kunou-v1-GGUF/72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://i.imgur.com/OxX2Usi.png + name: "evathene-v1.3" + urls: + - https://huggingface.co/sophosympatheia/Evathene-v1.3 + - https://huggingface.co/bartowski/Evathene-v1.3-GGUF + description: | + This 72B parameter model is a merge of sophosympatheia/Evathene-v1.1 and sophosympatheia/Evathene-v1.2. See the merge recipe below for details. + overrides: + parameters: + model: Evathene-v1.3-Q4_K_M.gguf + files: + - filename: Evathene-v1.3-Q4_K_M.gguf + sha256: 0f54909b3ddca514994ee16417da8750f56e7bd59581b46ac47625c230e29d1f + uri: huggingface://bartowski/Evathene-v1.3-GGUF/Evathene-v1.3-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "fusechat-qwen-2.5-7b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct + - https://huggingface.co/bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. 
For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. + overrides: + parameters: + model: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + sha256: 8cd8c317769f03125ac753c836ac92c5a76ee0b35502811d0e65bcbb8df9d55c + uri: huggingface://bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF/FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "neumind-math-7b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Neumind-Math-7B-Instruct + - https://huggingface.co/QuantFactory/Neumind-Math-7B-Instruct-GGUF + description: | + The Neumind-Math-7B-Instruct is a fine-tuned model based on Qwen2.5-7B-Instruct, optimized for mathematical reasoning, step-by-step problem-solving, and instruction-based tasks in the mathematics domain. The model is designed for applications requiring structured reasoning, numerical computations, and mathematical proof generation. + overrides: + parameters: + model: Neumind-Math-7B-Instruct.Q4_K_M.gguf + files: + - filename: Neumind-Math-7B-Instruct.Q4_K_M.gguf + sha256: 3250abadeae4234e06dfaf7cf86fe871fe021e6c2dfcb4542c2a4f412d71e28c + uri: huggingface://QuantFactory/Neumind-Math-7B-Instruct-GGUF/Neumind-Math-7B-Instruct.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: @@ -1684,6 +2113,20 @@ - filename: Arch-Function-3B.Q4_K_M.gguf sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2-7b-multilingual-rp" + urls: + - https://huggingface.co/maywell/Qwen2-7B-Multilingual-RP + - https://huggingface.co/QuantFactory/Qwen2-7B-Multilingual-RP-GGUF + description: | + Multilingual Qwen2-7B model trained on Roleplaying. 
+ overrides: + parameters: + model: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + files: + - filename: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + sha256: 31756c58fd135f2deb59b2d9b142f39134dc8d1a6eaa02f388dda7491fc95ccc + uri: huggingface://QuantFactory/Qwen2-7B-Multilingual-RP-GGUF/Qwen2-7B-Multilingual-RP.Q4_K_M.gguf - &smollm ## SmolLM url: "github:mudler/LocalAI/gallery/chatml.yaml@master" @@ -2687,6 +3130,22 @@ - filename: hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf sha256: 8cff9d399a0583616fe1f290da6daa091ab5c5493d0e173a8fffb45202d79417 uri: huggingface://mlabonne/Hermes-3-Llama-3.1-8B-lorablated-GGUF/hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "hermes-3-llama-3.2-3b" + icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg + urls: + - https://huggingface.co/NousResearch/Hermes-3-Llama-3.2-3B + - https://huggingface.co/bartowski/Hermes-3-Llama-3.2-3B-GGUF + description: | + Hermes 3 3B is a small but mighty new addition to the Hermes series of LLMs by Nous Research, and is Nous's first fine-tune in this parameter class. + Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. + overrides: + parameters: + model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + files: + - filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 + uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf - !!merge <<: *llama31 name: "doctoraifinetune-3.1-8b-i1" urls: @@ -3223,6 +3682,252 @@ - filename: Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf sha256: 3fad2c96aa9b9de19c2cda0f88a381c47ac768ca03a95059d9f6c439791f8592 uri: huggingface://bartowski/Llama-3.1-Tulu-3-8B-SFT-GGUF/Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf +- !!merge <<: *llama31 + icon: https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B/resolve/main/misc/misc_fig.jpg + name: "skywork-o1-open-llama-3.1-8b" + urls: + - https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B + - https://huggingface.co/QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF + description: | + We are excited to announce the release of the Skywork o1 Open model series, developed by the Skywork team at Kunlun Inc. This groundbreaking release introduces a series of models that incorporate o1-like slow thinking and reasoning capabilities. The Skywork o1 Open model series includes three advanced models: + + Skywork o1 Open-Llama-3.1-8B: A robust chat model trained on Llama-3.1-8B, enhanced significantly with "o1-style" data to improve reasoning skills. + + Skywork o1 Open-PRM-Qwen-2.5-1.5B: A specialized model designed to enhance reasoning capability through incremental process rewards, ideal for complex problem solving at a smaller scale. + + Skywork o1 Open-PRM-Qwen-2.5-7B: Extends the capabilities of the 1.5B model by scaling up to handle more demanding reasoning tasks, pushing the boundaries of AI reasoning. + + Different from mere reproductions of the OpenAI o1 model, the Skywork o1 Open model series not only exhibits innate thinking, planning, and reflecting capabilities in its outputs, but also shows significant improvements in reasoning skills on standard benchmarks. This series represents a strategic advancement in AI capabilities, moving a previously weaker base model towards the state-of-the-art (SOTA) in reasoning tasks. 
+ overrides: + parameters: + model: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf + files: + - filename: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf + sha256: ef6a203ba585aab14f5d2ec463917a45b3ac571abd89c39e9a96a5e395ea8eea + uri: huggingface://QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF/Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "sparse-llama-3.1-8b-2of4" + urls: + - https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF + - https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF + description: | + This is the 2:4 sparse version of Llama-3.1-8B. On the OpenLLM benchmark (version 1), it achieves an average score of 62.16, compared to 63.19 for the dense model—demonstrating a 98.37% accuracy recovery. On the Mosaic Eval Gauntlet benchmark (version v0.3), it achieves an average score of 53.85, versus 55.34 for the dense model—representing a 97.3% accuracy recovery. + overrides: + parameters: + model: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf + files: + - filename: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf + sha256: c481e7089ffaedd5ae8c74dccc7fb45f6509640b661fa086ae979f6fefc3fdba + uri: huggingface://QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF/Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "loki-v2.6-8b-1024k" + icon: https://cdn-uploads.huggingface.co/production/uploads/6472de046facfb01d8b1fb9d/uQPITKRS8XLTLyaiGwgh_.jpeg + urls: + - https://huggingface.co/QuantFactory/Loki-v2.6-8b-1024k-GGUF + description: | + The following models were included in the merge: + MrRobotoAI/Epic_Fiction-8b + MrRobotoAI/Unaligned-RP-Base-8b-1024k + MrRobotoAI/Loki-.Epic_Fiction.-8b + Casual-Autopsy/L3-Luna-8B + Casual-Autopsy/L3-Super-Nova-RP-8B + Casual-Autopsy/L3-Umbral-Mind-RP-v3.0-8B + Casual-Autopsy/Halu-L3-Stheno-BlackOasis-8B + Undi95/Llama-3-LewdPlay-8B + Undi95/Llama-3-LewdPlay-8B-evo + Undi95/Llama-3-Unholy-8B + ChaoticNeutrals/Hathor_Tahsin-L3-8B-v0.9 + ChaoticNeutrals/Hathor_RP-v.01-L3-8B + ChaoticNeutrals/Domain-Fusion-L3-8B + ChaoticNeutrals/T-900-8B + ChaoticNeutrals/Poppy_Porpoise-1.4-L3-8B + ChaoticNeutrals/Templar_v1_8B + ChaoticNeutrals/Hathor_Respawn-L3-8B-v0.8 + ChaoticNeutrals/Sekhmet_Gimmel-L3.1-8B-v0.3 + zeroblu3/LewdPoppy-8B-RP + tohur/natsumura-storytelling-rp-1.0-llama-3.1-8b + jeiku/Chaos_RP_l3_8B + tannedbum/L3-Nymeria-Maid-8B + Nekochu/Luminia-8B-RP + vicgalle/Humanish-Roleplay-Llama-3.1-8B + saishf/SOVLish-Maid-L3-8B + Dogge/llama-3-8B-instruct-Bluemoon-Freedom-RP + MrRobotoAI/Epic_Fiction-8b-v4 + maldv/badger-lambda-0-llama-3-8b + maldv/llama-3-fantasy-writer-8b + maldv/badger-kappa-llama-3-8b + maldv/badger-mu-llama-3-8b + maldv/badger-lambda-llama-3-8b + maldv/badger-iota-llama-3-8b + maldv/badger-writer-llama-3-8b + Magpie-Align/MagpieLM-8B-Chat-v0.1 + nbeerbower/llama-3-gutenberg-8B + nothingiisreal/L3-8B-Stheno-Horny-v3.3-32K + nbeerbower/llama-3-spicy-abliterated-stella-8B + Magpie-Align/MagpieLM-8B-SFT-v0.1 + NeverSleep/Llama-3-Lumimaid-8B-v0.1 + mlabonne/NeuralDaredevil-8B-abliterated + mlabonne/Daredevil-8B-abliterated + NeverSleep/Llama-3-Lumimaid-8B-v0.1-OAS + nothingiisreal/L3-8B-Instruct-Abliterated-DWP + openchat/openchat-3.6-8b-20240522 + turboderp/llama3-turbcat-instruct-8b + UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3 + Undi95/Llama-3-LewdPlay-8B + TIGER-Lab/MAmmoTH2-8B-Plus + OwenArli/Awanllm-Llama-3-8B-Cumulus-v1.0 + refuelai/Llama-3-Refueled + SicariusSicariiStuff/LLAMA-3_8B_Unaligned_Alpha + NousResearch/Hermes-2-Theta-Llama-3-8B + ResplendentAI/Nymph_8B + grimjim/Llama-3-Oasis-v1-OAS-8B + 
flammenai/Mahou-1.3b-llama3-8B + lemon07r/Llama-3-RedMagic4-8B + grimjim/Llama-3.1-SuperNova-Lite-lorabilterated-8B + grimjim/Llama-Nephilim-Metamorphosis-v2-8B + lemon07r/Lllama-3-RedElixir-8B + grimjim/Llama-3-Perky-Pat-Instruct-8B + ChaoticNeutrals/Hathor_RP-v.01-L3-8B + grimjim/llama-3-Nephilim-v2.1-8B + ChaoticNeutrals/Hathor_Respawn-L3-8B-v0.8 + migtissera/Llama-3-8B-Synthia-v3.5 + Locutusque/Llama-3-Hercules-5.0-8B + WhiteRabbitNeo/Llama-3-WhiteRabbitNeo-8B-v2.0 + VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct + iRyanBell/ARC1-II + HPAI-BSC/Llama3-Aloe-8B-Alpha + HaitameLaf/Llama-3-8B-StoryGenerator + failspy/Meta-Llama-3-8B-Instruct-abliterated-v3 + Undi95/Llama-3-Unholy-8B + ajibawa-2023/Uncensored-Frank-Llama-3-8B + ajibawa-2023/SlimOrca-Llama-3-8B + ChaoticNeutrals/Templar_v1_8B + aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K + ChaoticNeutrals/Hathor_Tahsin-L3-8B-v0.9 + Blackroot/Llama-3-Gamma-Twist + FPHam/L3-8B-Everything-COT + Blackroot/Llama-3-LongStory + ChaoticNeutrals/Sekhmet_Gimmel-L3.1-8B-v0.3 + abacusai/Llama-3-Smaug-8B + Khetterman/CursedMatrix-8B-v9 + ajibawa-2023/Scarlett-Llama-3-8B-v1.0 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/physics_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/electrical_engineering + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_chemistry + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_physics + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/formal_logic + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/philosophy_100 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/conceptual_physics + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/college_computer_science + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/psychology_non_masked + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/psychology + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Blackroot/Llama3-RP-Lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LimaRP-Instruct-LoRA-8B + MrRobotoAI/Unaligned-RP-Base-8b-1024k + nothingiisreal/llama3-8B-DWP-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/world_religions + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/high_school_european_history + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/electrical_engineering + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-8B-Abomination-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LongStory-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/human_sexuality + MrRobotoAI/Unaligned-RP-Base-8b-1024k + surya-narayanan/sociology + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Theory_of_Mind_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Smarts_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Llama-3-LongStory-LORA + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/Nimue-8B + MrRobotoAI/Unaligned-RP-Base-8b-1024k + vincentyandex/lora_llama3_chunked_novel_bs128 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Aura_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Azazelle/L3-Daybreak-8b-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + ResplendentAI/Luna_Llama3 + MrRobotoAI/Unaligned-RP-Base-8b-1024k + nicce/story-mixtral-8x7b-lora + MrRobotoAI/Unaligned-RP-Base-8b-1024k + Blackroot/Llama-3-LongStory-LORA + 
MrRobotoAI/Unaligned-RP-Base-8b-1024k
+    ResplendentAI/NoWarning_Llama3
+    MrRobotoAI/Unaligned-RP-Base-8b-1024k
+    ResplendentAI/BlueMoon_Llama3
+  overrides:
+    parameters:
+      model: Loki-v2.6-8b-1024k.Q4_K_M.gguf
+  files:
+    - filename: Loki-v2.6-8b-1024k.Q4_K_M.gguf
+      sha256: 9b15c1fee0a0e6d6ed97df3d1b6fc8f774e6e1bd388328599e731c62e0f19d81
+      uri: huggingface://QuantFactory/Loki-v2.6-8b-1024k-GGUF/Loki-v2.6-8b-1024k.Q4_K_M.gguf
+- !!merge <<: *llama31
+  name: "impish_mind_8b"
+  icon: https://huggingface.co/SicariusSicariiStuff/Impish_Mind_8B/resolve/main/Images/Impish_Mind.png
+  urls:
+    - https://huggingface.co/SicariusSicariiStuff/Impish_Mind_8B
+    - https://huggingface.co/bartowski/Impish_Mind_8B-GGUF
+  description: |
+    This model was trained with new data and a new approach (compared to my other models). While it may be a bit more censored, it is expected to be significantly smarter. The data used is quite unique and also features long and complex markdown datasets.
+
+    Regarding censorship: whether uncensoring or enforcing strict censorship, the model tends to lose some of its intelligence. The use of toxic data was kept to a minimum with this model.
+
+    Consequently, the model is likely to refuse some requests; this is easily avoidable with a basic system prompt or assistant impersonation ("Sure thing!..."). Unlike many RP models, this one is designed to excel at general assistant tasks as well.
+  overrides:
+    parameters:
+      model: Impish_Mind_8B-Q4_K_M.gguf
+  files:
+    - filename: Impish_Mind_8B-Q4_K_M.gguf
+      sha256: 918f82bcb893c75fa2e846156df7bd3ce359464b960e32ae9171035ee14e7c51
+      uri: huggingface://bartowski/Impish_Mind_8B-GGUF/Impish_Mind_8B-Q4_K_M.gguf
+- !!merge <<: *llama31
+  name: "tulu-3.1-8b-supernova-smart"
+  urls:
+    - https://huggingface.co/bunnycore/Tulu-3.1-8B-SuperNova-Smart
+    - https://huggingface.co/QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF
+  description: |
+    This model was merged using the passthrough merge method, using bunnycore/Tulu-3.1-8B-SuperNova + bunnycore/Llama-3.1-8b-smart-lora as a base.
+  overrides:
+    parameters:
+      model: Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf
+  files:
+    - filename: Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf
+      sha256: 4b8ba9e64f0667199eee2dcc769f1a90aa9c7730165d42f440fdf107c7585c63
+      uri: huggingface://QuantFactory/Tulu-3.1-8B-SuperNova-Smart-GGUF/Tulu-3.1-8B-SuperNova-Smart.Q4_K_M.gguf
+- !!merge <<: *llama31
+  name: "b-nimita-l3-8b-v0.02"
+  urls:
+    - https://huggingface.co/Arkana08/B-NIMITA-L3-8B-v0.02
+    - https://huggingface.co/QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF
+  description: |
+    B-NIMITA is an AI model designed to bring role-playing scenarios to life with emotional depth and rich storytelling. At its core is NIHAPPY, providing a solid narrative foundation and contextual consistency. This is enhanced by Mythorica, which adds vivid emotional arcs and expressive dialogue, and V-Blackroot, ensuring character consistency and subtle adaptability. This combination allows B-NIMITA to deliver dynamic, engaging interactions that feel natural and immersive.
+ overrides: + parameters: + model: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf + files: + - filename: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf + sha256: 625a54848dcd3f23bc06b639a7dfecae14142b5d177dd45acfe7724816bab4cd + uri: huggingface://QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF/B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "deepthought-8b-llama-v0.01-alpha" + urls: + - https://huggingface.co/ruliad/deepthought-8b-llama-v0.01-alpha + - https://huggingface.co/bartowski/deepthought-8b-llama-v0.01-alpha-GGUF + description: | + Deepthought-8B is a small and capable reasoning model built on LLaMA-3.1 8B, designed to make AI reasoning more transparent and controllable. Despite its relatively small size, it achieves sophisticated reasoning capabilities that rival much larger models. + overrides: + parameters: + model: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + files: + - filename: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + sha256: 33195ba7b898ef8b2997d095e8be42adf1d0e1f6e8291cf07e026fc8e45903fd + uri: huggingface://bartowski/deepthought-8b-llama-v0.01-alpha-GGUF/deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "fusechat-llama-3.1-8b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + sha256: fe58c8c9b695e36e6b0ee5e4d81ff71ea0a4f1a11fa7bb16e8d6f1b35a58dff6 + uri: huggingface://bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF/FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" @@ -3608,6 +4313,20 @@ - filename: Marco-o1.Q4_K_M.gguf sha256: 54dd9554cb54609bf0bf4b367dfba192fc982a2fc6b87a0f56fba5ea82762d0d uri: huggingface://QuantFactory/Marco-o1-GGUF/Marco-o1.Q4_K_M.gguf +- !!merge <<: *qwen2 + name: "marco-o1-uncensored" + urls: + - https://huggingface.co/thirdeyeai/marco-o1-uncensored + - https://huggingface.co/QuantFactory/marco-o1-uncensored-GGUF + description: | + Uncensored version of marco-o1 + overrides: + parameters: + model: marco-o1-uncensored.Q4_K_M.gguf + files: + - filename: marco-o1-uncensored.Q4_K_M.gguf + sha256: ad0440270a7254098f90779744d3e5b34fe49b7baf97c819909ba9c5648cc0d9 + uri: huggingface://QuantFactory/marco-o1-uncensored-GGUF/marco-o1-uncensored.Q4_K_M.gguf - &mistral03 ## START Mistral url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master" @@ -4016,6 +4735,89 @@ - filename: magnum-12b-v2.5-kto.i1-Q4_K_M.gguf sha256: 07e91d2c6d4e42312e65a69c54f16be467575f7a596fe052993b388e38b90d76 uri: huggingface://mradermacher/magnum-12b-v2.5-kto-i1-GGUF/magnum-12b-v2.5-kto.i1-Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "chatty-harry_v3.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/66c1cc08453a7ef6c5fe657a/0KzNTEtn2kJJQsw4lQeY0.png + urls: + - https://huggingface.co/Triangle104/Chatty-Harry_V3.0 + - https://huggingface.co/QuantFactory/Chatty-Harry_V3.0-GGUF + description: | + This model was merged using the TIES merge method using Triangle104/ChatWaifu_Magnum_V0.2 as a base. + The following models were included in the merge: elinas/Chronos-Gold-12B-1.0 + overrides: + parameters: + model: Chatty-Harry_V3.0.Q4_K_M.gguf + files: + - filename: Chatty-Harry_V3.0.Q4_K_M.gguf + sha256: 54b63bb74498576ca77b801ed096657a93cc2f6b71d707c3605fdb394bd3e622 + uri: huggingface://QuantFactory/Chatty-Harry_V3.0-GGUF/Chatty-Harry_V3.0.Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "mn-chunky-lotus-12b" + icon: https://huggingface.co/FallenMerick/MN-Chunky-Lotus-12B/resolve/main/chunky-lotus.jpg + urls: + - https://huggingface.co/QuantFactory/MN-Chunky-Lotus-12B-GGUF + description: | + I had originally planned to use this model for future/further merges, but decided to go ahead and release it since it scored rather high on my local EQ Bench testing (79.58 w/ 100% parsed @ 8-bit). + Bear in mind that most models tend to score a bit higher on my own local tests as compared to their posted scores. Still, its the highest score I've personally seen from all the models I've tested. + Its a decent model, with great emotional intelligence and acceptable adherence to various character personalities. It does a good job at roleplaying despite being a bit bland at times. + + Overall, I like the way it writes, but it has a few formatting issues that show up from time to time, and it has an uncommon tendency to paste walls of character feelings/intentions at the end of some outputs without any prompting. This is something I hope to correct with future iterations. + This is a merge of pre-trained language models created using mergekit. 
+    The following models were included in the merge:
+    Epiculous/Violet_Twilight-v0.2
+    nbeerbower/mistral-nemo-gutenberg-12B-v4
+    flammenai/Mahou-1.5-mistral-nemo-12B
+  overrides:
+    parameters:
+      model: MN-Chunky-Lotus-12B.Q4_K_M.gguf
+  files:
+    - filename: MN-Chunky-Lotus-12B.Q4_K_M.gguf
+      sha256: 363defe0a769fdb715dab75517966a0a80bcdd981a610d4c759099b6c8ff143a
+      uri: huggingface://QuantFactory/MN-Chunky-Lotus-12B-GGUF/MN-Chunky-Lotus-12B.Q4_K_M.gguf
+- !!merge <<: *mistral03
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  name: "chronos-gold-12b-1.0"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/630417380907b9a115c6aa9f/3hc8zt8fzKdO3qHK1p1mW.webp
+  urls:
+    - https://huggingface.co/elinas/Chronos-Gold-12B-1.0
+    - https://huggingface.co/mradermacher/Chronos-Gold-12B-1.0-GGUF
+  description: |
+    Chronos Gold 12B 1.0 is a unique model that applies to domain areas such as general chatbot functionality, roleplay, and story writing. The model has been observed to write up to 2250 tokens in a single sequence. The model was trained at a sequence length of 16384 (16k) and will still retain the apparent 128k context length from Mistral-Nemo, though it deteriorates over time like regular Nemo does, based on the RULER test.
+
+    As a result, it is recommended to keep your maximum sequence length at 16384, or you will experience performance degradation.
+
+    The base model is mistralai/Mistral-Nemo-Base-2407, which was heavily modified to produce a more coherent model, comparable to much larger models.
+
+    Chronos Gold 12B-1.0 re-creates the uniqueness of the original Chronos with significantly enhanced prompt adherence (following), coherence, and a modern dataset, as well as supporting a majority of "character card" formats in applications like SillyTavern.
+
+    It went through an iterative and objective merge process, as with my previous models, and was further fine-tuned on a dataset curated for it.
+
+    The specifics of the model will not be disclosed at this time due to dataset ownership.
+  overrides:
+    parameters:
+      model: Chronos-Gold-12B-1.0.Q4_K_M.gguf
+  files:
+    - filename: Chronos-Gold-12B-1.0.Q4_K_M.gguf
+      sha256: d75a6ed28781f0ea6fa6e58c0b25dfecdd160d4cab64aaf511ea156e99a1e1f3
+      uri: huggingface://mradermacher/Chronos-Gold-12B-1.0-GGUF/Chronos-Gold-12B-1.0.Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "naturallm-7b-instruct"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  urls:
+    - https://huggingface.co/qingy2024/NaturalLM-7B-Instruct
+    - https://huggingface.co/bartowski/NaturalLM-7B-Instruct-GGUF
+  description: |
+    This Mistral 7B fine-tune is trained (for 150 steps) to talk like a human, not a "helpful assistant"!
+    It's also very beta right now. The dataset (qingy2024/Natural-Text-ShareGPT) can definitely be improved.
+ overrides: + parameters: + model: NaturalLM-7B-Instruct-Q4_K_M.gguf + files: + - filename: NaturalLM-7B-Instruct-Q4_K_M.gguf + sha256: 15b2f34116f690fea35790a9392b8a2190fe25827e370d426e88a2a543f4dcee + uri: huggingface://bartowski/NaturalLM-7B-Instruct-GGUF/NaturalLM-7B-Instruct-Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" @@ -4771,6 +5573,54 @@ - filename: G2-9B-Sugarquill-v0.Q4_K_M.gguf sha256: 790a2f1541011b2773e22aa863ef78c8662baaa7eca5875e9573007985120187 uri: huggingface://QuantFactory/G2-9B-Sugarquill-v0-GGUF/G2-9B-Sugarquill-v0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "volare-i1" + urls: + - https://huggingface.co/MoxoffSpA/Volare + - https://huggingface.co/mradermacher/Volare-i1-GGUF + description: | + Volare is an updated version of Gemma7B, specifically fine-tuned with SFT and LoRA adjustments. + It's trained on publicly available datasets, like SQUAD-it, and datasets we've created in-house. + it's designed to understand and maintain context, making it ideal for Retrieval Augmented Generation (RAG) tasks and applications requiring contextual awareness. + Italian dataset. + overrides: + parameters: + model: Volare.i1-Q4_K_M.gguf + files: + - filename: Volare.i1-Q4_K_M.gguf + sha256: fa8fb9d4cb19fcb44be8d53561c9e2840f45aed738de545983ebb158ebba461b + uri: huggingface://mradermacher/Volare-i1-GGUF/Volare.i1-Q4_K_M.gguf +- !!merge <<: *gemma + name: "bggpt-gemma-2-2.6b-it-v1.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/637e1f8cf7e01589cc17bf7e/p6d0YFHjWCQ3S12jWqO1m.png + urls: + - https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF + - https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF + description: | + INSAIT introduces BgGPT-Gemma-2-2.6B-IT-v1.0, a state-of-the-art Bulgarian language model based on google/gemma-2-2b and google/gemma-2-2b-it. BgGPT-Gemma-2-2.6B-IT-v1.0 is free to use and distributed under the Gemma Terms of Use. This model was created by INSAIT, part of Sofia University St. Kliment Ohridski, in Sofia, Bulgaria. + The model was built on top of Google’s Gemma 2 2B open models. It was continuously pre-trained on around 100 billion tokens (85 billion in Bulgarian) using the Branch-and-Merge strategy INSAIT presented at EMNLP’24, allowing the model to gain outstanding Bulgarian cultural and linguistic capabilities while retaining its English performance. During the pre-training stage, we use various datasets, including Bulgarian web crawl data, freely available datasets such as Wikipedia, a range of specialized Bulgarian datasets sourced by the INSAIT Institute, and machine translations of popular English datasets. The model was then instruction-fine-tuned on a newly constructed Bulgarian instruction dataset created using real-world conversations. For more information check our blogpost. 
+ overrides: + parameters: + model: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf + files: + - filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf + sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319 + uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "fusechat-gemma-2-9b-instruct" + icon: "https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct/resolve/main/FuseChat-3.0.png" + urls: + - https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct + - https://huggingface.co/bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. + overrides: + parameters: + model: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + sha256: f5aef201be68f344bebff3433af87aac6428fd227adfd7e468c8bfbcf9660ece + uri: huggingface://bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF/FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png @@ -7396,6 +8246,27 @@ - filename: LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf sha256: 5b88fb4537339996c04e4a1b6ef6a2d555c4103b6378e273ae9c6c5e77af67eb uri: huggingface://bartowski/LLAMA-3_8B_Unaligned_BETA-GGUF/LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf +- !!merge <<: *llama3 + name: "freyja-v4.95-maldv-7b-non-fiction-i1" + urls: + - https://huggingface.co/MrRobotoAI/Freyja-v4.95-maldv-7b-NON-FICTION + - https://huggingface.co/mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF + description: | + This model was merged using the Model Stock merge method using aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K as a base. 
+ The following models were included in the merge: + maldv/llama-3-fantasy-writer-8b + maldv/badger-iota-llama-3-8b + maldv/badger-lambda-llama-3-8b + maldv/badger-mu-llama-3-8b + maldv/badger-kappa-llama-3-8b + maldv/badger-writer-llama-3-8b + overrides: + parameters: + model: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf + files: + - filename: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf + sha256: cdc0f4de6df2ba120835fbd25c2a0ae2af8548f46d2c40c7a018c51c3d19e0c0 + uri: huggingface://mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF/Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf - &chatml ### ChatML url: "github:mudler/LocalAI/gallery/chatml.yaml@master" @@ -7623,6 +8494,43 @@ - filename: Meta-Llama-3-8B-Instruct-exp5-11-Q4_K_M.gguf sha256: 5dd81b8b809667d10036499affdd1461cf95af50b405cbc9f800b421a4b60e98 uri: huggingface://DavidAU/Meta-Llama-3-Instruct-8.9B-BRAINSTORM-5x-FORM-11-GGUF/Meta-Llama-3-8B-Instruct-exp5-11-Q4_K_M.gguf +- !!merge <<: *llama3 + name: "rp-naughty-v1.0c-8b" + urls: + - https://huggingface.co/QuantFactory/RP-Naughty-v1.0c-8b-GGUF + description: | + This model was merged using the Model Stock merge method using aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K as a base. + The following models were included in the merge: + + underwoods/adventure-8b + Khetterman/Multilingual-SaigaSuzume-8B + underwoods/writer-8b + Khetterman/Kosmos-8B-v1 + Khetterman/CursedMatrix-8B-v9 + overrides: + parameters: + model: RP-Naughty-v1.0c-8b.Q4_K_M.gguf + files: + - filename: RP-Naughty-v1.0c-8b.Q4_K_M.gguf + sha256: c344564d26d0c3d244d31cfeb103666eab37f9dee6678a2dbaf5bfcf4109d789 + uri: huggingface://QuantFactory/RP-Naughty-v1.0c-8b-GGUF/RP-Naughty-v1.0c-8b.Q4_K_M.gguf +- !!merge <<: *llama3 + name: "bio-medical-llama-3-8b" + icon: https://cdn-uploads.huggingface.co/production/uploads/653f5b93cd52f288490edc83/zPMUugzfOiwTiRw88jm7T.jpeg + urls: + - https://huggingface.co/ContactDoctor/Bio-Medical-Llama-3-8B + - https://huggingface.co/QuantFactory/Bio-Medical-Llama-3-8B-GGUF + description: | + Bio-Medical-Llama-3-8B model is a specialized large language model designed for biomedical applications. It is finetuned from the meta-llama/Meta-Llama-3-8B-Instruct model using a custom dataset containing over 500,000 diverse entries. These entries include a mix of synthetic and manually curated data, ensuring high quality and broad coverage of biomedical topics. + + The model is trained to understand and generate text related to various biomedical fields, making it a valuable tool for researchers, clinicians, and other professionals in the biomedical domain. + overrides: + parameters: + model: Bio-Medical-Llama-3-8B.Q4_K_M.gguf + files: + - filename: Bio-Medical-Llama-3-8B.Q4_K_M.gguf + sha256: 672939e0487d02c55734132c25a59f26e4deaac7cd49445a7028f2291139edcc + uri: huggingface://QuantFactory/Bio-Medical-Llama-3-8B-GGUF/Bio-Medical-Llama-3-8B.Q4_K_M.gguf - &command-R ### START Command-r url: "github:mudler/LocalAI/gallery/command-r.yaml@master" @@ -8638,6 +9546,43 @@ overrides: parameters: model: black-forest-labs/FLUX.1-schnell +- name: flux.1-dev-ggml + license: flux-1-dev-non-commercial-license + url: "github:mudler/LocalAI/gallery/flux-ggml.yaml@master" + description: | + FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our blog post. + Key Features + Cutting-edge output quality, second only to our state-of-the-art model FLUX.1 [pro]. 
+ Competitive prompt following, matching the performance of closed source alternatives . + Trained using guidance distillation, making FLUX.1 [dev] more efficient. + Open weights to drive new scientific research, and empower artists to develop innovative workflows. + Generated outputs can be used for personal, scientific, and commercial purposes as described in the flux-1-dev-non-commercial-license. + This model is quantized with GGUF + urls: + - https://huggingface.co/black-forest-labs/FLUX.1-dev + - https://huggingface.co/city96/FLUX.1-dev-gguf + tags: + - text-to-image + - flux + - gpu + - cpu + icon: https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/schnell_grid.jpeg + overrides: + parameters: + model: flux1-dev-Q2_K.gguf + files: + - filename: "flux1-dev-Q2_K.gguf" + sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105" + uri: "huggingface://city96/FLUX.1-dev-gguf/flux1-dev-Q2_K.gguf" + - filename: ae.safetensors + sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38 + uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors + - filename: clip_l.safetensors + sha256: 660c6f5b1abae9dc498ac2d21e1347d2abdb0cf6c0c0c8576cd796491d9a6cdd + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors + - filename: t5xxl_fp16.safetensors + sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635 + uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors - &whisper ## Whisper url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" @@ -8806,6 +9751,10 @@ llama3.2 embeddings model. Using as drop-in replacement for bert-embeddings tags: - embeddings + overrides: + embeddings: true + parameters: + model: llama-3.2-1b-instruct-q4_k_m.gguf ## Stable Diffusion - url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master license: "BSD-3" @@ -9423,3 +10372,22 @@ - filename: silero-vad.onnx uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808 +- name: "bark-cpp-small" + url: github:mudler/LocalAI/gallery/virtual.yaml@master + license: mit + urls: + - https://huggingface.co/suno/bark + - https://huggingface.co/Green-Sky/bark-ggml + description: | + Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints ready for inference. 
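+  # Illustrative usage sketch (editor's assumption, not part of the upstream model card):
+  # once this entry is installed, the voice can be exercised through LocalAI's TTS endpoint,
+  # for example:
+  #   curl http://localhost:8080/tts -H "Content-Type: application/json" \
+  #     -d '{"backend": "bark-cpp", "model": "bark-cpp-small", "input": "Hello from bark.cpp"}'
+  # The request fields and local host/port are assumptions; adjust them to your deployment.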
+ tags: + - tts + - cpu + overrides: + backend: bark-cpp + parameters: + model: bark-small_weights-f16.bin + files: + - filename: bark-small_weights-f16.bin + uri: https://huggingface.co/Green-Sky/bark-ggml/resolve/main/bark-small_weights-f16.bin + sha256: de1ece17e8319537b3a7909baebbd28affab23c942d5d57e648d622af4e2feaa diff --git a/gallery/rwkv.yaml b/gallery/rwkv.yaml index 41dfcfad..68693799 100644 --- a/gallery/rwkv.yaml +++ b/gallery/rwkv.yaml @@ -16,6 +16,7 @@ config_file: | stopwords: - 'Assistant:' + - '' template: chat: "{{.Input}}\nAssistant: " diff --git a/go.mod b/go.mod index 3bc625ac..e9bcf3ec 100644 --- a/go.mod +++ b/go.mod @@ -76,6 +76,7 @@ require ( cloud.google.com/go/auth v0.4.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect cloud.google.com/go/compute/metadata v0.3.0 // indirect + github.com/dustin/go-humanize v1.0.1 // indirect github.com/envoyproxy/protoc-gen-validate v1.0.4 // indirect github.com/fasthttp/websocket v1.5.3 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect @@ -84,8 +85,12 @@ require ( github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.4 // indirect + github.com/json-iterator/go v1.1.12 // indirect github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/nikolalohinski/gonja/v2 v2.3.2 // indirect github.com/pion/datachannel v1.5.8 // indirect github.com/pion/dtls/v2 v2.2.12 // indirect github.com/pion/ice/v2 v2.3.34 // indirect diff --git a/go.sum b/go.sum index 11b87fa9..f1628f7a 100644 --- a/go.sum +++ b/go.sum @@ -140,6 +140,8 @@ github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elastic/gosigar v0.12.0/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= github.com/elastic/gosigar v0.14.3 h1:xwkKwPia+hSfg9GqrCUKYdId102m9qTJIIr7egmK/uo= github.com/elastic/gosigar v0.14.3/go.mod h1:iXRIGg2tLnu7LBdpqzyQfGDEidKCfWcCMS0WKyPWoMs= @@ -268,6 +270,7 @@ github.com/google/go-containerregistry v0.19.2 h1:TannFKE1QSajsP6hPWb5oJNgKe1IKj github.com/google/go-containerregistry v0.19.2/go.mod h1:YCMFNQeeXeLF+dnhhWkqDItx/JSkH01j1Kis4PsjzFI= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF8= github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= @@ -353,6 +356,8 @@ github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwA 
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= @@ -474,8 +479,12 @@ github.com/moby/sys/sequential v0.5.0 h1:OPvI35Lzn9K04PBbCLW0g4LcFAJgHsvXsRyewg5 github.com/moby/sys/sequential v0.5.0/go.mod h1:tH2cOOs5V9MlPiXcQzRC+eEyab644PWKGRYaaV5ZZlo= github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/mr-tron/base58 v1.1.2/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o= github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= @@ -519,6 +528,9 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/neelance/astrewrite v0.0.0-20160511093645-99348263ae86/go.mod h1:kHJEU3ofeGjhHklVoIGuVj85JJwZ6kWPaJwCIxgnFmo= github.com/neelance/sourcemap v0.0.0-20151028013722-8c68805598ab/go.mod h1:Qr6/a/Q4r9LP1IltGz7tA7iOK1WonHEYhu1HRBA7ZiM= +github.com/nikolalohinski/gonja v1.5.3 h1:GsA+EEaZDZPGJ8JtpeGN78jidhOlxeJROpqMT9fTj9c= +github.com/nikolalohinski/gonja/v2 v2.3.2 h1:UgLFfqi7L9XfX0PEcE4eUpvGojVQL5KhBfJJaBp7ZxY= +github.com/nikolalohinski/gonja/v2 v2.3.2/go.mod h1:1Wcc/5huTu6y36e0sOFR1XQoFlylw3c3H3L5WOz0RDg= github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ= github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= diff --git a/pkg/model/loader.go b/pkg/model/loader.go index b32e3745..d62f52b2 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -9,8 +9,6 @@ import ( "sync" "time" - "github.com/mudler/LocalAI/pkg/templates" - "github.com/mudler/LocalAI/pkg/utils" "github.com/rs/zerolog/log" @@ -23,7 +21,6 @@ type ModelLoader struct { ModelPath string mu sync.Mutex models map[string]*Model - templates *templates.TemplateCache wd *WatchDog } @@ -31,7 +28,6 @@ func NewModelLoader(modelPath string) *ModelLoader { nml := &ModelLoader{ ModelPath: modelPath, models: 
make(map[string]*Model), - templates: templates.NewTemplateCache(modelPath), } return nml diff --git a/pkg/model/template.go b/pkg/model/template.go deleted file mode 100644 index 3dc850cf..00000000 --- a/pkg/model/template.go +++ /dev/null @@ -1,52 +0,0 @@ -package model - -import ( - "fmt" - - "github.com/mudler/LocalAI/pkg/functions" - "github.com/mudler/LocalAI/pkg/templates" -) - -// Rather than pass an interface{} to the prompt template: -// These are the definitions of all possible variables LocalAI will currently populate for use in a prompt template file -// Please note: Not all of these are populated on every endpoint - your template should either be tested for each endpoint you map it to, or tolerant of zero values. -type PromptTemplateData struct { - SystemPrompt string - SuppressSystemPrompt bool // used by chat specifically to indicate that SystemPrompt above should be _ignored_ - Input string - Instruction string - Functions []functions.Function - MessageIndex int -} - -type ChatMessageTemplateData struct { - SystemPrompt string - Role string - RoleName string - FunctionName string - Content string - MessageIndex int - Function bool - FunctionCall interface{} - LastMessage bool -} - -const ( - ChatPromptTemplate templates.TemplateType = iota - ChatMessageTemplate - CompletionPromptTemplate - EditPromptTemplate - FunctionsPromptTemplate -) - -func (ml *ModelLoader) EvaluateTemplateForPrompt(templateType templates.TemplateType, templateName string, in PromptTemplateData) (string, error) { - // TODO: should this check be improved? - if templateType == ChatMessageTemplate { - return "", fmt.Errorf("invalid templateType: ChatMessage") - } - return ml.templates.EvaluateTemplate(templateType, templateName, in) -} - -func (ml *ModelLoader) EvaluateTemplateForChatMessage(templateName string, messageData ChatMessageTemplateData) (string, error) { - return ml.templates.EvaluateTemplate(ChatMessageTemplate, templateName, messageData) -} diff --git a/pkg/model/template_test.go b/pkg/model/template_test.go deleted file mode 100644 index 1142ed0c..00000000 --- a/pkg/model/template_test.go +++ /dev/null @@ -1,197 +0,0 @@ -package model_test - -import ( - . "github.com/mudler/LocalAI/pkg/model" - - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" -) - -const chatML = `<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}} -{{- if .FunctionCall }} - -{{- else if eq .RoleName "tool" }} - -{{- end }} -{{- if .Content}} -{{.Content }} -{{- end }} -{{- if .FunctionCall}} -{{toJson .FunctionCall}} -{{- end }} -{{- if .FunctionCall }} - -{{- else if eq .RoleName "tool" }} - -{{- end }}<|im_end|>` - -const llama3 = `<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> - -{{ if .FunctionCall -}} -Function call: -{{ else if eq .RoleName "tool" -}} -Function response: -{{ end -}} -{{ if .Content -}} -{{.Content -}} -{{ else if .FunctionCall -}} -{{ toJson .FunctionCall -}} -{{ end -}} -<|eot_id|>` - -var llama3TestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ - "user": { - "template": llama3, - "expected": "<|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "user", - RoleName: "user", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "assistant": { - "template": llama3, - "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_call": { - "template": llama3, - "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nFunction call:\n{\"function\":\"test\"}<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "", - FunctionCall: map[string]string{"function": "test"}, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_response": { - "template": llama3, - "expected": "<|start_header_id|>tool<|end_header_id|>\n\nFunction response:\nResponse from tool<|eot_id|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "tool", - RoleName: "tool", - Content: "Response from tool", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, -} - -var chatMLTestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ - "user": { - "template": chatML, - "expected": "<|im_start|>user\nA long time ago in a galaxy far, far away...<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "user", - RoleName: "user", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "assistant": { - "template": chatML, - "expected": "<|im_start|>assistant\nA long time ago in a galaxy far, far away...<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "A long time ago in a galaxy far, far away...", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 
0, - }, - }, - "function_call": { - "template": chatML, - "expected": "<|im_start|>assistant\n\n{\"function\":\"test\"}\n<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "assistant", - RoleName: "assistant", - Content: "", - FunctionCall: map[string]string{"function": "test"}, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, - "function_response": { - "template": chatML, - "expected": "<|im_start|>tool\n\nResponse from tool\n<|im_end|>", - "data": ChatMessageTemplateData{ - SystemPrompt: "", - Role: "tool", - RoleName: "tool", - Content: "Response from tool", - FunctionCall: nil, - FunctionName: "", - LastMessage: false, - Function: false, - MessageIndex: 0, - }, - }, -} - -var _ = Describe("Templates", func() { - Context("chat message ChatML", func() { - var modelLoader *ModelLoader - BeforeEach(func() { - modelLoader = NewModelLoader("") - }) - for key := range chatMLTestMatch { - foo := chatMLTestMatch[key] - It("renders correctly `"+key+"`", func() { - templated, err := modelLoader.EvaluateTemplateForChatMessage(foo["template"].(string), foo["data"].(ChatMessageTemplateData)) - Expect(err).ToNot(HaveOccurred()) - Expect(templated).To(Equal(foo["expected"]), templated) - }) - } - }) - Context("chat message llama3", func() { - var modelLoader *ModelLoader - BeforeEach(func() { - modelLoader = NewModelLoader("") - }) - for key := range llama3TestMatch { - foo := llama3TestMatch[key] - It("renders correctly `"+key+"`", func() { - templated, err := modelLoader.EvaluateTemplateForChatMessage(foo["template"].(string), foo["data"].(ChatMessageTemplateData)) - Expect(err).ToNot(HaveOccurred()) - Expect(templated).To(Equal(foo["expected"]), templated) - }) - } - }) -}) diff --git a/pkg/templates/cache.go b/pkg/templates/cache.go index e4801946..1efce660 100644 --- a/pkg/templates/cache.go +++ b/pkg/templates/cache.go @@ -11,59 +11,41 @@ import ( "github.com/mudler/LocalAI/pkg/utils" "github.com/Masterminds/sprig/v3" + + "github.com/nikolalohinski/gonja/v2" + "github.com/nikolalohinski/gonja/v2/exec" ) // Keep this in sync with config.TemplateConfig. Is there a more idiomatic way to accomplish this in go? 
// Technically, order doesn't _really_ matter, but the count must stay in sync, see tests/integration/reflect_test.go type TemplateType int -type TemplateCache struct { - mu sync.Mutex - templatesPath string - templates map[TemplateType]map[string]*template.Template +type templateCache struct { + mu sync.Mutex + templatesPath string + templates map[TemplateType]map[string]*template.Template + jinjaTemplates map[TemplateType]map[string]*exec.Template } -func NewTemplateCache(templatesPath string) *TemplateCache { - tc := &TemplateCache{ - templatesPath: templatesPath, - templates: make(map[TemplateType]map[string]*template.Template), +func newTemplateCache(templatesPath string) *templateCache { + tc := &templateCache{ + templatesPath: templatesPath, + templates: make(map[TemplateType]map[string]*template.Template), + jinjaTemplates: make(map[TemplateType]map[string]*exec.Template), } return tc } -func (tc *TemplateCache) initializeTemplateMapKey(tt TemplateType) { +func (tc *templateCache) initializeTemplateMapKey(tt TemplateType) { if _, ok := tc.templates[tt]; !ok { tc.templates[tt] = make(map[string]*template.Template) } } -func (tc *TemplateCache) EvaluateTemplate(templateType TemplateType, templateName string, in interface{}) (string, error) { - tc.mu.Lock() - defer tc.mu.Unlock() - - tc.initializeTemplateMapKey(templateType) - m, ok := tc.templates[templateType][templateName] - if !ok { - // return "", fmt.Errorf("template not loaded: %s", templateName) - loadErr := tc.loadTemplateIfExists(templateType, templateName) - if loadErr != nil { - return "", loadErr - } - m = tc.templates[templateType][templateName] // ok is not important since we check m on the next line, and wealready checked - } - if m == nil { - return "", fmt.Errorf("failed loading a template for %s", templateName) - } - - var buf bytes.Buffer - - if err := m.Execute(&buf, in); err != nil { - return "", err - } - return buf.String(), nil +func (tc *templateCache) existsInModelPath(s string) bool { + return utils.ExistsInPath(tc.templatesPath, s) } - -func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templateName string) error { +func (tc *templateCache) loadTemplateIfExists(templateType TemplateType, templateName string) error { // Check if the template was already loaded if _, ok := tc.templates[templateType][templateName]; ok { @@ -82,6 +64,51 @@ func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templat return fmt.Errorf("template file outside path: %s", file) } + // can either be a file in the system or a string with the template + if tc.existsInModelPath(modelTemplateFile) { + d, err := os.ReadFile(file) + if err != nil { + return err + } + dat = string(d) + } else { + dat = templateName + } + + // Parse the template + tmpl, err := template.New("prompt").Funcs(sprig.FuncMap()).Parse(dat) + if err != nil { + return err + } + tc.templates[templateType][templateName] = tmpl + + return nil +} + +func (tc *templateCache) initializeJinjaTemplateMapKey(tt TemplateType) { + if _, ok := tc.jinjaTemplates[tt]; !ok { + tc.jinjaTemplates[tt] = make(map[string]*exec.Template) + } +} + +func (tc *templateCache) loadJinjaTemplateIfExists(templateType TemplateType, templateName string) error { + // Check if the template was already loaded + if _, ok := tc.jinjaTemplates[templateType][templateName]; ok { + return nil + } + + // Check if the model path exists + // skip any error here - we run anyway if a template does not exist + modelTemplateFile := fmt.Sprintf("%s.tmpl", templateName) + 
+ dat := "" + file := filepath.Join(tc.templatesPath, modelTemplateFile) + + // Security check + if err := utils.VerifyPath(modelTemplateFile, tc.templatesPath); err != nil { + return fmt.Errorf("template file outside path: %s", file) + } + // can either be a file in the system or a string with the template if utils.ExistsInPath(tc.templatesPath, modelTemplateFile) { d, err := os.ReadFile(file) @@ -93,12 +120,65 @@ func (tc *TemplateCache) loadTemplateIfExists(templateType TemplateType, templat dat = templateName } - // Parse the template - tmpl, err := template.New("prompt").Funcs(sprig.FuncMap()).Parse(dat) + tmpl, err := gonja.FromString(dat) if err != nil { return err } - tc.templates[templateType][templateName] = tmpl + tc.jinjaTemplates[templateType][templateName] = tmpl return nil } + +func (tc *templateCache) evaluateJinjaTemplate(templateType TemplateType, templateNameOrContent string, in map[string]interface{}) (string, error) { + tc.mu.Lock() + defer tc.mu.Unlock() + + tc.initializeJinjaTemplateMapKey(templateType) + m, ok := tc.jinjaTemplates[templateType][templateNameOrContent] + if !ok { + // return "", fmt.Errorf("template not loaded: %s", templateName) + loadErr := tc.loadJinjaTemplateIfExists(templateType, templateNameOrContent) + if loadErr != nil { + return "", loadErr + } + m = tc.jinjaTemplates[templateType][templateNameOrContent] // ok is not important since we check m on the next line, and wealready checked + } + if m == nil { + return "", fmt.Errorf("failed loading a template for %s", templateNameOrContent) + } + + var buf bytes.Buffer + + data := exec.NewContext(in) + + if err := m.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + +func (tc *templateCache) evaluateTemplate(templateType TemplateType, templateNameOrContent string, in interface{}) (string, error) { + tc.mu.Lock() + defer tc.mu.Unlock() + + tc.initializeTemplateMapKey(templateType) + m, ok := tc.templates[templateType][templateNameOrContent] + if !ok { + // return "", fmt.Errorf("template not loaded: %s", templateName) + loadErr := tc.loadTemplateIfExists(templateType, templateNameOrContent) + if loadErr != nil { + return "", loadErr + } + m = tc.templates[templateType][templateNameOrContent] // ok is not important since we check m on the next line, and wealready checked + } + if m == nil { + return "", fmt.Errorf("failed loading a template for %s", templateNameOrContent) + } + + var buf bytes.Buffer + + if err := m.Execute(&buf, in); err != nil { + return "", err + } + return buf.String(), nil +} diff --git a/pkg/templates/cache_test.go b/pkg/templates/cache_test.go deleted file mode 100644 index 8bb50766..00000000 --- a/pkg/templates/cache_test.go +++ /dev/null @@ -1,73 +0,0 @@ -package templates_test - -import ( - "os" - "path/filepath" - - "github.com/mudler/LocalAI/pkg/templates" // Update with your module path - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" -) - -var _ = Describe("TemplateCache", func() { - var ( - templateCache *templates.TemplateCache - tempDir string - ) - - BeforeEach(func() { - var err error - tempDir, err = os.MkdirTemp("", "templates") - Expect(err).NotTo(HaveOccurred()) - - // Writing example template files - err = os.WriteFile(filepath.Join(tempDir, "example.tmpl"), []byte("Hello, {{.Name}}!"), 0600) - Expect(err).NotTo(HaveOccurred()) - err = os.WriteFile(filepath.Join(tempDir, "empty.tmpl"), []byte(""), 0600) - Expect(err).NotTo(HaveOccurred()) - - templateCache = templates.NewTemplateCache(tempDir) - }) - - AfterEach(func() { - os.RemoveAll(tempDir) // Clean up - }) - - Describe("EvaluateTemplate", func() { - Context("when template is loaded successfully", func() { - It("should evaluate the template correctly", func() { - result, err := templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - Expect(err).NotTo(HaveOccurred()) - Expect(result).To(Equal("Hello, Gopher!")) - }) - }) - - Context("when template isn't a file", func() { - It("should parse from string", func() { - result, err := templateCache.EvaluateTemplate(1, "{{.Name}}", map[string]string{"Name": "Gopher"}) - Expect(err).ToNot(HaveOccurred()) - Expect(result).To(Equal("Gopher")) - }) - }) - - Context("when template is empty", func() { - It("should return an empty string", func() { - result, err := templateCache.EvaluateTemplate(1, "empty", nil) - Expect(err).NotTo(HaveOccurred()) - Expect(result).To(Equal("")) - }) - }) - }) - - Describe("concurrency", func() { - It("should handle multiple concurrent accesses", func(done Done) { - go func() { - _, _ = templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - }() - go func() { - _, _ = templateCache.EvaluateTemplate(1, "example", map[string]string{"Name": "Gopher"}) - }() - close(done) - }, 0.1) // timeout in seconds - }) -}) diff --git a/pkg/templates/evaluator.go b/pkg/templates/evaluator.go new file mode 100644 index 00000000..aedf7b41 --- /dev/null +++ b/pkg/templates/evaluator.go @@ -0,0 +1,295 @@ +package templates + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/functions" + "github.com/rs/zerolog/log" +) + +// Rather than pass an interface{} to the prompt template: +// These are the definitions of all possible variables LocalAI will currently populate for use in a prompt template file +// Please note: Not all of these are populated on every endpoint - your template should either be tested for each endpoint you map it to, or tolerant of zero values. 
+type PromptTemplateData struct { + SystemPrompt string + SuppressSystemPrompt bool // used by chat specifically to indicate that SystemPrompt above should be _ignored_ + Input string + Instruction string + Functions []functions.Function + MessageIndex int +} + +type ChatMessageTemplateData struct { + SystemPrompt string + Role string + RoleName string + FunctionName string + Content string + MessageIndex int + Function bool + FunctionCall interface{} + LastMessage bool +} + +const ( + ChatPromptTemplate TemplateType = iota + ChatMessageTemplate + CompletionPromptTemplate + EditPromptTemplate + FunctionsPromptTemplate +) + +type Evaluator struct { + cache *templateCache +} + +func NewEvaluator(modelPath string) *Evaluator { + return &Evaluator{ + cache: newTemplateCache(modelPath), + } +} + +func (e *Evaluator) EvaluateTemplateForPrompt(templateType TemplateType, config config.BackendConfig, in PromptTemplateData) (string, error) { + template := "" + + // A model can have a "file.bin.tmpl" file associated with a prompt template prefix + if e.cache.existsInModelPath(fmt.Sprintf("%s.tmpl", config.Model)) { + template = config.Model + } + + switch templateType { + case CompletionPromptTemplate: + if config.TemplateConfig.Completion != "" { + template = config.TemplateConfig.Completion + } + case EditPromptTemplate: + if config.TemplateConfig.Edit != "" { + template = config.TemplateConfig.Edit + } + case ChatPromptTemplate: + if config.TemplateConfig.Chat != "" { + template = config.TemplateConfig.Chat + } + case FunctionsPromptTemplate: + if config.TemplateConfig.Functions != "" { + template = config.TemplateConfig.Functions + } + } + + if template == "" { + return in.Input, nil + } + + if config.TemplateConfig.JinjaTemplate { + return e.evaluateJinjaTemplateForPrompt(templateType, template, in) + } + + return e.cache.evaluateTemplate(templateType, template, in) +} + +func (e *Evaluator) evaluateTemplateForChatMessage(templateName string, messageData ChatMessageTemplateData) (string, error) { + return e.cache.evaluateTemplate(ChatMessageTemplate, templateName, messageData) +} + +func (e *Evaluator) templateJinjaChat(templateName string, messageData []ChatMessageTemplateData, funcs []functions.Function) (string, error) { + + conversation := make(map[string]interface{}) + messages := make([]map[string]interface{}, len(messageData)) + + // convert from ChatMessageTemplateData to what the jinja template expects + + for _, message := range messageData { + // TODO: this seems to cover minimum text templates. 
Can be expanded to cover more complex interactions + var data []byte + data, _ = json.Marshal(message.FunctionCall) + messages = append(messages, map[string]interface{}{ + "role": message.RoleName, + "content": message.Content, + "tool_call": string(data), + }) + } + + conversation["messages"] = messages + + // if tools are detected, add these + if len(funcs) > 0 { + conversation["tools"] = funcs + } + + return e.cache.evaluateJinjaTemplate(ChatMessageTemplate, templateName, conversation) +} + +func (e *Evaluator) evaluateJinjaTemplateForPrompt(templateType TemplateType, templateName string, in PromptTemplateData) (string, error) { + + conversation := make(map[string]interface{}) + + conversation["system_prompt"] = in.SystemPrompt + conversation["content"] = in.Input + + return e.cache.evaluateJinjaTemplate(templateType, templateName, conversation) +} + +func (e *Evaluator) TemplateMessages(messages []schema.Message, config *config.BackendConfig, funcs []functions.Function, shouldUseFn bool) string { + + if config.TemplateConfig.JinjaTemplate { + var messageData []ChatMessageTemplateData + for messageIndex, i := range messages { + fcall := i.FunctionCall + if len(i.ToolCalls) > 0 { + fcall = i.ToolCalls + } + messageData = append(messageData, ChatMessageTemplateData{ + SystemPrompt: config.SystemPrompt, + Role: config.Roles[i.Role], + RoleName: i.Role, + Content: i.StringContent, + FunctionCall: fcall, + FunctionName: i.Name, + LastMessage: messageIndex == (len(messages) - 1), + Function: config.Grammar != "" && (messageIndex == (len(messages) - 1)), + MessageIndex: messageIndex, + }) + } + + templatedInput, err := e.templateJinjaChat(config.TemplateConfig.ChatMessage, messageData, funcs) + if err == nil { + return templatedInput + } + } + + var predInput string + suppressConfigSystemPrompt := false + mess := []string{} + for messageIndex, i := range messages { + var content string + role := i.Role + + // if function call, we might want to customize the role so we can display better that the "assistant called a json action" + // if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request + if (i.FunctionCall != nil || i.ToolCalls != nil) && i.Role == "assistant" { + roleFn := "assistant_function_call" + r := config.Roles[roleFn] + if r != "" { + role = roleFn + } + } + r := config.Roles[role] + contentExists := i.Content != nil && i.StringContent != "" + + fcall := i.FunctionCall + if len(i.ToolCalls) > 0 { + fcall = i.ToolCalls + } + + // First attempt to populate content via a chat message specific template + if config.TemplateConfig.ChatMessage != "" { + chatMessageData := ChatMessageTemplateData{ + SystemPrompt: config.SystemPrompt, + Role: r, + RoleName: role, + Content: i.StringContent, + FunctionCall: fcall, + FunctionName: i.Name, + LastMessage: messageIndex == (len(messages) - 1), + Function: config.Grammar != "" && (messageIndex == (len(messages) - 1)), + MessageIndex: messageIndex, + } + templatedChatMessage, err := e.evaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData) + if err != nil { + log.Error().Err(err).Interface("message", chatMessageData).Str("template", config.TemplateConfig.ChatMessage).Msg("error processing message with template, skipping") + } else { + if templatedChatMessage == "" { + log.Warn().Msgf("template \"%s\" produced blank output for %+v. 
Skipping!", config.TemplateConfig.ChatMessage, chatMessageData) + continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf + } + log.Debug().Msgf("templated message for chat: %s", templatedChatMessage) + content = templatedChatMessage + } + } + + marshalAnyRole := func(f any) { + j, err := json.Marshal(f) + if err == nil { + if contentExists { + content += "\n" + fmt.Sprint(r, " ", string(j)) + } else { + content = fmt.Sprint(r, " ", string(j)) + } + } + } + marshalAny := func(f any) { + j, err := json.Marshal(f) + if err == nil { + if contentExists { + content += "\n" + string(j) + } else { + content = string(j) + } + } + } + // If this model doesn't have such a template, or if that template fails to return a value, template at the message level. + if content == "" { + if r != "" { + if contentExists { + content = fmt.Sprint(r, i.StringContent) + } + + if i.FunctionCall != nil { + marshalAnyRole(i.FunctionCall) + } + if i.ToolCalls != nil { + marshalAnyRole(i.ToolCalls) + } + } else { + if contentExists { + content = fmt.Sprint(i.StringContent) + } + if i.FunctionCall != nil { + marshalAny(i.FunctionCall) + } + if i.ToolCalls != nil { + marshalAny(i.ToolCalls) + } + } + // Special Handling: System. We care if it was printed at all, not the r branch, so check seperately + if contentExists && role == "system" { + suppressConfigSystemPrompt = true + } + } + + mess = append(mess, content) + } + + joinCharacter := "\n" + if config.TemplateConfig.JoinChatMessagesByCharacter != nil { + joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter + } + + predInput = strings.Join(mess, joinCharacter) + log.Debug().Msgf("Prompt (before templating): %s", predInput) + + promptTemplate := ChatPromptTemplate + + if config.TemplateConfig.Functions != "" && shouldUseFn { + promptTemplate = FunctionsPromptTemplate + } + + templatedInput, err := e.EvaluateTemplateForPrompt(promptTemplate, *config, PromptTemplateData{ + SystemPrompt: config.SystemPrompt, + SuppressSystemPrompt: suppressConfigSystemPrompt, + Input: predInput, + Functions: funcs, + }) + if err == nil { + predInput = templatedInput + log.Debug().Msgf("Template found, input modified to: %s", predInput) + } else { + log.Debug().Msgf("Template failed loading: %s", err.Error()) + } + + return predInput +} diff --git a/pkg/templates/evaluator_test.go b/pkg/templates/evaluator_test.go new file mode 100644 index 00000000..b58dd40b --- /dev/null +++ b/pkg/templates/evaluator_test.go @@ -0,0 +1,253 @@ +package templates_test + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/functions" + . "github.com/mudler/LocalAI/pkg/templates" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +const toolCallJinja = `{{ '<|begin_of_text|>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|> + +' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|> + +' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}` + +const chatML = `<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}} +{{- if .FunctionCall }} + +{{- else if eq .RoleName "tool" }} + +{{- end }} +{{- if .Content}} +{{.Content }} +{{- end }} +{{- if .FunctionCall}} +{{toJson .FunctionCall}} +{{- end }} +{{- if .FunctionCall }} + +{{- else if eq .RoleName "tool" }} + +{{- end }}<|im_end|>` + +const llama3 = `<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> + +{{ if .FunctionCall -}} +Function call: +{{ else if eq .RoleName "tool" -}} +Function response: +{{ end -}} +{{ if .Content -}} +{{.Content -}} +{{ else if .FunctionCall -}} +{{ toJson .FunctionCall -}} +{{ end -}} +<|eot_id|>` + +var llama3TestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, + "assistant": { + "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + "shouldUseFn": false, + }, + "function_call": { + + "expected": "<|start_header_id|>assistant<|end_header_id|>\n\nFunction call:\n{\"function\":\"test\"}<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + FunctionCall: map[string]string{"function": "test"}, + }, + }, + "shouldUseFn": false, + }, + "function_response": { + "expected": "<|start_header_id|>tool<|end_header_id|>\n\nFunction response:\nResponse from tool<|eot_id|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: llama3, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "tool", + StringContent: "Response from tool", + }, + }, + "shouldUseFn": false, + }, +} + +var chatMLTestMatch map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|im_start|>user\nA long time ago in a galaxy far, far 
away...<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, + "assistant": { + "expected": "<|im_start|>assistant\nA long time ago in a galaxy far, far away...<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "messages": []schema.Message{ + { + Role: "assistant", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + "shouldUseFn": false, + }, + "function_call": { + "expected": "<|im_start|>assistant\n\n{\"function\":\"test\"}\n<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{ + { + Name: "test", + Description: "test", + Parameters: nil, + }, + }, + "shouldUseFn": true, + "messages": []schema.Message{ + { + Role: "assistant", + FunctionCall: map[string]string{"function": "test"}, + }, + }, + }, + "function_response": { + "expected": "<|im_start|>tool\n\nResponse from tool\n<|im_end|>", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: chatML, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "tool", + StringContent: "Response from tool", + }, + }, + }, +} + +var jinjaTest map[string]map[string]interface{} = map[string]map[string]interface{}{ + "user": { + "expected": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA long time ago in a galaxy far, far away...<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + "config": &config.BackendConfig{ + TemplateConfig: config.TemplateConfig{ + ChatMessage: toolCallJinja, + JinjaTemplate: true, + }, + }, + "functions": []functions.Function{}, + "shouldUseFn": false, + "messages": []schema.Message{ + { + Role: "user", + StringContent: "A long time ago in a galaxy far, far away...", + }, + }, + }, +} +var _ = Describe("Templates", func() { + Context("chat message ChatML", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range chatMLTestMatch { + foo := chatMLTestMatch[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) + Context("chat message llama3", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range llama3TestMatch { + foo := llama3TestMatch[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) + Context("chat message jinja", func() { + var evaluator *Evaluator + BeforeEach(func() { + evaluator = NewEvaluator("") + }) + for key := range jinjaTest { + foo := jinjaTest[key] + It("renders correctly `"+key+"`", func() { + templated := evaluator.TemplateMessages(foo["messages"].([]schema.Message), 
foo["config"].(*config.BackendConfig), foo["functions"].([]functions.Function), foo["shouldUseFn"].(bool)) + Expect(templated).To(Equal(foo["expected"]), templated) + }) + } + }) +}) diff --git a/tests/models_fixtures/rwkv.yaml b/tests/models_fixtures/rwkv.yaml index bf54394f..f66cfe21 100644 --- a/tests/models_fixtures/rwkv.yaml +++ b/tests/models_fixtures/rwkv.yaml @@ -14,6 +14,7 @@ roles: stopwords: - 'Assistant:' +- '' template: chat: |