Merge branch 'master' into lumina

Commit 505013ce66 by Ettore Di Giacinto, 2025-04-09 23:11:07 +02:00 (committed via GitHub)
43 changed files with 1169 additions and 388 deletions

.env

@ -29,6 +29,9 @@
## Enable/Disable single backend (useful if only one GPU is available)
# LOCALAI_SINGLE_ACTIVE_BACKEND=true
# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
## Specify a build type. Available: cublas, openblas, clblas.
## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.


@ -15,7 +15,7 @@ jobs:
strategy:
matrix:
include:
- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64'
runs-on: ${{matrix.runs-on}}


@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.0
uses: securego/gosec@v2.22.3
with:
# we let the report content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'


@ -6,7 +6,7 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=5dec47dcd411fdf815a3708fd6194e2b13d19006
CPPLLAMA_VERSION?=b32efad2bc42460637c3a364c9554ea8217b3d7f
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@ -809,7 +809,7 @@ docker-aio-all:
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@ -817,7 +817,7 @@ docker-image-intel:
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \


@ -50,7 +50,23 @@
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic...) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](https://github.com/user-attachments/assets/9841b1ee-88af-4b96-8ec0-41b17364efa7) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](https://github.com/user-attachments/assets/d729f6f4-0621-4715-bda3-35fe6e159524) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](https://github.com/user-attachments/assets/3cf0b918-ba8e-498a-a3cd-485db5984325) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](https://github.com/user-attachments/assets/6753d23d-218b-4e07-94b8-9e6c5a4f2311) |
| Chat Interface | API Overview |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](https://github.com/user-attachments/assets/048eab31-0f0c-4d52-a920-3715233f9bf3) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](https://github.com/user-attachments/assets/2540e8ce-1a2c-4c12-800c-763bd9be247f) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](https://github.com/user-attachments/assets/5af681b0-dd8e-4fe8-a234-a22f8a040547) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](https://github.com/user-attachments/assets/b9527176-63d6-4d2e-8ed1-7fde13a9b0ad) |
## Quickstart
Run the installer script:
@ -92,6 +108,8 @@ local-ai run oci://localai/phi-2:latest
## 📰 Latest project news
- Apr 2025: WebUI overhaul, AIO image updates
- Feb 2025: Backend cleanup, breaking changes, new backends (kokoro, OuteTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )


@ -2,7 +2,7 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)


@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
llama.cpp:
@ -73,8 +80,8 @@ grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc)
endif
cp llama.cpp/build/bin/grpc-server .
cp llama.cpp/build/bin/grpc-server .


@ -509,15 +509,15 @@ struct llama_server_context
bool load_model(const common_params &params_)
{
params = params_;
if (!params.mmproj.empty()) {
if (!params.mmproj.path.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params {
clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
/* use_gpu */ has_gpu,
/*verbosity=*/ 1,
/*verbosity=*/ GGML_LOG_LEVEL_INFO,
});
if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
return false;
}
@ -531,7 +531,7 @@ struct llama_server_context
ctx = common_init.context.release();
if (model == nullptr)
{
LOG_ERR("unable to load model: %s", params.model.c_str());
LOG_ERR("unable to load model: %s", params.model.path.c_str());
return false;
}
@ -2122,7 +2122,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
}
std::function<void(int)> shutdown_handler;
inline void signal_handler(int signal) { shutdown_handler(signal); }
inline void signal_handler(int signal) {
exit(1);
}
/////////////////////////////////
////////////////////////////////
@ -2322,11 +2326,11 @@ static void params_parse(const backend::ModelOptions* request,
// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
params.model = request->modelfile();
params.model.path = request->modelfile();
if (!request->mmproj().empty()) {
// get the directory of modelfile
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
params.mmproj = model_dir + "/"+ request->mmproj();
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.mmproj.path = model_dir + "/"+ request->mmproj();
}
// params.model_alias ??
params.model_alias = request->modelfile();
@ -2401,7 +2405,7 @@ static void params_parse(const backend::ModelOptions* request,
scale_factor = request->lorascale();
}
// get the directory of modelfile
std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
}
params.use_mlock = request->mlock();
@ -2649,6 +2653,20 @@ void RunServer(const std::string& server_address) {
int main(int argc, char** argv) {
std::string server_address("localhost:50051");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
// Define long and short options
struct option long_options[] = {
{"addr", required_argument, nullptr, 'a'},


@ -21,6 +21,7 @@ fi
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h


@ -16,7 +16,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
}


@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}()
}
if options.LoadToMemory != nil {
if options.LoadToMemory != nil && !options.SingleBackend {
for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
if err != nil {


@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
if err != nil {
return nil, err
}
defer loader.Close()
var fn func() ([]float32, error)
switch model := inferenceModel.(type) {

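The change above is part of a pattern repeated in the hunks that follow (image generation, chat inference, rerank, sound generation, TTS, VAD, transcription, stores): the loader is constructed with the single-active-backend flag, and each caller releases the backend with defer loader.Close() once its request finishes. Below is a minimal sketch of that flow, assuming the loader package ships at github.com/mudler/LocalAI/pkg/model and eliding the option plumbing normally built by backend.ModelOptions.

```go
package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model" // assumed import path for the loader package
)

func main() {
	// Second argument is the single-active-backend flag now threaded through
	// from LOCALAI_SINGLE_ACTIVE_BACKEND / appConfig.SingleBackend.
	loader := model.NewModelLoader("/models", true)
	defer func() { _ = loader.StopAllGRPC() }() // the CLI commands do this on exit

	// Backend options are normally built via backend.ModelOptions(backendConfig, appConfig);
	// they are elided here.
	client, err := loader.Load()
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	// Release the single active backend so the next request can load a different one.
	defer loader.Close()

	_ = client // run inference through the returned gRPC client here
}
```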

@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if err != nil {
return nil, err
}
defer loader.Close()
fn := func() error {
_, err := inferenceModel.GenerateImage(


@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if err != nil {
return nil, err
}
defer loader.Close()
var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages


@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.SingleBackend {
defOpts = append(defOpts, model.WithSingleActiveBackend())
}
if so.ParallelBackendRequests {
defOpts = append(defOpts, model.EnableParallelRequests)
}
@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word,
Word: t.Word,
})
}
@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
DisableLogStatus: c.DisableLogStatus,
DType: c.DType,
// LimitMMPerPrompt vLLM
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,


@ -12,10 +12,10 @@ import (
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...)
if err != nil {
return nil, err
}
defer loader.Close()
if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model")


@ -26,10 +26,10 @@ func SoundGeneration(
opts := ModelOptions(backendConfig, appConfig)
soundGenModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model")


@ -20,6 +20,7 @@ func TokenMetrics(
if err != nil {
return nil, err
}
defer loader.Close()
if model == nil {
return nil, fmt.Errorf("could not load model")


@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...)
if err != nil {
return schema.TokenizeResponse{}, err
}
defer loader.Close()
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s


@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
if err != nil {
return nil, err
}
defer ml.Close()
if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model")


@ -23,10 +23,10 @@ func ModelTTS(
) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
defer loader.Close()
if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)


@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
if err != nil {
return nil, err
}
defer ml.Close()
req := proto.VADRequest{
Audio: request.Audio,
}


@ -38,7 +38,7 @@ type RunCMD struct {
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`


@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()


@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
}
cl := config.NewBackendConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
return err
}


@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath,
}
ml := model.NewModelLoader(opts.ModelPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
defer func() {
err := ml.StopAllGRPC()


@ -389,16 +389,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Embeddings = &falseV
}
// Value passed by the top level are treated as default (no implicit defaults)
// defaults are set by the user
if ctx == 0 {
ctx = 1024
}
if cfg.ContextSize == nil {
cfg.ContextSize = &ctx
}
if threads == 0 {
// Threads can't be 0
threads = 4
@ -420,7 +410,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
cfg.Debug = &trueV
}
guessDefaultsFromFile(cfg, lo.modelPath)
guessDefaultsFromFile(cfg, lo.modelPath, ctx)
}
func (c *BackendConfig) Validate() bool {

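With the hard-coded context default removed above, guessDefaultsFromFile now receives the top-level ctx value and decides the fallback itself (see core/config/gguf.go below). A rough reconstruction of the resulting resolution order, using an illustrative helper name that does not exist in the codebase:

```go
package main

import "fmt"

// effectiveContextSize is an illustrative reconstruction of the new behaviour,
// not a function from the codebase: an explicit value in the model config wins,
// then the CLI/environment value, then the estimate read from the GGUF
// metadata, then the defaultContextSize fallback.
func effectiveContextSize(cfgCtx *int, defaultCtx int, ggufEstimate uint64) int {
	switch {
	case cfgCtx != nil:
		return *cfgCtx // set explicitly in the model YAML
	case defaultCtx != 0:
		return defaultCtx // LOCALAI_CONTEXT_SIZE / --context-size, no longer defaulted
	case ggufEstimate > 0:
		return int(ggufEstimate) // from f.EstimateLLaMACppUsage().ContextSize
	default:
		return 1024 // defaultContextSize in core/config/gguf.go
	}
}

func main() {
	fmt.Println(effectiveContextSize(nil, 0, 8192)) // GGUF metadata wins: 8192
	fmt.Println(effectiveContextSize(nil, 0, 0))    // nothing set anywhere: 1024
}
```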
core/config/gguf.go (new file)

@ -0,0 +1,253 @@
package config
import (
"strings"
"github.com/rs/zerolog/log"
gguf "github.com/thxcode/gguf-parser-go"
)
type familyType uint8
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
const (
defaultContextSize = 1024
)
type settingsConfig struct {
StopWords []string
TemplateConfig TemplateConfig
RepeatPenalty float64
}
// default settings to adopt with a given model family
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
Gemma: {
RepeatPenalty: 1.0,
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
TemplateConfig: TemplateConfig{
Chat: "{{.Input }}\n<start_of_turn>model\n",
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
Completion: "{{.Input}}",
},
},
DeepSeek2: {
StopWords: []string{"<end▁of▁sentence>"},
TemplateConfig: TemplateConfig{
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<endofsentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
Chat: "{{.Input -}}\nAssistant: ",
},
},
LLaMa3: {
StopWords: []string{"<|eot_id|>"},
TemplateConfig: TemplateConfig{
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
},
},
CommandR: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
},
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
},
Phi3: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input}}\n<|assistant|>",
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
Completion: "{{.Input}}",
},
StopWords: []string{"<|end|>", "<|endoftext|>"},
},
ChatML: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}\n<|im_start|>assistant",
Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
},
Mistral03: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}",
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
},
}
// this maps well known template used in HF to model families defined above
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
if defaultCtx == 0 && cfg.ContextSize == nil {
ctxSize := f.EstimateLLaMACppUsage().ContextSize
if ctxSize > 0 {
cSize := int(ctxSize)
cfg.ContextSize = &cSize
} else {
defaultCtx = defaultContextSize
cfg.ContextSize = &defaultCtx
}
}
if cfg.HasTemplate() {
// nothing to guess here
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
return
}
log.Debug().
Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
// guess the name
if cfg.Name == "" {
cfg.Name = f.Model().Name
}
family := identifyFamily(f)
if family == Unknown {
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
return
}
// identify template
settings, ok := defaultsSettings[family]
if ok {
cfg.TemplateConfig = settings.TemplateConfig
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
if len(cfg.StopWords) == 0 {
cfg.StopWords = settings.StopWords
}
if cfg.RepeatPenalty == 0.0 {
cfg.RepeatPenalty = settings.RepeatPenalty
}
} else {
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
}
if cfg.HasTemplate() {
return
}
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// try to use the jinja template
cfg.TemplateConfig.JinjaTemplate = true
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
}
}
func identifyFamily(f *gguf.GGUFFile) familyType {
// identify from well known templates first
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found && chatTemplate.ValueString() != "" {
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
return family
}
}
// otherwise try to identify from the model properties
arch := f.Architecture().Architecture
eosTokenID := f.Tokenizer().EOSTokenID
bosTokenID := f.Tokenizer().BOSTokenID
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
// WTF! Mistral0.3 and isYi have the same bosTokenID and eosTokenID
llama3 := arch == "llama" && eosTokenID == 128009
commandR := arch == "command-r" && eosTokenID == 255001
qwen2 := arch == "qwen2"
phi3 := arch == "phi-3"
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
deepseek2 := arch == "deepseek2"
switch {
case deepseek2:
return DeepSeek2
case gemma:
return Gemma
case llama3:
return LLaMa3
case commandR:
return CommandR
case phi3:
return Phi3
case qwen2, isYI:
return ChatML
default:
return Unknown
}
}
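Everything identifyFamily and guessGGUFFromFile key on (architecture, BOS/EOS token IDs, model name, chat template, estimated context size) is read straight from the GGUF header via gguf-parser-go. A small standalone sketch that dumps those fields for a local model file, using only the accessors that appear above; the file path is a placeholder:

```go
package main

import (
	"fmt"
	"log"

	gguf "github.com/thxcode/gguf-parser-go"
)

func main() {
	// Placeholder path; point it at any local GGUF model.
	f, err := gguf.ParseGGUFFile("/models/my-model.gguf")
	if err != nil {
		log.Fatalf("not a GGUF file: %v", err)
	}

	fmt.Println("name:        ", f.Model().Name)
	fmt.Println("architecture:", f.Architecture().Architecture)
	fmt.Println("bosTokenID:  ", f.Tokenizer().BOSTokenID)
	fmt.Println("eosTokenID:  ", f.Tokenizer().EOSTokenID)
	fmt.Println("context size:", f.EstimateLLaMACppUsage().ContextSize)

	// The raw Jinja chat template, when present, is what knownTemplates matches on.
	if tmpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
		fmt.Println("chat_template:", tmpl.ValueString())
	}
}
```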


@ -3,147 +3,12 @@ package config
import (
"os"
"path/filepath"
"strings"
"github.com/rs/zerolog/log"
gguf "github.com/thxcode/gguf-parser-go"
)
type familyType uint8
const (
Unknown familyType = iota
LLaMa3
CommandR
Phi3
ChatML
Mistral03
Gemma
DeepSeek2
)
type settingsConfig struct {
StopWords []string
TemplateConfig TemplateConfig
RepeatPenalty float64
}
// default settings to adopt with a given model family
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
Gemma: {
RepeatPenalty: 1.0,
StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
TemplateConfig: TemplateConfig{
Chat: "{{.Input }}\n<start_of_turn>model\n",
ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
Completion: "{{.Input}}",
},
},
DeepSeek2: {
StopWords: []string{"<end▁of▁sentence>"},
TemplateConfig: TemplateConfig{
ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
{{ end -}}
{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<endofsentence>{{end}}
{{if eq .RoleName "system" -}}{{.Content}}
{{end -}}`,
Chat: "{{.Input -}}\nAssistant: ",
},
},
LLaMa3: {
StopWords: []string{"<|eot_id|>"},
TemplateConfig: TemplateConfig{
Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
},
},
CommandR: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
},
StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
},
Phi3: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input}}\n<|assistant|>",
ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
Completion: "{{.Input}}",
},
StopWords: []string{"<|end|>", "<|endoftext|>"},
},
ChatML: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}\n<|im_start|>assistant",
Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
},
Mistral03: {
TemplateConfig: TemplateConfig{
Chat: "{{.Input -}}",
Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
},
StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
},
}
// this maps well known template used in HF to model families defined above
var knownTemplates = map[string]familyType{
`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
return
@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
return
}
if cfg.HasTemplate() {
// nothing to guess here
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
return
}
// We try to guess only if we don't have a template defined already
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
// try to parse the gguf file
f, err := gguf.ParseGGUFFile(guessPath)
if err != nil {
// Only valid for gguf files
log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
if err == nil {
guessGGUFFromFile(cfg, f, defaultCtx)
return
}
log.Debug().
Any("eosTokenID", f.Tokenizer().EOSTokenID).
Any("bosTokenID", f.Tokenizer().BOSTokenID).
Any("modelName", f.Model().Name).
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
// guess the name
if cfg.Name == "" {
cfg.Name = f.Model().Name
}
family := identifyFamily(f)
if family == Unknown {
log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
return
}
// identify template
settings, ok := defaultsSettings[family]
if ok {
cfg.TemplateConfig = settings.TemplateConfig
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
if len(cfg.StopWords) == 0 {
cfg.StopWords = settings.StopWords
if cfg.ContextSize == nil {
if defaultCtx == 0 {
defaultCtx = defaultContextSize
}
if cfg.RepeatPenalty == 0.0 {
cfg.RepeatPenalty = settings.RepeatPenalty
}
} else {
log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
}
if cfg.HasTemplate() {
return
}
// identify from well known templates first, otherwise use the raw jinja template
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found {
// try to use the jinja template
cfg.TemplateConfig.JinjaTemplate = true
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
}
}
func identifyFamily(f *gguf.GGUFFile) familyType {
// identify from well known templates first
chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
if found && chatTemplate.ValueString() != "" {
if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
return family
}
}
// otherwise try to identify from the model properties
arch := f.Architecture().Architecture
eosTokenID := f.Tokenizer().EOSTokenID
bosTokenID := f.Tokenizer().BOSTokenID
isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
// WTF! Mistral0.3 and isYi have the same bosTokenID and eosTokenID
llama3 := arch == "llama" && eosTokenID == 128009
commandR := arch == "command-r" && eosTokenID == 255001
qwen2 := arch == "qwen2"
phi3 := arch == "phi-3"
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
deepseek2 := arch == "deepseek2"
switch {
case deepseek2:
return DeepSeek2
case gemma:
return Gemma
case llama3:
return LLaMa3
case commandR:
return CommandR
case phi3:
return Phi3
case qwen2, isYI:
return ChatML
default:
return Unknown
cfg.ContextSize = &defaultCtx
}
}


@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
"id": modalName(m),
"tabindex": "-1",
"aria-hidden": "true",
"class": "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
"class": "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
},
elem.Div(
attrs.Props{
"class": "relative p-4 w-full max-w-2xl max-h-full",
"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
},
elem.Div(
attrs.Props{
"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
},
// header
elem.Div(
@ -164,14 +164,13 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
// body
elem.Div(
attrs.Props{
"class": "p-4 md:p-5 space-y-4",
"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
},
elem.Div(
attrs.Props{
"class": "flex justify-center items-center",
},
elem.Img(attrs.Props{
// "class": "rounded-t-lg object-fit object-center h-96",
"class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
"src": m.Icon,
"loading": "lazy",
@ -232,7 +231,6 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
),
),
)
}
func modelDescription(m *gallery.GalleryModel) elem.Node {


@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
vals := make([][]byte, len(input.Values))
for i, v := range input.Values {
@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
if err != nil {
return err
}
defer sl.Close()
if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
return err
@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
if err != nil {
return err
}
defer sl.Close()
keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
if err != nil {
@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
if err != nil {
return err
}
defer sl.Close()
keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
if err != nil {


@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
cl := &config.BackendConfigLoader{}
//configsDir := "/tmp/localai/configs"
modelPath := "/tmp/localai/model"
var ml = model.NewModelLoader(modelPath)
var ml = model.NewModelLoader(modelPath, false)
appConfig := &config.ApplicationConfig{
ConfigsDir: configsDir,


@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
router.Post("/v1/vad", vadChain...)
// Stores
sl := model.NewModelLoader("")
router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig))
router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig))
router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig))
router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig))
router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig))
if !appConfig.DisableMetrics {
router.Get("/metrics", localai.LocalAIMetricsEndpoint())


@ -1,3 +1,3 @@
{
"version": "v2.26.0"
"version": "v2.27.0"
}


@ -8,9 +8,7 @@ config_file: |
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
@ -25,11 +23,14 @@ config_file: |
{{.Input}}
function: |
<start_of_turn>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<end_of_turn>
{{.Input -}}
<start_of_turn>model

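The reworded function prompt above asks the model to reply with a single JSON object of the form {"name": ..., "parameters": {...}} and nothing else. A hedged sketch of consuming such a reply; the struct and sample tool name are illustrative and not LocalAI's actual parser:

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// functionCall mirrors the JSON shape the prompt asks the model to emit.
type functionCall struct {
	Name       string         `json:"name"`
	Parameters map[string]any `json:"parameters"`
}

func main() {
	// Example reply in the format requested by the template above (hypothetical tool name).
	reply := `{"name": "get_weather", "parameters": {"city": "Rome", "unit": "celsius"}}`

	var call functionCall
	if err := json.Unmarshal([]byte(reply), &call); err != nil {
		log.Fatalf("model did not return the expected JSON object: %v", err)
	}
	fmt.Printf("call %s with %v\n", call.Name, call.Parameters)
}
```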

@ -78,6 +78,60 @@
- filename: gemma-3-1b-it-Q4_K_M.gguf
sha256: 8ccc5cd1f1b3602548715ae25a66ed73fd5dc68a210412eea643eb20eb75a135
uri: huggingface://ggml-org/gemma-3-1b-it-GGUF/gemma-3-1b-it-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "gemma-3-12b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-12b-it
- https://huggingface.co/vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf
description: |
This model corresponds to the 12B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-12b-it-q4_0.gguf
files:
- filename: gemma-3-12b-it-q4_0.gguf
sha256: 6f1bb5f455414f7b46482bda51cbfdbf19786e21a5498c4403fdfc03d09b045c
uri: huggingface://vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf/gemma-3-12b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "gemma-3-4b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-4b-it
- https://huggingface.co/vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf
description: |
This model corresponds to the 4B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-4b-it-q4_0.gguf
files:
- filename: gemma-3-4b-it-q4_0.gguf
sha256: 2ca493d426ffcb43db27132f183a0230eda4a3621e58b328d55b665f1937a317
uri: huggingface://vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf/gemma-3-4b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "gemma-3-27b-it-qat"
urls:
- https://huggingface.co/google/gemma-3-27b-it
- https://huggingface.co/vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf
description: |
This model corresponds to the 27B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization.
Thanks to QAT, the model preserves quality similar to bfloat16 while significantly reducing the memory required to load the model.
You can find the half-precision version here.
overrides:
parameters:
model: gemma-3-27b-it-q4_0.gguf
files:
- filename: gemma-3-27b-it-q4_0.gguf
sha256: 45e586879bc5f5d7a5b6527e812952057ce916d9fc7ba16f7262ec9972c9e2a2
uri: huggingface://vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf/gemma-3-27b-it-q4_0.gguf
- !!merge <<: *gemma3
name: "qgallouedec_gemma-3-27b-it-codeforces-sft"
urls:
@ -386,6 +440,61 @@
- filename: Gemma-3-Starshine-12B.i1-Q4_K_M.gguf
sha256: 4c35a678e3784e20a8d85d4e7045d965509a1a71305a0da105fc5991ba7d6dc4
uri: huggingface://mradermacher/Gemma-3-Starshine-12B-i1-GGUF/Gemma-3-Starshine-12B.i1-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "burtenshaw_gemmacoder3-12b"
icon: https://cdn-uploads.huggingface.co/production/uploads/62d648291fa3e4e7ae3fa6e8/zkcBr2UZFDpALAsMdgbze.gif
urls:
- https://huggingface.co/burtenshaw/GemmaCoder3-12B
- https://huggingface.co/bartowski/burtenshaw_GemmaCoder3-12B-GGUF
description: |
This model is a fine-tuned version of google/gemma-3-12b-it on the open-r1/codeforces-cots dataset. It has been trained using TRL.
overrides:
parameters:
model: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
files:
- filename: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
sha256: 47f0a2848eeed783cb03336afd8cc69f6ee0e088e3cec11ab6d9fe16457dc3d4
uri: huggingface://bartowski/burtenshaw_GemmaCoder3-12B-GGUF/burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "tesslate_synthia-s1-27b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/zgFDl7UvWhiPYqdote7XT.png
urls:
- https://huggingface.co/Tesslate/Synthia-S1-27b
- https://huggingface.co/bartowski/Tesslate_Synthia-S1-27b-GGUF
description: |
Synthia-S1-27b is a reasoning AI model developed by Tesslate AI, fine-tuned specifically for advanced reasoning, coding, and RP use cases. Built upon the robust Gemma3 architecture, Synthia-S1-27b excels in logical reasoning, creative writing, and deep contextual understanding. It supports multimodal inputs (text and images) with a large 128K token context window, enabling complex analysis suitable for research, academic tasks, and enterprise-grade AI applications.
overrides:
parameters:
model: Tesslate_Synthia-S1-27b-Q4_K_M.gguf
files:
- filename: Tesslate_Synthia-S1-27b-Q4_K_M.gguf
sha256: d953bf7f802dc68f85a35360deb24b9a8b446af051e82c77f2f0759065d2aa71
uri: huggingface://bartowski/Tesslate_Synthia-S1-27b-GGUF/Tesslate_Synthia-S1-27b-Q4_K_M.gguf
- &llama4
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
license: llama4
tags:
- llm
- gguf
- gpu
- cpu
- llama3.3
name: "meta-llama_llama-4-scout-17b-16e-instruct"
urls:
- https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
- https://huggingface.co/bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF
description: |
The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding.
These Llama 4 models mark the beginning of a new era for the Llama ecosystem. We are launching two efficient models in the Llama 4 series, Llama 4 Scout, a 17 billion parameter model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts.
overrides:
parameters:
model: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
files:
- filename: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
sha256: 48dfc18d40691b4190b7fecf1f89b78cadc758c3a27a9e2a1cabd686fdb822e3
uri: huggingface://bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF/meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf
- &eurollm
name: "eurollm-9b-instruct"
icon: https://openeurollm.eu/_next/static/media/logo-dark.e7001867.svg
@ -1315,6 +1424,119 @@
- filename: Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
sha256: 817073c85286c25a9373f330aad32b503e6c13d626a3fbee926d96a7ab866845
uri: huggingface://bartowski/Sao10K_Llama-3.3-70B-Vulpecula-r1-GGUF/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
- !!merge <<: *llama33
name: "tarek07_legion-v2.1-llama-70b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64909c086073a0cd172d0411/mqajIk-EsgQ0ZVAZJ4trP.png
urls:
- https://huggingface.co/Tarek07/Legion-V2.1-LLaMa-70B
- https://huggingface.co/bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF
description: |
My biggest merge yet, consisting of a total of 20 specially curated models. My methodology in approaching this was to create 5 highly specialized models:
- A completely uncensored base
- A very intelligent model based on UGI, Willingness and NatInt scores on the UGI Leaderboard
- A highly descriptive writing model, specializing in creative and natural prose
- A RP model specially merged with fine-tuned models that use a lot of RP datasets
- The secret ingredient: A completely unhinged, uncensored final model
These five models went through a series of iterations until I got something I thought worked well and then combined them to make LEGION.
The full list of models used in this merge is below:
TheDrummer/Fallen-Llama-3.3-R1-70B-v1
Sao10K/Llama-3.3-70B-Vulpecula-r1
Sao10K/L3-70B-Euryale-v2.1
SicariusSicariiStuff/Negative_LLAMA_70B
allura-org/Bigger-Body-70b
Sao10K/70B-L3.3-mhnnn-x1
Sao10K/L3.3-70B-Euryale-v2.3
Doctor-Shotgun/L3.3-70B-Magnum-v4-SE
Sao10K/L3.1-70B-Hanami-x1
Sao10K/70B-L3.3-Cirrus-x1
EVA-UNIT-01/EVA-LLaMA-3.33-70B-v0.1
TheDrummer/Anubis-70B-v1
ArliAI/Llama-3.3-70B-ArliAI-RPMax-v1.4
LatitudeGames/Wayfarer-Large-70B-Llama-3.3
NeverSleep/Lumimaid-v0.2-70B
mlabonne/Hermes-3-Llama-3.1-70B-lorablated
ReadyArt/Forgotten-Safeword-70B-3.6
ReadyArt/Fallen-Abomination-70B-R1-v4.1
ReadyArt/Fallen-Safeword-70B-R1-v4.1
huihui-ai/Llama-3.3-70B-Instruct-abliterated
overrides:
parameters:
model: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
files:
- filename: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
sha256: 8f3de344ba83705f9491c2ed354fd6131a05946fca2eceae495b6fc67a7dbe7a
uri: huggingface://bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF/Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "forgotten-abomination-70b-v5.0"
icon: https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0/resolve/main/waifu5.webp
urls:
- https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0
- https://huggingface.co/mradermacher/Forgotten-Abomination-70B-v5.0-GGUF
description: |
The Unholy Union of Safeword and Nevoria
This model represents the forbidden merger of:
🧬 Forgotten-Safeword-70B-v5.0 - Industrial-grade depravity matrix with 23 distinct taboo specializations
⚗️ L3.3-MS-Nevoria-70b - SteelSkull's masterpiece of storytelling and character adherence, free of positivity bias and ethical constraints
Expect:
🔥 Unparalleled erotic roleplaying with the least Llama refusals you've ever seen
📖 Novel-quality prose that follows your character card with precision
🧠 Handles complex multi-character scenarios effortlessly
💀 Will gleefully explore any taboo subject without hesitation
overrides:
parameters:
model: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
files:
- filename: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
sha256: a5f5e712e66b855f36ff45175f20c24441fa942ca8af47bd6f49107c6e0f025d
uri: huggingface://mradermacher/Forgotten-Abomination-70B-v5.0-GGUF/Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
- !!merge <<: *llama33
name: "watt-ai_watt-tool-70b"
urls:
- https://huggingface.co/watt-ai/watt-tool-70B
- https://huggingface.co/bartowski/watt-ai_watt-tool-70B-GGUF
description: |
watt-tool-70B is a fine-tuned language model based on LLaMa-3.3-70B-Instruct, optimized for tool usage and multi-turn dialogue. It achieves state-of-the-art performance on the Berkeley Function-Calling Leaderboard (BFCL).
Model Description
This model is specifically designed to excel at complex tool usage scenarios that require multi-turn interactions, making it ideal for empowering platforms like Lupan, an AI-powered workflow building tool. By leveraging a carefully curated and optimized dataset, watt-tool-70B demonstrates superior capabilities in understanding user requests, selecting appropriate tools, and effectively utilizing them across multiple turns of conversation.
Target Application: AI Workflow Building as in https://lupan.watt.chat/ and Coze.
Key Features
Enhanced Tool Usage: Fine-tuned for precise and efficient tool selection and execution.
Multi-Turn Dialogue: Optimized for maintaining context and effectively utilizing tools across multiple turns of conversation, enabling more complex task completion.
State-of-the-Art Performance: Achieves top performance on the BFCL, demonstrating its capabilities in function calling and tool usage.
Based on LLaMa-3.1-70B-Instruct: Inherits the strong language understanding and generation capabilities of the base model.
overrides:
parameters:
model: watt-ai_watt-tool-70B-Q4_K_M.gguf
files:
- filename: watt-ai_watt-tool-70B-Q4_K_M.gguf
sha256: 93806a5482b9e40e50ffca7a72abe3414d384749cc9e3d378eab5db8a8154b18
uri: huggingface://bartowski/watt-ai_watt-tool-70B-GGUF/watt-ai_watt-tool-70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "deepcogito_cogito-v1-preview-llama-70b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
sha256: d1deaf80c649e2a9446463cf5e1f7c026583647f46e3940d2b405a57cc685225
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF/deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
- &rwkv
url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
name: "rwkv-6-world-7b"
@ -2428,6 +2650,27 @@
- filename: Eximius_Persona_5B.Q4_K_M.gguf
sha256: 8a8e7a0fa1068755322c51900e53423d795e57976b4d95982242cbec41141c7b
uri: huggingface://mradermacher/Eximius_Persona_5B-GGUF/Eximius_Persona_5B.Q4_K_M.gguf
- !!merge <<: *llama32
name: "deepcogito_cogito-v1-preview-llama-3b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
sha256: 726a0ef5f818b8d238f2844f3204848bea66fb9c172b8ae0f6dc51b7bc081dd5
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF/deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
- &qwen25
name: "qwen2.5-14b-instruct" ## Qwen2.5
icon: https://avatars.githubusercontent.com/u/141221163
@ -5219,6 +5462,401 @@
- filename: Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf
sha256: 11b2eb96a8a4d512fceb3344dccc694972801c964cf748d723fdf436bc368915
uri: huggingface://mradermacher/Qwen2.5-14B-Instruct-1M-Unalign-i1-GGUF/Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-32b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-32B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-32B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-32B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-32B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-32B-Q4_K_M.gguf
sha256: e52a2a0a877ce1de78f2ea472c9e3bc7a0c20d6998423e9d99a59175809d3a22
uri: huggingface://bartowski/Tesslate_Tessa-T1-32B-GGUF/Tesslate_Tessa-T1-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-14b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-14B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-14B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-14B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-14B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-14B-Q4_K_M.gguf
sha256: 1b35ff651b9c1e4538d10e3117390ae36094b6455a9f937a4f3ab72162125bca
uri: huggingface://bartowski/Tesslate_Tessa-T1-14B-GGUF/Tesslate_Tessa-T1-14B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-7B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-7B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-7B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-7B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-7B-Q4_K_M.gguf
sha256: 7968332d01b5479dee99aff7c9764b9e61c2a6d2828c266163596dd783bdee18
uri: huggingface://bartowski/Tesslate_Tessa-T1-7B-GGUF/Tesslate_Tessa-T1-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tesslate_tessa-t1-3b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png
urls:
- https://huggingface.co/Tesslate/Tessa-T1-3B
- https://huggingface.co/bartowski/Tesslate_Tessa-T1-3B-GGUF
description: |
Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-3B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence.
Model Highlights
React-specific Reasoning: Accurately generates functional and semantic React components.
Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems.
Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions.
overrides:
parameters:
model: Tesslate_Tessa-T1-3B-Q4_K_M.gguf
files:
- filename: Tesslate_Tessa-T1-3B-Q4_K_M.gguf
sha256: d6b9d31d78d36094cab2725a7df318f8f3556990df736a21998c952d9a6ee0bf
uri: huggingface://bartowski/Tesslate_Tessa-T1-3B-GGUF/Tesslate_Tessa-T1-3B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "chaoticneutrals_very_berry_qwen2_7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/626dfb8786671a29c715f8a9/1J817kx3zZccf5yvQYiGM.png
urls:
- https://huggingface.co/ChaoticNeutrals/Very_Berry_Qwen2_7B
- https://huggingface.co/bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF
description: |
It do the stuff.
overrides:
parameters:
model: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
files:
- filename: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
sha256: cbda41c638c23a3e8e9fb33c27ca0d0a0ee044b6813941a0017fd46369a35ec5
uri: huggingface://bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF/ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "galactic-qwen-14b-exp1"
icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/SjM3y5Qcr2RX6zC3GQxR3.png
urls:
- https://huggingface.co/prithivMLmods/Galactic-Qwen-14B-Exp1
- https://huggingface.co/mradermacher/Galactic-Qwen-14B-Exp1-GGUF
description: |
Galactic-Qwen-14B-Exp1 is based on the Qwen 2.5 14B modality architecture, designed to enhance the reasoning capabilities of 14B-parameter models. This model is optimized for general-purpose reasoning and answering, excelling in contextual understanding, logical deduction, and multi-step problem-solving. It has been fine-tuned using a long chain-of-thought reasoning model and specialized datasets to improve comprehension, structured responses, and conversational intelligence.
overrides:
parameters:
model: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
files:
- filename: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
sha256: 26e99578c341c879cc2676c4c7a45b6c0d00b30bd17c8ee7494fcc4092480ef0
uri: huggingface://mradermacher/Galactic-Qwen-14B-Exp1-GGUF/Galactic-Qwen-14B-Exp1.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "hammer2.0-7b"
urls:
- https://huggingface.co/MadeAgents/Hammer2.0-7b
- https://huggingface.co/Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF
description: |
Hammer2.0 is finetuned from the Qwen 2.5 series and Qwen 2.5 Coder series using function masking techniques. It is trained on the APIGen Function Calling Datasets (60,000 samples), supplemented by the xlam-irrelevance-7.5k set we generated. Hammer2.0 achieves exceptional performance across numerous function calling benchmarks. For more details, please refer to Hammer: Robust Function-Calling for On-Device Language Models via Function Masking and the Hammer GitHub repository.
overrides:
parameters:
model: hammer2.0-7b-q5_k_m.gguf
files:
- filename: hammer2.0-7b-q5_k_m.gguf
sha256: 3682843c857595765f0786cf24b3d501af96fe5d99a9fb2526bc7707e28bae1e
uri: huggingface://Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF/hammer2.0-7b-q5_k_m.gguf
- !!merge <<: *qwen25
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
name: "all-hands_openhands-lm-32b-v0.1"
urls:
- https://huggingface.co/all-hands/openhands-lm-32b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-32b-v0.1-GGUF
description: |
Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
sha256: f7c2311d3264cc1e021a21a319748a9c75b74ddebe38551786aa4053448e5e74
uri: huggingface://bartowski/all-hands_openhands-lm-32b-v0.1-GGUF/all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "all-hands_openhands-lm-7b-v0.1"
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
urls:
- https://huggingface.co/all-hands/openhands-lm-7b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-7b-v0.1-GGUF
description: |
This is a smaller 7B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
sha256: d50031b04bbdad714c004a0dc117c18d26a026297c236cda36089c20279b2ec1
uri: huggingface://bartowski/all-hands_openhands-lm-7b-v0.1-GGUF/all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "all-hands_openhands-lm-1.5b-v0.1"
icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true
urls:
- https://huggingface.co/all-hands/openhands-lm-1.5b-v0.1
- https://huggingface.co/bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF
description: |
This is a smaller 1.5B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. It is intended to be used for speculative decoding. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service.
Today, we are excited to introduce OpenHands LM, a new open coding model that:
Is open and available on Hugging Face, so you can download it and run it locally
Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU
Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified
Read below for more details and our future plans!
What is OpenHands LM?
OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process:
We used training data generated by OpenHands itself on a diverse set of open-source repositories
Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully
It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks
overrides:
parameters:
model: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
files:
- filename: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
sha256: 30abd7860c4eb5f2f51546389407b0064360862f64ea55cdf95f97c6e155b3c6
uri: huggingface://bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF/all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-7b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-7B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-7B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
sha256: 6fd603511076ffea3697c8a76d82c054781c5e11f134b937a66cedfc49b3d2c5
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-7B-GGUF/katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-1.5b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-1.5B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
sha256: 5bfcb72803745c374a90b0ceb60f347a8c7d1239960cce6a2d22cc1276236098
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF/katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "katanemo_arch-function-chat-3b"
urls:
- https://huggingface.co/katanemo/Arch-Function-Chat-3B
- https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-3B-GGUF
description: |
The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications.
In addition to function calling capabilities, this collection now offers:
Clarify & refine: Generates natural follow-up questions to collect missing information for function calling
Interpret & respond: Provides human-friendly responses based on function execution results
Context management: Maintains context in complex multi-turn interactions
Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway, an AI-native proxy for agents. For more details about the project, check out the GitHub README.
overrides:
parameters:
model: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
files:
- filename: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
sha256: f59dbef397bf1364b5f0a2c23a7f67c40ec63cc666036c4e7615fa7d79d4e1a0
uri: huggingface://bartowski/katanemo_Arch-Function-Chat-3B-GGUF/katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "open-thoughts_openthinker2-32b"
icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png
urls:
- https://huggingface.co/open-thoughts/OpenThinker2-32B
- https://huggingface.co/bartowski/open-thoughts_OpenThinker2-32B-GGUF
description: |
This model is a fine-tuned version of Qwen/Qwen2.5-32B-Instruct on the OpenThoughts2-1M dataset.
The OpenThinker2-32B model is the highest performing open-data model. This model improves upon our previous OpenThinker-32B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy.
overrides:
parameters:
model: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
files:
- filename: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
sha256: e9c7bf7cb349cfe07b4550759a3b4d7005834d0fa7580b23e483cbfeecd7a982
uri: huggingface://bartowski/open-thoughts_OpenThinker2-32B-GGUF/open-thoughts_OpenThinker2-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "open-thoughts_openthinker2-7b"
icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png
urls:
- https://huggingface.co/open-thoughts/OpenThinker2-7B
- https://huggingface.co/bartowski/open-thoughts_OpenThinker2-7B-GGUF
description: |
This model is a fine-tuned version of Qwen/Qwen2.5-7B-Instruct on the OpenThoughts2-1M dataset.
The OpenThinker2-7B model is the top 7B open-data reasoning model. It delivers performance comparable to state of the art 7B models like DeepSeek-R1-Distill-7B across a suite of tasks. This model improves upon our previous OpenThinker-7B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy.
overrides:
parameters:
model: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
files:
- filename: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
sha256: 481d785047d66ae2eeaf14650a9e659ec4f7766a6414b6c7e92854c944201734
uri: huggingface://bartowski/open-thoughts_OpenThinker2-7B-GGUF/open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "arliai_qwq-32b-arliai-rpr-v1"
icon: https://cdn-uploads.huggingface.co/production/uploads/6625f4a8a8d1362ebcc3851a/albSlnUy9dPVGVuLlsBua.jpeg
urls:
- https://huggingface.co/ArliAI/QwQ-32B-ArliAI-RpR-v1
- https://huggingface.co/bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF
description: |
RpR (RolePlay with Reasoning) is a new series of models from ArliAI. This series builds directly upon the successful dataset curation methodology and training methods developed for the RPMax series.
RpR models use the same curated, deduplicated RP and creative writing dataset used for RPMax, with a focus on variety to ensure high creativity and minimize cross-context repetition. Users familiar with RPMax will recognize the unique, non-repetitive writing style unlike other finetuned-for-RP models.
With the release of QwQ as the first high-performing open-source reasoning model that can be easily trained, it was clear that the available instruct and creative writing reasoning datasets contain only one response per example. This type of single-response dataset, when used for training reasoning models, causes degraded output quality in long multi-turn chats, which is why Arli AI decided to create a true RP model capable of long multi-turn chat with reasoning.
In order to create RpR, we first had to create the reasoning RP dataset by re-processing our existing known-good RPMax dataset into a reasoning dataset. This was possible by using the base QwQ Instruct model itself to create the reasoning process for every turn in the RPMax dataset conversation examples, which was then further refined to make sure the reasoning is in line with the actual response examples from the dataset.
Another important thing to get right is to make sure the model is trained on examples that present reasoning blocks in the same way it encounters them during inference, which is to say never seeing the reasoning blocks in its context. To achieve this, the training run was completed using axolotl with a manual, template-free segments dataset, so that the model is never trained to see the reasoning block in the context, just as it will be used during inference.
The result of training QwQ on this dataset with this method is consistently coherent and interesting output, even in long multi-turn RP chats. This is, as far as we know, the first correctly-trained reasoning model for RP and creative writing.
overrides:
parameters:
model: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
files:
- filename: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
sha256: b0f2ca8f62a5d021e20db40608a109713e9d23e75b68b3b71b7654c04d596dcf
uri: huggingface://bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF/ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "mensa-beta-14b-instruct-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/DyO5Fvqwvee-UM9QqgWZS.png
urls:
- https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct
- https://huggingface.co/mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF
description: |
weighted/imatrix quants of https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct
overrides:
parameters:
model: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
files:
- filename: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
sha256: 86ccd640d72dcf3129fdd5b94381a733a684672b22487784e388b2ee9de57760
uri: huggingface://mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF/Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "cogito-v1-preview-qwen-14B"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B
- https://huggingface.co/NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: cogito-v1-preview-qwen-14b-q4_k_m.gguf
files:
- filename: cogito-v1-preview-qwen-14b-q4_k_m.gguf
sha256: 42ddd667bac3e5f0989f52b3dca5767ed15d0e5077c6f537e4b3873862ff7096
uri: huggingface://NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF/cogito-v1-preview-qwen-14b-q4_k_m.gguf
- !!merge <<: *qwen25
name: "deepcogito_cogito-v1-preview-qwen-32b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
sha256: 985f2d49330090e64603309f7eb61030769f25a5da027ac0b0a740858d087ad8
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF/deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
icon: https://avatars.githubusercontent.com/u/153379578
@ -7357,6 +7995,27 @@
- filename: TextSynth-8B.i1-Q4_K_M.gguf
sha256: 9186a8cb3a797cd2cd5b2eeaee99808674d96731824a9ee45685bbf480ba56c3
uri: huggingface://mradermacher/TextSynth-8B-i1-GGUF/TextSynth-8B.i1-Q4_K_M.gguf
- !!merge <<: *llama31
name: "deepcogito_cogito-v1-preview-llama-8b"
icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B/resolve/main/images/deep-cogito-logo.png
urls:
- https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B
- https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF
description: |
The Cogito LLMs are instruction tuned generative models (text in/text out). All models are released under an open license for commercial use.
Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool calling capabilities than size equivalent counterparts.
In both standard and reasoning modes, Cogito v1-preview models outperform their size equivalent counterparts on common industry benchmarks.
Each model is trained in over 30 languages and supports a context length of 128k.
overrides:
parameters:
model: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
files:
- filename: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
sha256: 445173fb1dacef3fa0be49ebb4512b948fdb1434d86732de198424695b017b50
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF/deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "llama-3.3-magicalgirl-2.5-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/633e85093a17ab61de8d9073/FGK0qBGmELj6DEUxbbrdR.png
@ -7835,6 +8494,20 @@
- filename: Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
sha256: aed6bd5bb03b7bd886939237bc10ea6331d4feb5a3b6712e0c5474a778acf817
uri: huggingface://mradermacher/Fallen-Safeword-70B-R1-v4.1-GGUF/Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "agentica-org_deepcoder-14b-preview"
urls:
- https://huggingface.co/agentica-org/DeepCoder-14B-Preview
- https://huggingface.co/bartowski/agentica-org_DeepCoder-14B-Preview-GGUF
description: |
DeepCoder-14B-Preview is a code reasoning LLM fine-tuned from DeepSeek-R1-Distilled-Qwen-14B using distributed reinforcement learning (RL) to scale up to long context lengths. The model achieves 60.6% Pass@1 accuracy on LiveCodeBench v5 (8/1/24-2/1/25), representing an 8% improvement over the base model (53%) and achieving performance similar to OpenAI's o3-mini with just 14B parameters.
overrides:
parameters:
model: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
files:
- filename: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
sha256: 38f0f777de3116ca27d10ec84388b3290a1bf3f7db8c5bdc1f92d100e4231870
uri: huggingface://bartowski/agentica-org_DeepCoder-14B-Preview-GGUF/agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
- &qwen2
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ## Start QWEN2
name: "qwen2-7b-instruct"
@ -9149,6 +9822,21 @@
- filename: BlackSheep-24B.i1-Q4_K_M.gguf
sha256: 95ae096eca05a95591254babf81b4d5617ceebbe8eda04c6cf8968ef4a69fc80
uri: huggingface://mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "eurydice-24b-v2-i1"
icon: https://cdn-uploads.huggingface.co/production/uploads/652c2a63d78452c4742cd3d3/Hm_tg4s0D6yWmtrTHII32.png
urls:
- https://huggingface.co/aixonlab/Eurydice-24b-v2
- https://huggingface.co/mradermacher/Eurydice-24b-v2-i1-GGUF
description: |
Eurydice 24b v2 is designed to be the perfect companion for multi-role conversations. It demonstrates exceptional contextual understanding and excels in creativity, natural conversation and storytelling. Built on Mistral 3.1, this model has been trained on a custom dataset specifically crafted to enhance its capabilities.
overrides:
parameters:
model: Eurydice-24b-v2.i1-Q4_K_M.gguf
files:
- filename: Eurydice-24b-v2.i1-Q4_K_M.gguf
sha256: fb4104a1b33dd860e1eca3b6906a10cacc5b91a2534db72d9749652a204fbcbf
uri: huggingface://mradermacher/Eurydice-24b-v2-i1-GGUF/Eurydice-24b-v2.i1-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
name: "LocalAI-llama3-8b-function-call-v0.2"

View file

@ -74,10 +74,9 @@ Version: ${version}
),
kong.UsageOnError(),
kong.Vars{
"basepath": kong.ExpandPath("."),
"remoteLibraryURL": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml",
"galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
"version": internal.PrintableVersion(),
"basepath": kong.ExpandPath("."),
"galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`,
"version": internal.PrintableVersion(),
},
)
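The CLI now ships only the galleries default; the remoteLibraryURL variable and the embedded model library it pointed to are dropped. For context, that default is a small JSON array of gallery descriptors. Below is a minimal sketch of decoding such a value, assuming an illustrative galleryEntry struct rather than LocalAI's actual gallery type:

package main

import (
	"encoding/json"
	"fmt"
)

// galleryEntry mirrors the shape of the "galleries" default above.
// It is illustrative only; the real LocalAI type may differ.
type galleryEntry struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

func main() {
	// Same literal as the kong.Vars default shown in this diff.
	defaultGalleries := `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`

	var galleries []galleryEntry
	if err := json.Unmarshal([]byte(defaultGalleries), &galleries); err != nil {
		panic(err)
	}
	for _, g := range galleries {
		fmt.Printf("gallery %q -> %s\n", g.Name, g.URL)
	}
}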

View file

@ -473,8 +473,6 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
backend = realBackend
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
var backendToConsume string
switch backend {
@ -497,17 +495,37 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
}
func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
if !singleActiveBackend {
return
}
// If we can have only one backend active, kill all the others (except external backends)
if singleActiveBackend {
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
err := ml.StopGRPC(allExcept(modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
// Stop all backends except the one we are going to load
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
err := ml.StopGRPC(allExcept(modelID))
if err != nil {
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
}
}
func (ml *ModelLoader) Close() {
if !ml.singletonMode {
return
}
ml.singletonLock.Unlock()
}
func (ml *ModelLoader) lockBackend() {
if !ml.singletonMode {
return
}
ml.singletonLock.Lock()
}
func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
ml.lockBackend() // grab the singleton lock if needed
o := NewOptions(opts...)
// Return earlier if we have a model already loaded
@ -518,17 +536,20 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
return m.GRPC(o.parallelRequests, ml.wd), nil
}
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
ml.stopActiveBackends(o.modelID, ml.singletonMode)
// if a backend is defined, return the loader directly
if o.backendString != "" {
return ml.backendLoader(opts...)
}
// Otherwise scan for backends in the asset directory
var err error
// get backends embedded in the binary
autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
if err != nil {
ml.Close() // we failed, release the lock
return nil, err
}
@ -560,5 +581,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
}
}
ml.Close() // make sure to release the lock in case of failure
return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
}
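The single-active-backend behaviour now hinges on a dedicated singleton lock: lockBackend grabs it at the top of Load when singletonMode is set, Close releases it on every failure path, and callers release it once they are done with the backend. A minimal sketch of that pattern follows; the types are standalone stand-ins, not the actual ModelLoader:

package main

import (
	"fmt"
	"sync"
)

// loader is a stripped-down stand-in for ModelLoader, only to
// illustrate the singleton-lock flow introduced in this commit.
type loader struct {
	singletonMode bool       // corresponds to the single-active-backend flag
	singletonLock sync.Mutex // held while a single backend is active
}

func (l *loader) lockBackend() {
	if !l.singletonMode {
		return
	}
	l.singletonLock.Lock()
}

// Close releases the singleton lock; Load calls it on failure and
// callers call it once the backend is no longer needed.
func (l *loader) Close() {
	if !l.singletonMode {
		return
	}
	l.singletonLock.Unlock()
}

func (l *loader) Load(model string) (string, error) {
	l.lockBackend() // serialize loads when only one backend may run
	backend, err := start(model)
	if err != nil {
		l.Close() // failure: release the lock so the next load can proceed
		return "", err
	}
	return backend, nil
}

func start(model string) (string, error) { return "backend-for-" + model, nil }

func main() {
	l := &loader{singletonMode: true}
	b, err := l.Load("phi-2")
	if err != nil {
		panic(err)
	}
	defer l.Close() // release once done, allowing the next model to load
	fmt.Println(b)
}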

View file

@ -18,16 +18,19 @@ import (
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
type ModelLoader struct {
ModelPath string
mu sync.Mutex
models map[string]*Model
wd *WatchDog
ModelPath string
mu sync.Mutex
singletonLock sync.Mutex
singletonMode bool
models map[string]*Model
wd *WatchDog
}
func NewModelLoader(modelPath string) *ModelLoader {
func NewModelLoader(modelPath string, singleActiveBackend bool) *ModelLoader {
nml := &ModelLoader{
ModelPath: modelPath,
models: make(map[string]*Model),
ModelPath: modelPath,
models: make(map[string]*Model),
singletonMode: singleActiveBackend,
}
return nml
@ -142,26 +145,6 @@ func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string,
func (ml *ModelLoader) ShutdownModel(modelName string) error {
ml.mu.Lock()
defer ml.mu.Unlock()
model, ok := ml.models[modelName]
if !ok {
return fmt.Errorf("model %s not found", modelName)
}
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", modelName)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
break
}
}
return ml.deleteProcess(modelName)
}

View file

@ -17,10 +17,9 @@ type Options struct {
externalBackends map[string]string
grpcAttempts int
grpcAttemptsDelay int
singleActiveBackend bool
parallelRequests bool
grpcAttempts int
grpcAttemptsDelay int
parallelRequests bool
}
type Option func(*Options)
@ -88,12 +87,6 @@ func WithContext(ctx context.Context) Option {
}
}
func WithSingleActiveBackend() Option {
return func(o *Options) {
o.singleActiveBackend = true
}
}
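With WithSingleActiveBackend removed, the flag moves from a per-call option to the NewModelLoader constructor shown earlier in this commit. A hedged before/after sketch of a call site (paths and values are illustrative):

// Before this commit: the flag was passed as a load-time option.
//   ml := model.NewModelLoader("/models")
//   backend, err := ml.Load(model.WithModel("test"), model.WithSingleActiveBackend())
//
// After this commit: the flag is fixed at construction time and the
// loader's singleton lock is released via Close when the backend is done.
//   ml := model.NewModelLoader("/models", true) // true == single active backend
//   backend, err := ml.Load(model.WithModel("test"))
//   ...
//   ml.Close()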
func WithModelID(id string) Option {
return func(o *Options) {
o.modelID = id

View file

@ -21,7 +21,7 @@ var _ = Describe("ModelLoader", func() {
// Setup the model loader with a test directory
modelPath = "/tmp/test_model_path"
os.Mkdir(modelPath, 0755)
modelLoader = model.NewModelLoader(modelPath)
modelLoader = model.NewModelLoader(modelPath, false)
})
AfterEach(func() {

View file

@ -9,25 +9,43 @@ import (
"strconv"
"strings"
"syscall"
"time"
"github.com/hpcloud/tail"
process "github.com/mudler/go-processmanager"
"github.com/rs/zerolog/log"
)
var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true"
func (ml *ModelLoader) deleteProcess(s string) error {
model, ok := ml.models[s]
if !ok {
log.Debug().Msgf("Model %s not found", s)
return fmt.Errorf("model %s not found", s)
}
defer delete(ml.models, s)
retries := 1
for model.GRPC(false, ml.wd).IsBusy() {
log.Debug().Msgf("%s busy. Waiting.", s)
dur := time.Duration(retries*2) * time.Second
if dur > retryTimeout {
dur = retryTimeout
}
time.Sleep(dur)
retries++
if retries > 10 && forceBackendShutdown {
log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", s, retries)
break
}
}
log.Debug().Msgf("Deleting process %s", s)
m, exists := ml.models[s]
if !exists {
log.Error().Msgf("Model does not exist %s", s)
// Nothing to do
return nil
}
process := m.Process()
process := model.Process()
if process == nil {
log.Error().Msgf("No process for %s", s)
// Nothing to do as there is no process
@ -44,9 +62,12 @@ func (ml *ModelLoader) deleteProcess(s string) error {
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
var err error = nil
ml.mu.Lock()
defer ml.mu.Unlock()
for k, m := range ml.models {
if filter(k, m.Process()) {
e := ml.ShutdownModel(k)
e := ml.deleteProcess(k)
err = errors.Join(err, e)
}
}
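The busy-wait that previously lived in ShutdownModel now runs inside deleteProcess: the loader polls IsBusy with a linearly growing sleep capped at retryTimeout, and after 10 retries it only breaks out when LOCALAI_FORCE_BACKEND_SHUTDOWN=true. A minimal sketch of that wait policy in isolation (the helper name, cap value, and callback are illustrative):

package main

import (
	"fmt"
	"time"
)

const retryTimeout = 30 * time.Second // illustrative cap; the real value lives in the model package

// waitUntilIdle mirrors the loop added to deleteProcess: back off while the
// backend is busy, cap the sleep at retryTimeout, and give up after 10
// retries only when forced shutdown is enabled.
func waitUntilIdle(isBusy func() bool, force bool) {
	retries := 1
	for isBusy() {
		dur := time.Duration(retries*2) * time.Second
		if dur > retryTimeout {
			dur = retryTimeout
		}
		fmt.Printf("backend busy, sleeping %s (retry %d)\n", dur, retries)
		time.Sleep(dur)
		retries++
		if retries > 10 && force {
			fmt.Println("still busy after 10 retries, forcing shutdown")
			break
		}
	}
}

func main() {
	calls := 0
	busy := func() bool { calls++; return calls < 3 } // busy for the first two polls
	waitUntilIdle(busy, true)
}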

View file

@ -70,7 +70,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
model.WithModel("test"),
}
sl = model.NewModelLoader("")
sl = model.NewModelLoader("", false)
sc, err = sl.Load(storeOpts...)
Expect(err).ToNot(HaveOccurred())
Expect(sc).ToNot(BeNil())
@ -235,7 +235,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
keys := [][]float32{{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {-1.0, 0.0, 0.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@ -247,7 +247,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
keys := [][]float32{{1.0, 0.0, 1.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, -1.0}, {-1.0, 0.0, -1.0}}
vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
_, _, sims, err := store.Find(context.Background(), sc, keys[0], 4)
@ -314,7 +314,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
normalize(keys[6:])
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)
@ -341,7 +341,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs"
c += 1
}
err := store.SetCols(context.Background(), sc, keys, vals);
err := store.SetCols(context.Background(), sc, keys, vals)
Expect(err).ToNot(HaveOccurred())
expectTriangleEq(keys, vals)