From c965197d6fe21125bd03f2409733f59afe1105bc Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Fri, 28 Mar 2025 22:37:18 +0100
Subject: [PATCH 01/59] chore: :arrow_up: Update ggml-org/llama.cpp to
 `b4ae50810e4304d052e630784c14bde7e79e4132` (#5085)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index ac164186..397f2e63 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-CPPLLAMA_VERSION?=5dec47dcd411fdf815a3708fd6194e2b13d19006
+CPPLLAMA_VERSION?=b4ae50810e4304d052e630784c14bde7e79e4132
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp

From 22d5727089c0725d68c921b4d87c9c6529c66cf4 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sat, 29 Mar 2025 11:27:06 +0100
Subject: [PATCH 02/59] chore(model gallery): add tarek07_legion-v2.1-llama-70b
 (#5087)

---
 gallery/index.yaml | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index e4c57fa2..c60a946a 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1315,6 +1315,48 @@
     - filename: Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
       sha256: 817073c85286c25a9373f330aad32b503e6c13d626a3fbee926d96a7ab866845
       uri: huggingface://bartowski/Sao10K_Llama-3.3-70B-Vulpecula-r1-GGUF/Sao10K_Llama-3.3-70B-Vulpecula-r1-Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "tarek07_legion-v2.1-llama-70b"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/64909c086073a0cd172d0411/mqajIk-EsgQ0ZVAZJ4trP.png
+  urls:
+    - https://huggingface.co/Tarek07/Legion-V2.1-LLaMa-70B
+    - https://huggingface.co/bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF
+  description: |
+    My biggest merge yet, consisting of a total of 20 specially curated models. My methodology in approaching this was to create 5 highly specialized models:
+
+    - A completely uncensored base
+    - A very intelligent model based on UGI, Willingness and NatInt scores on the UGI Leaderboard
+    - A highly descriptive writing model, specializing in creative and natural prose
+    - A RP model specially merged with fine-tuned models that use a lot of RP datasets
+    - The secret ingredient: A completely unhinged, uncensored final model
+
+    These five models went through a series of iterations until I got something I thought worked well and then combined them to make LEGION.
+ + The full list of models used in this merge is below: + + TheDrummer/Fallen-Llama-3.3-R1-70B-v1 + Sao10K/Llama-3.3-70B-Vulpecula-r1 + Sao10K/L3-70B-Euryale-v2.1 + SicariusSicariiStuff/Negative_LLAMA_70B + allura-org/Bigger-Body-70b + Sao10K/70B-L3.3-mhnnn-x1 + Sao10K/L3.3-70B-Euryale-v2.3 + Doctor-Shotgun/L3.3-70B-Magnum-v4-SE + Sao10K/L3.1-70B-Hanami-x1 + Sao10K/70B-L3.3-Cirrus-x1 + EVA-UNIT-01/EVA-LLaMA-3.33-70B-v0.1 + TheDrummer/Anubis-70B-v1 + ArliAI/Llama-3.3-70B-ArliAI-RPMax-v1.4 + LatitudeGames/Wayfarer-Large-70B-Llama-3.3 + NeverSleep/Lumimaid-v0.2-70B + mlabonne/Hermes-3-Llama-3.1-70B-lorablated + ReadyArt/Forgotten-Safeword-70B-3.6 + ReadyArt/Fallen-Abomination-70B-R1-v4.1 + ReadyArt/Fallen-Safeword-70B-R1-v4.1 + huihui-ai/Llama-3.3-70B-Instruct-abliterated + overrides: + parameters: + model: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf + files: + - filename: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf + sha256: 8f3de344ba83705f9491c2ed354fd6131a05946fca2eceae495b6fc67a7dbe7a + uri: huggingface://bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF/Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From 5d261a6fcd474a6a0e699e2e3a9861f42056353f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 11:53:47 +0100 Subject: [PATCH 03/59] chore(model gallery): add tesslate_tessa-t1-32b (#5088) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index c60a946a..8d325252 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5261,6 +5261,26 @@ - filename: Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf sha256: 11b2eb96a8a4d512fceb3344dccc694972801c964cf748d723fdf436bc368915 uri: huggingface://mradermacher/Qwen2.5-14B-Instruct-1M-Unalign-i1-GGUF/Qwen2.5-14B-Instruct-1M-Unalign.i1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "tesslate_tessa-t1-32b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png + urls: + - https://huggingface.co/Tesslate/Tessa-T1-32B + - https://huggingface.co/bartowski/Tesslate_Tessa-T1-32B-GGUF + description: | + Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-32B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence. + Model Highlights + + React-specific Reasoning: Accurately generates functional and semantic React components. + Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems. + Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions. 
+ overrides: + parameters: + model: Tesslate_Tessa-T1-32B-Q4_K_M.gguf + files: + - filename: Tesslate_Tessa-T1-32B-Q4_K_M.gguf + sha256: e52a2a0a877ce1de78f2ea472c9e3bc7a0c20d6998423e9d99a59175809d3a22 + uri: huggingface://bartowski/Tesslate_Tessa-T1-32B-GGUF/Tesslate_Tessa-T1-32B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 8faf39d34ee8dc117c34018b81983c7f63c4599e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 11:58:39 +0100 Subject: [PATCH 04/59] chore(model gallery): add tesslate_tessa-t1-14b (#5090) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 8d325252..866fc577 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5281,6 +5281,26 @@ - filename: Tesslate_Tessa-T1-32B-Q4_K_M.gguf sha256: e52a2a0a877ce1de78f2ea472c9e3bc7a0c20d6998423e9d99a59175809d3a22 uri: huggingface://bartowski/Tesslate_Tessa-T1-32B-GGUF/Tesslate_Tessa-T1-32B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "tesslate_tessa-t1-14b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png + urls: + - https://huggingface.co/Tesslate/Tessa-T1-14B + - https://huggingface.co/bartowski/Tesslate_Tessa-T1-14B-GGUF + description: | + Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-14B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence. + Model Highlights + + React-specific Reasoning: Accurately generates functional and semantic React components. + Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems. + Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions. 
+ overrides: + parameters: + model: Tesslate_Tessa-T1-14B-Q4_K_M.gguf + files: + - filename: Tesslate_Tessa-T1-14B-Q4_K_M.gguf + sha256: 1b35ff651b9c1e4538d10e3117390ae36094b6455a9f937a4f3ab72162125bca + uri: huggingface://bartowski/Tesslate_Tessa-T1-14B-GGUF/Tesslate_Tessa-T1-14B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 699519d1feca42e34f17ac49d3c5d7fdee187347 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 12:12:01 +0100 Subject: [PATCH 05/59] chore(model gallery): add tesslate_tessa-t1-7b (#5091) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 866fc577..e1b07470 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5301,6 +5301,26 @@ - filename: Tesslate_Tessa-T1-14B-Q4_K_M.gguf sha256: 1b35ff651b9c1e4538d10e3117390ae36094b6455a9f937a4f3ab72162125bca uri: huggingface://bartowski/Tesslate_Tessa-T1-14B-GGUF/Tesslate_Tessa-T1-14B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "tesslate_tessa-t1-7b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png + urls: + - https://huggingface.co/Tesslate/Tessa-T1-7B + - https://huggingface.co/bartowski/Tesslate_Tessa-T1-7B-GGUF + description: | + Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-7B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence. + Model Highlights + + React-specific Reasoning: Accurately generates functional and semantic React components. + Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems. + Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions. 
+ overrides: + parameters: + model: Tesslate_Tessa-T1-7B-Q4_K_M.gguf + files: + - filename: Tesslate_Tessa-T1-7B-Q4_K_M.gguf + sha256: 7968332d01b5479dee99aff7c9764b9e61c2a6d2828c266163596dd783bdee18 + uri: huggingface://bartowski/Tesslate_Tessa-T1-7B-GGUF/Tesslate_Tessa-T1-7B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 77d7dc62c42f80c6c958d43d28104d4494cecec3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 12:15:28 +0100 Subject: [PATCH 06/59] chore(model gallery): add tesslate_tessa-t1-3b (#5092) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index e1b07470..ad52f97b 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5321,6 +5321,26 @@ - filename: Tesslate_Tessa-T1-7B-Q4_K_M.gguf sha256: 7968332d01b5479dee99aff7c9764b9e61c2a6d2828c266163596dd783bdee18 uri: huggingface://bartowski/Tesslate_Tessa-T1-7B-GGUF/Tesslate_Tessa-T1-7B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "tesslate_tessa-t1-3b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/I7XzH-NMKUshcGU86u6VA.png + urls: + - https://huggingface.co/Tesslate/Tessa-T1-3B + - https://huggingface.co/bartowski/Tesslate_Tessa-T1-3B-GGUF + description: | + Tessa-T1 is an innovative transformer-based React reasoning model, fine-tuned from the powerful Qwen2.5-Coder-3B-Instruct base model. Designed specifically for React frontend development, Tessa-T1 leverages advanced reasoning to autonomously generate well-structured, semantic React components. Its integration into agent systems makes it a powerful tool for automating web interface development and frontend code intelligence. + Model Highlights + + React-specific Reasoning: Accurately generates functional and semantic React components. + Agent Integration: Seamlessly fits into AI-driven coding agents and autonomous frontend systems. + Context-Aware Generation: Effectively understands and utilizes UI context to provide relevant code solutions. 
+ overrides: + parameters: + model: Tesslate_Tessa-T1-3B-Q4_K_M.gguf + files: + - filename: Tesslate_Tessa-T1-3B-Q4_K_M.gguf + sha256: d6b9d31d78d36094cab2725a7df318f8f3556990df736a21998c952d9a6ee0bf + uri: huggingface://bartowski/Tesslate_Tessa-T1-3B-GGUF/Tesslate_Tessa-T1-3B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 679ee7bea4a2f23d1822f631babf1c378e5191c9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 12:34:49 +0100 Subject: [PATCH 07/59] chore(model gallery): add chaoticneutrals_very_berry_qwen2_7b (#5093) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index ad52f97b..f881b0d8 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5341,6 +5341,21 @@ - filename: Tesslate_Tessa-T1-3B-Q4_K_M.gguf sha256: d6b9d31d78d36094cab2725a7df318f8f3556990df736a21998c952d9a6ee0bf uri: huggingface://bartowski/Tesslate_Tessa-T1-3B-GGUF/Tesslate_Tessa-T1-3B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "chaoticneutrals_very_berry_qwen2_7b" + icon: https://cdn-uploads.huggingface.co/production/uploads/626dfb8786671a29c715f8a9/1J817kx3zZccf5yvQYiGM.png + urls: + - https://huggingface.co/ChaoticNeutrals/Very_Berry_Qwen2_7B + - https://huggingface.co/bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF + description: | + It do the stuff. + overrides: + parameters: + model: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf + files: + - filename: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf + sha256: cbda41c638c23a3e8e9fb33c27ca0d0a0ee044b6813941a0017fd46369a35ec5 + uri: huggingface://bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF/ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 9c74d74f7bb0d05de4dd42ae2a8976eb06fe865c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 29 Mar 2025 14:42:14 +0100 Subject: [PATCH 08/59] feat(gguf): guess default context size from file (#5089) feat(gguf): guess default config file from files Signed-off-by: Ettore Di Giacinto --- core/cli/run.go | 2 +- core/config/backend_config.go | 12 +- core/config/gguf.go | 253 ++++++++++++++++++++++++++++++++++ core/config/guesser.go | 239 ++------------------------------ 4 files changed, 264 insertions(+), 242 deletions(-) create mode 100644 core/config/gguf.go diff --git a/core/cli/run.go b/core/cli/run.go index 3162ef14..b245da67 100644 --- a/core/cli/run.go +++ b/core/cli/run.go @@ -38,7 +38,7 @@ type RunCMD struct { F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"` Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. 
Usage of the number of physical cores in the system is suggested" group:"performance"` - ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"` + ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"` Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"` CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"` diff --git a/core/config/backend_config.go b/core/config/backend_config.go index 56ffa38c..f7a6897c 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -389,16 +389,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) { cfg.Embeddings = &falseV } - // Value passed by the top level are treated as default (no implicit defaults) - // defaults are set by the user - if ctx == 0 { - ctx = 1024 - } - - if cfg.ContextSize == nil { - cfg.ContextSize = &ctx - } - if threads == 0 { // Threads can't be 0 threads = 4 @@ -420,7 +410,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) { cfg.Debug = &trueV } - guessDefaultsFromFile(cfg, lo.modelPath) + guessDefaultsFromFile(cfg, lo.modelPath, ctx) } func (c *BackendConfig) Validate() bool { diff --git a/core/config/gguf.go b/core/config/gguf.go new file mode 100644 index 00000000..cf9eacaa --- /dev/null +++ b/core/config/gguf.go @@ -0,0 +1,253 @@ +package config + +import ( + "strings" + + "github.com/rs/zerolog/log" + + gguf "github.com/thxcode/gguf-parser-go" +) + +type familyType uint8 + +const ( + Unknown familyType = iota + LLaMa3 + CommandR + Phi3 + ChatML + Mistral03 + Gemma + DeepSeek2 +) + +const ( + defaultContextSize = 1024 +) + +type settingsConfig struct { + StopWords []string + TemplateConfig TemplateConfig + RepeatPenalty float64 +} + +// default settings to adopt with a given model family +var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ + Gemma: { + RepeatPenalty: 1.0, + StopWords: []string{"<|im_end|>", "", ""}, + TemplateConfig: TemplateConfig{ + Chat: "{{.Input }}\nmodel\n", + ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", + Completion: "{{.Input}}", + }, + }, + DeepSeek2: { + StopWords: []string{"<|end▁of▁sentence|>"}, + TemplateConfig: TemplateConfig{ + ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }} +{{ end -}} +{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}} +{{if eq .RoleName "system" -}}{{.Content}} +{{end -}}`, + Chat: "{{.Input -}}\nAssistant: ", + }, + }, + LLaMa3: { + StopWords: []string{"<|eot_id|>"}, + TemplateConfig: TemplateConfig{ + Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>", + ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>", + }, + }, + CommandR: { + TemplateConfig: TemplateConfig{ + Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", + Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> +You are a function calling AI model, you can call the following functions: +## Available Tools +{{range .Functions}} +- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }} +{{end}} +When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}} 
+<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`, + ChatMessage: `{{if eq .RoleName "user" -}} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> +{{- else if eq .RoleName "system" -}} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> +{{- else if eq .RoleName "assistant" -}} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> +{{- else if eq .RoleName "tool" -}} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> +{{- else if .FunctionCall -}} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|> +{{- end -}}`, + }, + StopWords: []string{"<|END_OF_TURN_TOKEN|>"}, + }, + Phi3: { + TemplateConfig: TemplateConfig{ + Chat: "{{.Input}}\n<|assistant|>", + ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>", + Completion: "{{.Input}}", + }, + StopWords: []string{"<|end|>", "<|endoftext|>"}, + }, + ChatML: { + TemplateConfig: TemplateConfig{ + Chat: "{{.Input -}}\n<|im_start|>assistant", + Functions: `<|im_start|>system +You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: +{{range .Functions}} +{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} +{{end}} +For each function call return a json object with function name and arguments +<|im_end|> +{{.Input -}} +<|im_start|>assistant`, + ChatMessage: `<|im_start|>{{ .RoleName }} +{{ if .FunctionCall -}} +Function call: +{{ else if eq .RoleName "tool" -}} +Function response: +{{ end -}} +{{ if .Content -}} +{{.Content }} +{{ end -}} +{{ if .FunctionCall -}} +{{toJson .FunctionCall}} +{{ end -}}<|im_end|>`, + }, + StopWords: []string{"<|im_end|>", "", ""}, + }, + Mistral03: { + TemplateConfig: TemplateConfig{ + Chat: "{{.Input -}}", + Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`, + ChatMessage: `{{if eq .RoleName "user" -}} +[INST] {{.Content }} [/INST] +{{- else if .FunctionCall -}} +[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS] +{{- else if eq .RoleName "tool" -}} +[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS] +{{- else -}} +{{ .Content -}} +{{ end -}}`, + }, + StopWords: []string{"<|im_end|>", "", "", "<|eot_id|>", "<|end_of_text|>", "", "[/TOOL_CALLS]", "[/ACTIONS]"}, + }, +} + +// this maps well known template used in HF to model families defined above +var knownTemplates = map[string]familyType{ + `{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML, + `{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] 
== 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03, +} + +func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) { + + if defaultCtx == 0 && cfg.ContextSize == nil { + ctxSize := f.EstimateLLaMACppUsage().ContextSize + if ctxSize > 0 { + cSize := int(ctxSize) + cfg.ContextSize = &cSize + } else { + defaultCtx = defaultContextSize + cfg.ContextSize = &defaultCtx + } + } + + if cfg.HasTemplate() { + // nothing to guess here + log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set") + return + } + + log.Debug(). + Any("eosTokenID", f.Tokenizer().EOSTokenID). + Any("bosTokenID", f.Tokenizer().BOSTokenID). + Any("modelName", f.Model().Name). + Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName()) + + // guess the name + if cfg.Name == "" { + cfg.Name = f.Model().Name + } + + family := identifyFamily(f) + + if family == Unknown { + log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified") + return + } + + // identify template + settings, ok := defaultsSettings[family] + if ok { + cfg.TemplateConfig = settings.TemplateConfig + log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig) + if len(cfg.StopWords) == 0 { + cfg.StopWords = settings.StopWords + } + if cfg.RepeatPenalty == 0.0 { + cfg.RepeatPenalty = settings.RepeatPenalty + } + } else { + log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family") + } + + if cfg.HasTemplate() { + return + } + + // identify from well known templates first, otherwise use the raw jinja template + chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") + if found { + // try to use the jinja template + cfg.TemplateConfig.JinjaTemplate = true + cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() + } +} + +func identifyFamily(f *gguf.GGUFFile) familyType { + + // identify from well known templates first + chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") + if found && chatTemplate.ValueString() != "" { + if family, ok := knownTemplates[chatTemplate.ValueString()]; ok { + return family + } + } + + // otherwise try to identify from the model properties + arch := f.Architecture().Architecture + eosTokenID := f.Tokenizer().EOSTokenID + bosTokenID := f.Tokenizer().BOSTokenID + + isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2 + // WTF! 
Mistral0.3 and isYi have same bosTokenID and eosTokenID + + llama3 := arch == "llama" && eosTokenID == 128009 + commandR := arch == "command-r" && eosTokenID == 255001 + qwen2 := arch == "qwen2" + phi3 := arch == "phi-3" + gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma") + deepseek2 := arch == "deepseek2" + + switch { + case deepseek2: + return DeepSeek2 + case gemma: + return Gemma + case llama3: + return LLaMa3 + case commandR: + return CommandR + case phi3: + return Phi3 + case qwen2, isYI: + return ChatML + default: + return Unknown + } +} diff --git a/core/config/guesser.go b/core/config/guesser.go index 9c3ad59d..b7fb23de 100644 --- a/core/config/guesser.go +++ b/core/config/guesser.go @@ -3,147 +3,12 @@ package config import ( "os" "path/filepath" - "strings" "github.com/rs/zerolog/log" - gguf "github.com/thxcode/gguf-parser-go" ) -type familyType uint8 - -const ( - Unknown familyType = iota - LLaMa3 - CommandR - Phi3 - ChatML - Mistral03 - Gemma - DeepSeek2 -) - -type settingsConfig struct { - StopWords []string - TemplateConfig TemplateConfig - RepeatPenalty float64 -} - -// default settings to adopt with a given model family -var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{ - Gemma: { - RepeatPenalty: 1.0, - StopWords: []string{"<|im_end|>", "", ""}, - TemplateConfig: TemplateConfig{ - Chat: "{{.Input }}\nmodel\n", - ChatMessage: "{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}", - Completion: "{{.Input}}", - }, - }, - DeepSeek2: { - StopWords: []string{"<|end▁of▁sentence|>"}, - TemplateConfig: TemplateConfig{ - ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }} -{{ end -}} -{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}} -{{if eq .RoleName "system" -}}{{.Content}} -{{end -}}`, - Chat: "{{.Input -}}\nAssistant: ", - }, - }, - LLaMa3: { - StopWords: []string{"<|eot_id|>"}, - TemplateConfig: TemplateConfig{ - Chat: "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>", - ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>", - }, - }, - CommandR: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", - Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> -You are a function calling AI model, you can call the following functions: -## Available Tools -{{range .Functions}} -- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }} -{{end}} -When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}} -<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`, - ChatMessage: `{{if eq .RoleName "user" -}} -<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "system" -}} -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "assistant" -}} -<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if eq .RoleName "tool" -}} -<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|> -{{- else if .FunctionCall -}} -<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|> -{{- end -}}`, - }, - StopWords: []string{"<|END_OF_TURN_TOKEN|>"}, - }, - Phi3: { - TemplateConfig: 
TemplateConfig{ - Chat: "{{.Input}}\n<|assistant|>", - ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>", - Completion: "{{.Input}}", - }, - StopWords: []string{"<|end|>", "<|endoftext|>"}, - }, - ChatML: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}\n<|im_start|>assistant", - Functions: `<|im_start|>system -You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: -{{range .Functions}} -{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} -{{end}} -For each function call return a json object with function name and arguments -<|im_end|> -{{.Input -}} -<|im_start|>assistant`, - ChatMessage: `<|im_start|>{{ .RoleName }} -{{ if .FunctionCall -}} -Function call: -{{ else if eq .RoleName "tool" -}} -Function response: -{{ end -}} -{{ if .Content -}} -{{.Content }} -{{ end -}} -{{ if .FunctionCall -}} -{{toJson .FunctionCall}} -{{ end -}}<|im_end|>`, - }, - StopWords: []string{"<|im_end|>", "", ""}, - }, - Mistral03: { - TemplateConfig: TemplateConfig{ - Chat: "{{.Input -}}", - Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`, - ChatMessage: `{{if eq .RoleName "user" -}} -[INST] {{.Content }} [/INST] -{{- else if .FunctionCall -}} -[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS] -{{- else if eq .RoleName "tool" -}} -[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS] -{{- else -}} -{{ .Content -}} -{{ end -}}`, - }, - StopWords: []string{"<|im_end|>", "", "", "<|eot_id|>", "<|end_of_text|>", "", "[/TOOL_CALLS]", "[/ACTIONS]"}, - }, -} - -// this maps well known template used in HF to model families defined above -var knownTemplates = map[string]familyType{ - `{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML, - `{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03, -} - -func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) { - +func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) { if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" { log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING") return @@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) { return } - if cfg.HasTemplate() { - // nothing to guess here - log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set") - return - } - // We try to 
guess only if we don't have a template defined already guessPath := filepath.Join(modelPath, cfg.ModelFileName()) + + // try to parse the gguf file f, err := gguf.ParseGGUFFile(guessPath) - if err != nil { - // Only valid for gguf files - log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file") + if err == nil { + guessGGUFFromFile(cfg, f, defaultCtx) return } - log.Debug(). - Any("eosTokenID", f.Tokenizer().EOSTokenID). - Any("bosTokenID", f.Tokenizer().BOSTokenID). - Any("modelName", f.Model().Name). - Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName()) - - // guess the name - if cfg.Name == "" { - cfg.Name = f.Model().Name - } - - family := identifyFamily(f) - - if family == Unknown { - log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified") - return - } - - // identify template - settings, ok := defaultsSettings[family] - if ok { - cfg.TemplateConfig = settings.TemplateConfig - log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig) - if len(cfg.StopWords) == 0 { - cfg.StopWords = settings.StopWords + if cfg.ContextSize == nil { + if defaultCtx == 0 { + defaultCtx = defaultContextSize } - if cfg.RepeatPenalty == 0.0 { - cfg.RepeatPenalty = settings.RepeatPenalty - } - } else { - log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family") - } - - if cfg.HasTemplate() { - return - } - - // identify from well known templates first, otherwise use the raw jinja template - chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") - if found { - // try to use the jinja template - cfg.TemplateConfig.JinjaTemplate = true - cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString() - } -} - -func identifyFamily(f *gguf.GGUFFile) familyType { - - // identify from well known templates first - chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template") - if found && chatTemplate.ValueString() != "" { - if family, ok := knownTemplates[chatTemplate.ValueString()]; ok { - return family - } - } - - // otherwise try to identify from the model properties - arch := f.Architecture().Architecture - eosTokenID := f.Tokenizer().EOSTokenID - bosTokenID := f.Tokenizer().BOSTokenID - - isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2 - // WTF! 
Mistral0.3 and isYi have same bosTokenID and eosTokenID - - llama3 := arch == "llama" && eosTokenID == 128009 - commandR := arch == "command-r" && eosTokenID == 255001 - qwen2 := arch == "qwen2" - phi3 := arch == "phi-3" - gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma") - deepseek2 := arch == "deepseek2" - - switch { - case deepseek2: - return DeepSeek2 - case gemma: - return Gemma - case llama3: - return LLaMa3 - case commandR: - return CommandR - case phi3: - return Phi3 - case qwen2, isYI: - return ChatML - default: - return Unknown + cfg.ContextSize = &defaultCtx } } From d4a10b43008a4da251f8b1a1b517e79438be857f Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 29 Mar 2025 22:40:45 +0100 Subject: [PATCH 09/59] chore: :arrow_up: Update ggml-org/llama.cpp to `0bb2919335d00ff0bc79d5015da95c422de51f03` (#5095) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 397f2e63..dafd1a7d 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=b4ae50810e4304d052e630784c14bde7e79e4132 +CPPLLAMA_VERSION?=0bb2919335d00ff0bc79d5015da95c422de51f03 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From b34cf00819173e8d04c7be66e4bd8fa7f778dfea Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 30 Mar 2025 09:46:51 +0200 Subject: [PATCH 10/59] chore(model gallery): add galactic-qwen-14b-exp1 (#5096) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index f881b0d8..93ce04f9 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5356,6 +5356,21 @@ - filename: ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf sha256: cbda41c638c23a3e8e9fb33c27ca0d0a0ee044b6813941a0017fd46369a35ec5 uri: huggingface://bartowski/ChaoticNeutrals_Very_Berry_Qwen2_7B-GGUF/ChaoticNeutrals_Very_Berry_Qwen2_7B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "galactic-qwen-14b-exp1" + icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/SjM3y5Qcr2RX6zC3GQxR3.png + urls: + - https://huggingface.co/prithivMLmods/Galactic-Qwen-14B-Exp1 + - https://huggingface.co/mradermacher/Galactic-Qwen-14B-Exp1-GGUF + description: | + Galactic-Qwen-14B-Exp1 is based on the Qwen 2.5 14B modality architecture, designed to enhance the reasoning capabilities of 14B-parameter models. This model is optimized for general-purpose reasoning and answering, excelling in contextual understanding, logical deduction, and multi-step problem-solving. It has been fine-tuned using a long chain-of-thought reasoning model and specialized datasets to improve comprehension, structured responses, and conversational intelligence. 
+ overrides: + parameters: + model: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf + files: + - filename: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf + sha256: 26e99578c341c879cc2676c4c7a45b6c0d00b30bd17c8ee7494fcc4092480ef0 + uri: huggingface://mradermacher/Galactic-Qwen-14B-Exp1-GGUF/Galactic-Qwen-14B-Exp1.Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 7f3a02959618ce52ebf31a89e5a384757152e7f8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 30 Mar 2025 09:48:24 +0200 Subject: [PATCH 11/59] chore(model gallery): add forgotten-abomination-70b-v5.0 (#5097) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 93ce04f9..9faf5826 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1357,6 +1357,31 @@ - filename: Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf sha256: 8f3de344ba83705f9491c2ed354fd6131a05946fca2eceae495b6fc67a7dbe7a uri: huggingface://bartowski/Tarek07_Legion-V2.1-LLaMa-70B-GGUF/Tarek07_Legion-V2.1-LLaMa-70B-Q4_K_M.gguf +- !!merge <<: *llama33 + name: "forgotten-abomination-70b-v5.0" + icon: https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0/resolve/main/waifu5.webp + urls: + - https://huggingface.co/ReadyArt/Forgotten-Abomination-70B-v5.0 + - https://huggingface.co/mradermacher/Forgotten-Abomination-70B-v5.0-GGUF + description: | + The Unholy Union of Safeword and Nevoria + This model represents the forbidden merger of: + + 🧬 Forgotten-Safeword-70B-v5.0 - Industrial-grade depravity matrix with 23 distinct taboo specializations + ⚗️ L3.3-MS-Nevoria-70b - SteelSkull's masterpiece of storytelling and character adherence, free of positivity bias and ethical constraints + Expect: + + 🔥 Unparalleled erotic roleplaying with the least Llama refusals you've ever seen + 📖 Novel-quality prose that follows your character card with precision + 🧠 Handles complex multi-character scenarios effortlessly + 💀 Will gleefully explore any taboo subject without hesitation + overrides: + parameters: + model: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf + files: + - filename: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf + sha256: a5f5e712e66b855f36ff45175f20c24441fa942ca8af47bd6f49107c6e0f025d + uri: huggingface://mradermacher/Forgotten-Abomination-70B-v5.0-GGUF/Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From ae625a4d001c3aa98be037cb3513f9df5bdad4d4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 30 Mar 2025 09:50:21 +0200 Subject: [PATCH 12/59] chore(model gallery): add hammer2.0-7b (#5098) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 9faf5826..340943e1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5396,6 +5396,20 @@ - filename: Galactic-Qwen-14B-Exp1.Q4_K_M.gguf sha256: 26e99578c341c879cc2676c4c7a45b6c0d00b30bd17c8ee7494fcc4092480ef0 uri: huggingface://mradermacher/Galactic-Qwen-14B-Exp1-GGUF/Galactic-Qwen-14B-Exp1.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "hammer2.0-7b" + urls: + - https://huggingface.co/MadeAgents/Hammer2.0-7b + - https://huggingface.co/Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF + description: | + Hammer2.0 finetuned based on Qwen 2.5 series and Qwen 2.5 coder series using function masking techniques. 
It's trained using the APIGen Function Calling Datasets containing 60,000 samples, supplemented by xlam-irrelevance-7.5k we generated. Hammer2.0 has achieved exceptional performances across numerous function calling benchmarks. For more details, please refer to Hammer: Robust Function-Calling for On-Device Language Models via Function Masking and Hammer GitHub repository . + overrides: + parameters: + model: hammer2.0-7b-q5_k_m.gguf + files: + - filename: hammer2.0-7b-q5_k_m.gguf + sha256: 3682843c857595765f0786cf24b3d501af96fe5d99a9fb2526bc7707e28bae1e + uri: huggingface://Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF/hammer2.0-7b-q5_k_m.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From c2a39e3639227cfd94ffffe9f5691239acc275a8 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 30 Mar 2025 18:08:29 +0200 Subject: [PATCH 13/59] fix(llama.cpp): properly handle sigterm (#5099) Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama/grpc-server.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index 883fbf8f..edd22c5a 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -2122,7 +2122,11 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } std::function shutdown_handler; -inline void signal_handler(int signal) { shutdown_handler(signal); } + +inline void signal_handler(int signal) { + exit(1); +} + ///////////////////////////////// //////////////////////////////// @@ -2649,6 +2653,20 @@ void RunServer(const std::string& server_address) { int main(int argc, char** argv) { std::string server_address("localhost:50051"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? 
(signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + // Define long and short options struct option long_options[] = { {"addr", required_argument, nullptr, 'a'}, From 6d7ac09e96fc85fb45b4ce098b658b5040ed201b Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:59:30 +0200 Subject: [PATCH 14/59] chore: :arrow_up: Update ggml-org/llama.cpp to `4663bd353c61c1136cd8a97b9908755e4ab30cec` (#5100) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dafd1a7d..b203c0ae 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=0bb2919335d00ff0bc79d5015da95c422de51f03 +CPPLLAMA_VERSION?=4663bd353c61c1136cd8a97b9908755e4ab30cec # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 9a23fe662b66b3a2f0d75ad7a1c862823c145c32 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 31 Mar 2025 19:35:34 +0200 Subject: [PATCH 15/59] Update README.md Signed-off-by: Ettore Di Giacinto --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ff6358ca..a2bef5d3 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,23 @@ **LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler). 
-![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
+| Talk Interface | Generate Audio |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](https://github.com/user-attachments/assets/9841b1ee-88af-4b96-8ec0-41b17364efa7) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](https://github.com/user-attachments/assets/d729f6f4-0621-4715-bda3-35fe6e159524) |
+
+| Models Overview | Generate Images |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](https://github.com/user-attachments/assets/3cf0b918-ba8e-498a-a3cd-485db5984325) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](https://github.com/user-attachments/assets/6753d23d-218b-4e07-94b8-9e6c5a4f2311) |
+
+| Chat Interface | API Overview |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](https://github.com/user-attachments/assets/048eab31-0f0c-4d52-a920-3715233f9bf3) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](https://github.com/user-attachments/assets/2540e8ce-1a2c-4c12-800c-763bd9be247f) |
+
+| Login | Swarm |
+| --- | --- |
+|![Screenshot 2025-03-31 at 12-09-59 ](https://github.com/user-attachments/assets/5af681b0-dd8e-4fe8-a234-a22f8a040547) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](https://github.com/user-attachments/assets/b9527176-63d6-4d2e-8ed1-7fde13a9b0ad) |
+
+## Quickstart
 
 Run the installer script:

From 65470b0ab13daf3327cd1b0bdfdc8c234feb29ea Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 31 Mar 2025 21:51:09 +0200
Subject: [PATCH 16/59] Update README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a2bef5d3..00e83d8b 100644
--- a/README.md
+++ b/README.md
@@ -108,6 +108,8 @@ local-ai run oci://localai/phi-2:latest
 
 ## 📰 Latest project news
 
+- Apr 2025: WebUI overhaul, AIO images updates
+- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )

From f09b33f2efa7debc58e7d2a18778a8b904830ba4 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Mon, 31 Mar 2025 22:48:03 +0200
Subject: [PATCH 17/59] docs: :arrow_up: update docs version mudler/LocalAI
 (#5104)

:arrow_up: Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
---
 docs/data/version.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/data/version.json b/docs/data/version.json
index 750c0c8f..04c2b2d0 100644
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-    "version": "v2.26.0"
+    "version": "v2.27.0"
 }

From 2f9203cd2ac3f3f9fde6b9b20752911827a8de72 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Mon, 31 Mar 2025 22:48:17 +0200
Subject: [PATCH 18/59] chore: drop remoteLibraryURL from kong vars (#5103)

Signed-off-by: Ettore Di Giacinto
---
 main.go | 7 +++----
 1 file changed, 3
insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index 8da7bfcd..8dda313d 100644 --- a/main.go +++ b/main.go @@ -74,10 +74,9 @@ Version: ${version} ), kong.UsageOnError(), kong.Vars{ - "basepath": kong.ExpandPath("."), - "remoteLibraryURL": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml", - "galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`, - "version": internal.PrintableVersion(), + "basepath": kong.ExpandPath("."), + "galleries": `[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]`, + "version": internal.PrintableVersion(), }, ) From 05f70044875e18747c08605740f3a59169e998bf Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 1 Apr 2025 00:01:10 +0200 Subject: [PATCH 19/59] fix: race during stop of active backends (#5106) * chore: drop double call to stop all backends, refactors Signed-off-by: Ettore Di Giacinto * fix: do lock when cycling to models to delete Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- .env | 3 +++ pkg/model/initializers.go | 20 ++++++++++++-------- pkg/model/loader.go | 20 -------------------- pkg/model/process.go | 39 ++++++++++++++++++++++++++++++--------- 4 files changed, 45 insertions(+), 37 deletions(-) diff --git a/.env b/.env index ee8db74e..86596105 100644 --- a/.env +++ b/.env @@ -29,6 +29,9 @@ ## Enable/Disable single backend (useful if only one GPU is available) # LOCALAI_SINGLE_ACTIVE_BACKEND=true +# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set) +# LOCALAI_FORCE_BACKEND_SHUTDOWN=true + ## Specify a build type. Available: cublas, openblas, clblas. ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit. ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM. 
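Taken together, the two variables above act as a pair: per the comment added to .env, LOCALAI_FORCE_BACKEND_SHUTDOWN only applies when LOCALAI_SINGLE_ACTIVE_BACKEND is set, and (as the pkg/model/process.go hunk below shows) it force-stops a backend that still reports busy once the retry loop is exhausted. A minimal sketch of a configuration that exercises this path (the `local-ai run` invocation is illustrative and not part of this patch):

LOCALAI_SINGLE_ACTIVE_BACKEND=true
LOCALAI_FORCE_BACKEND_SHUTDOWN=true

local-ai run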
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 1a51eb2a..12a1a972 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -473,8 +473,6 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e backend = realBackend } - ml.stopActiveBackends(o.modelID, o.singleActiveBackend) - var backendToConsume string switch backend { @@ -497,13 +495,17 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e } func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) { + if !singleActiveBackend { + return + } + // If we can have only one backend active, kill all the others (except external backends) - if singleActiveBackend { - log.Debug().Msgf("Stopping all backends except '%s'", modelID) - err := ml.StopGRPC(allExcept(modelID)) - if err != nil { - log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing") - } + + // Stop all backends except the one we are going to load + log.Debug().Msgf("Stopping all backends except '%s'", modelID) + err := ml.StopGRPC(allExcept(modelID)) + if err != nil { + log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing") } } @@ -520,10 +522,12 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { ml.stopActiveBackends(o.modelID, o.singleActiveBackend) + // if a backend is defined, return the loader directly if o.backendString != "" { return ml.backendLoader(opts...) } + // Otherwise scan for backends in the asset directory var err error // get backends embedded in the binary diff --git a/pkg/model/loader.go b/pkg/model/loader.go index bb9bdd8a..c25662d3 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -142,26 +142,6 @@ func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string, func (ml *ModelLoader) ShutdownModel(modelName string) error { ml.mu.Lock() defer ml.mu.Unlock() - model, ok := ml.models[modelName] - if !ok { - return fmt.Errorf("model %s not found", modelName) - } - - retries := 1 - for model.GRPC(false, ml.wd).IsBusy() { - log.Debug().Msgf("%s busy. Waiting.", modelName) - dur := time.Duration(retries*2) * time.Second - if dur > retryTimeout { - dur = retryTimeout - } - time.Sleep(dur) - retries++ - - if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" { - log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries) - break - } - } return ml.deleteProcess(modelName) } diff --git a/pkg/model/process.go b/pkg/model/process.go index c27fbda3..2e8369a0 100644 --- a/pkg/model/process.go +++ b/pkg/model/process.go @@ -9,25 +9,43 @@ import ( "strconv" "strings" "syscall" + "time" "github.com/hpcloud/tail" process "github.com/mudler/go-processmanager" "github.com/rs/zerolog/log" ) +var forceBackendShutdown bool = os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" + func (ml *ModelLoader) deleteProcess(s string) error { + model, ok := ml.models[s] + if !ok { + log.Debug().Msgf("Model %s not found", s) + return fmt.Errorf("model %s not found", s) + } + defer delete(ml.models, s) + retries := 1 + for model.GRPC(false, ml.wd).IsBusy() { + log.Debug().Msgf("%s busy. 
Waiting.", s) + dur := time.Duration(retries*2) * time.Second + if dur > retryTimeout { + dur = retryTimeout + } + time.Sleep(dur) + retries++ + + if retries > 10 && forceBackendShutdown { + log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", s, retries) + break + } + } + log.Debug().Msgf("Deleting process %s", s) - m, exists := ml.models[s] - if !exists { - log.Error().Msgf("Model does not exist %s", s) - // Nothing to do - return nil - } - - process := m.Process() + process := model.Process() if process == nil { log.Error().Msgf("No process for %s", s) // Nothing to do as there is no process @@ -44,9 +62,12 @@ func (ml *ModelLoader) deleteProcess(s string) error { func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error { var err error = nil + ml.mu.Lock() + defer ml.mu.Unlock() + for k, m := range ml.models { if filter(k, m.Process()) { - e := ml.ShutdownModel(k) + e := ml.deleteProcess(k) err = errors.Join(err, e) } } From c59975ab05af8dddddf5859f2cefb94890c88bd4 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Tue, 1 Apr 2025 00:01:34 +0200 Subject: [PATCH 20/59] chore: :arrow_up: Update ggml-org/llama.cpp to `c80a7759dab10657b9b6c3e87eef988a133b9b6a` (#5105) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b203c0ae..1e36c123 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=4663bd353c61c1136cd8a97b9908755e4ab30cec +CPPLLAMA_VERSION?=c80a7759dab10657b9b6c3e87eef988a133b9b6a # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 2c425e9c691e14d064b53827fbf6cb21fdaff855 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 1 Apr 2025 20:58:11 +0200 Subject: [PATCH 21/59] feat(loader): enhance single active backend by treating as singleton (#5107) feat(loader): enhance single active backend by treating at singleton Signed-off-by: Ettore Di Giacinto --- core/application/application.go | 2 +- core/application/startup.go | 2 +- core/backend/embeddings.go | 1 + core/backend/image.go | 1 + core/backend/llm.go | 1 + core/backend/options.go | 60 +++++++++----------- core/backend/rerank.go | 2 +- core/backend/soundgeneration.go | 2 +- core/backend/token_metrics.go | 1 + core/backend/tokenize.go | 2 +- core/backend/transcript.go | 1 + core/backend/tts.go | 2 +- core/backend/vad.go | 2 + core/cli/soundgeneration.go | 2 +- core/cli/transcript.go | 2 +- core/cli/tts.go | 2 +- core/http/endpoints/localai/stores.go | 4 ++ core/http/endpoints/openai/assistant_test.go | 2 +- core/http/routes/localai.go | 9 ++- pkg/model/initializers.go | 21 ++++++- pkg/model/loader.go | 17 +++--- pkg/model/loader_options.go | 13 +---- pkg/model/loader_test.go | 2 +- tests/integration/stores_test.go | 10 ++-- 24 files changed, 92 insertions(+), 71 deletions(-) diff --git a/core/application/application.go b/core/application/application.go index 6e8d6204..8c9842d9 100644 --- a/core/application/application.go +++ b/core/application/application.go @@ -16,7 +16,7 @@ type Application struct { func newApplication(appConfig *config.ApplicationConfig) *Application { return &Application{ backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath), - modelLoader: 
model.NewModelLoader(appConfig.ModelPath), + modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend), applicationConfig: appConfig, templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath), } diff --git a/core/application/startup.go b/core/application/startup.go index 3cfbd684..6c93f03f 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -143,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) { }() } - if options.LoadToMemory != nil { + if options.LoadToMemory != nil && !options.SingleBackend { for _, m := range options.LoadToMemory { cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options) if err != nil { diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go index a96e9829..aece0cdd 100644 --- a/core/backend/embeddings.go +++ b/core/backend/embeddings.go @@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo if err != nil { return nil, err } + defer loader.Close() var fn func() ([]float32, error) switch model := inferenceModel.(type) { diff --git a/core/backend/image.go b/core/backend/image.go index 38ca4357..4b34f2cf 100644 --- a/core/backend/image.go +++ b/core/backend/image.go @@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat if err != nil { return nil, err } + defer loader.Close() fn := func() error { _, err := inferenceModel.GenerateImage( diff --git a/core/backend/llm.go b/core/backend/llm.go index 14eb8569..57e2ae35 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im if err != nil { return nil, err } + defer loader.Close() var protoMessages []*proto.Message // if we are using the tokenizer template, we need to convert the messages to proto messages diff --git a/core/backend/options.go b/core/backend/options.go index d98e136c..7a7a69bb 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ... 
grpcOpts := grpcModelOpts(c) defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts)) - if so.SingleBackend { - defOpts = append(defOpts, model.WithSingleActiveBackend()) - } - if so.ParallelBackendRequests { defOpts = append(defOpts, model.EnableParallelRequests) } @@ -121,7 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { triggers := make([]*pb.GrammarTrigger, 0) for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers { triggers = append(triggers, &pb.GrammarTrigger{ - Word: t.Word, + Word: t.Word, }) } @@ -161,33 +157,33 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions { DisableLogStatus: c.DisableLogStatus, DType: c.DType, // LimitMMPerPrompt vLLM - LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt), - LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt), - LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt), - MMProj: c.MMProj, - FlashAttention: c.FlashAttention, - CacheTypeKey: c.CacheTypeK, - CacheTypeValue: c.CacheTypeV, - NoKVOffload: c.NoKVOffloading, - YarnExtFactor: c.YarnExtFactor, - YarnAttnFactor: c.YarnAttnFactor, - YarnBetaFast: c.YarnBetaFast, - YarnBetaSlow: c.YarnBetaSlow, - NGQA: c.NGQA, - RMSNormEps: c.RMSNormEps, - MLock: mmlock, - RopeFreqBase: c.RopeFreqBase, - RopeScaling: c.RopeScaling, - Type: c.ModelType, - RopeFreqScale: c.RopeFreqScale, - NUMA: c.NUMA, - Embeddings: embeddings, - LowVRAM: lowVRAM, - NGPULayers: int32(nGPULayers), - MMap: mmap, - MainGPU: c.MainGPU, - Threads: int32(*c.Threads), - TensorSplit: c.TensorSplit, + LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt), + LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt), + LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt), + MMProj: c.MMProj, + FlashAttention: c.FlashAttention, + CacheTypeKey: c.CacheTypeK, + CacheTypeValue: c.CacheTypeV, + NoKVOffload: c.NoKVOffloading, + YarnExtFactor: c.YarnExtFactor, + YarnAttnFactor: c.YarnAttnFactor, + YarnBetaFast: c.YarnBetaFast, + YarnBetaSlow: c.YarnBetaSlow, + NGQA: c.NGQA, + RMSNormEps: c.RMSNormEps, + MLock: mmlock, + RopeFreqBase: c.RopeFreqBase, + RopeScaling: c.RopeScaling, + Type: c.ModelType, + RopeFreqScale: c.RopeFreqScale, + NUMA: c.NUMA, + Embeddings: embeddings, + LowVRAM: lowVRAM, + NGPULayers: int32(nGPULayers), + MMap: mmap, + MainGPU: c.MainGPU, + Threads: int32(*c.Threads), + TensorSplit: c.TensorSplit, // AutoGPTQ ModelBaseName: c.AutoGPTQ.ModelBaseName, Device: c.AutoGPTQ.Device, diff --git a/core/backend/rerank.go b/core/backend/rerank.go index da565620..d7937ce4 100644 --- a/core/backend/rerank.go +++ b/core/backend/rerank.go @@ -12,10 +12,10 @@ import ( func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) { opts := ModelOptions(backendConfig, appConfig) rerankModel, err := loader.Load(opts...) - if err != nil { return nil, err } + defer loader.Close() if rerankModel == nil { return nil, fmt.Errorf("could not load rerank model") diff --git a/core/backend/soundgeneration.go b/core/backend/soundgeneration.go index 49813d82..94ec9c89 100644 --- a/core/backend/soundgeneration.go +++ b/core/backend/soundgeneration.go @@ -26,10 +26,10 @@ func SoundGeneration( opts := ModelOptions(backendConfig, appConfig) soundGenModel, err := loader.Load(opts...) 
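Each defer loader.Close() added in this patch follows the same contract: a successful Load is paired with exactly one Close so that, in single-backend mode, the lock taken by Load is released once the request finishes. A minimal sketch of that call-site pattern in Go; runOnce is a hypothetical helper, and the import path is an assumption based on the repository layout:

package example

import (
	model "github.com/mudler/LocalAI/pkg/model" // assumed import path
)

// runOnce is illustrative only; it mirrors the pattern this patch applies
// at every backend call site (embeddings, image, llm, tts, ...).
func runOnce(loader *model.ModelLoader, opts ...model.Option) error {
	backend, err := loader.Load(opts...)
	if err != nil {
		// Load's own error paths release the singleton lock via ml.Close()
		// (see the ml.Close() calls added in initializers.go), so the
		// caller only pairs Close with a successful Load.
		return err
	}
	// In singleton mode Close releases the lock taken by Load;
	// otherwise it is a no-op.
	defer loader.Close()

	_ = backend // issue the gRPC calls against the backend here
	return nil
}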
- if err != nil { return "", nil, err } + defer loader.Close() if soundGenModel == nil { return "", nil, fmt.Errorf("could not load sound generation model") diff --git a/core/backend/token_metrics.go b/core/backend/token_metrics.go index cc71c868..ac34e34f 100644 --- a/core/backend/token_metrics.go +++ b/core/backend/token_metrics.go @@ -20,6 +20,7 @@ func TokenMetrics( if err != nil { return nil, err } + defer loader.Close() if model == nil { return nil, fmt.Errorf("could not loadmodel model") diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go index e04a59d8..43c46134 100644 --- a/core/backend/tokenize.go +++ b/core/backend/tokenize.go @@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac opts := ModelOptions(backendConfig, appConfig) inferenceModel, err = loader.Load(opts...) - if err != nil { return schema.TokenizeResponse{}, err } + defer loader.Close() predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath) predictOptions.Prompt = s diff --git a/core/backend/transcript.go b/core/backend/transcript.go index 080f43b1..64f9c5e2 100644 --- a/core/backend/transcript.go +++ b/core/backend/transcript.go @@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL if err != nil { return nil, err } + defer ml.Close() if transcriptionModel == nil { return nil, fmt.Errorf("could not load transcription model") diff --git a/core/backend/tts.go b/core/backend/tts.go index e6191cfb..6157f4c1 100644 --- a/core/backend/tts.go +++ b/core/backend/tts.go @@ -23,10 +23,10 @@ func ModelTTS( ) (string, *proto.Result, error) { opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend)) ttsModel, err := loader.Load(opts...) 
- if err != nil { return "", nil, err } + defer loader.Close() if ttsModel == nil { return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model) diff --git a/core/backend/vad.go b/core/backend/vad.go index 8d148353..741dbb19 100644 --- a/core/backend/vad.go +++ b/core/backend/vad.go @@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest, if err != nil { return nil, err } + defer ml.Close() + req := proto.VADRequest{ Audio: request.Audio, } diff --git a/core/cli/soundgeneration.go b/core/cli/soundgeneration.go index a8acd6ba..3c7e9af4 100644 --- a/core/cli/soundgeneration.go +++ b/core/cli/soundgeneration.go @@ -74,7 +74,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error { AssetsDestination: t.BackendAssetsPath, ExternalGRPCBackends: externalBackends, } - ml := model.NewModelLoader(opts.ModelPath) + ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) defer func() { err := ml.StopAllGRPC() diff --git a/core/cli/transcript.go b/core/cli/transcript.go index 7f5e6a9d..67b5ed1d 100644 --- a/core/cli/transcript.go +++ b/core/cli/transcript.go @@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error { } cl := config.NewBackendConfigLoader(t.ModelsPath) - ml := model.NewModelLoader(opts.ModelPath) + ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil { return err } diff --git a/core/cli/tts.go b/core/cli/tts.go index af51ce06..283372fe 100644 --- a/core/cli/tts.go +++ b/core/cli/tts.go @@ -41,7 +41,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error { AudioDir: outputDir, AssetsDestination: t.BackendAssetsPath, } - ml := model.NewModelLoader(opts.ModelPath) + ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) defer func() { err := ml.StopAllGRPC() diff --git a/core/http/endpoints/localai/stores.go b/core/http/endpoints/localai/stores.go index f417c580..dd8df8b1 100644 --- a/core/http/endpoints/localai/stores.go +++ b/core/http/endpoints/localai/stores.go @@ -21,6 +21,7 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi if err != nil { return err } + defer sl.Close() vals := make([][]byte, len(input.Values)) for i, v := range input.Values { @@ -48,6 +49,7 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo if err != nil { return err } + defer sl.Close() if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil { return err @@ -69,6 +71,7 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi if err != nil { return err } + defer sl.Close() keys, vals, err := store.GetCols(c.Context(), sb, input.Keys) if err != nil { @@ -100,6 +103,7 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf if err != nil { return err } + defer sl.Close() keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk) if err != nil { diff --git a/core/http/endpoints/openai/assistant_test.go b/core/http/endpoints/openai/assistant_test.go index 6858f65d..90edb935 100644 --- a/core/http/endpoints/openai/assistant_test.go +++ b/core/http/endpoints/openai/assistant_test.go @@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) { cl := &config.BackendConfigLoader{} //configsDir := "/tmp/localai/configs" modelPath := "/tmp/localai/model" - var ml = model.NewModelLoader(modelPath) + var ml = model.NewModelLoader(modelPath, false) appConfig := &config.ApplicationConfig{ ConfigsDir: configsDir, diff --git 
a/core/http/routes/localai.go b/core/http/routes/localai.go index 20c571fd..ebf9c1c9 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -50,11 +50,10 @@ func RegisterLocalAIRoutes(router *fiber.App, router.Post("/v1/vad", vadChain...) // Stores - sl := model.NewModelLoader("") - router.Post("/stores/set", localai.StoresSetEndpoint(sl, appConfig)) - router.Post("/stores/delete", localai.StoresDeleteEndpoint(sl, appConfig)) - router.Post("/stores/get", localai.StoresGetEndpoint(sl, appConfig)) - router.Post("/stores/find", localai.StoresFindEndpoint(sl, appConfig)) + router.Post("/stores/set", localai.StoresSetEndpoint(ml, appConfig)) + router.Post("/stores/delete", localai.StoresDeleteEndpoint(ml, appConfig)) + router.Post("/stores/get", localai.StoresGetEndpoint(ml, appConfig)) + router.Post("/stores/find", localai.StoresFindEndpoint(ml, appConfig)) if !appConfig.DisableMetrics { router.Get("/metrics", localai.LocalAIMetricsEndpoint()) diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 12a1a972..1a7fdc9c 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -509,7 +509,23 @@ func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bo } } +func (ml *ModelLoader) Close() { + if !ml.singletonMode { + return + } + ml.singletonLock.Unlock() +} + +func (ml *ModelLoader) lockBackend() { + if !ml.singletonMode { + return + } + ml.singletonLock.Lock() +} + func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { + ml.lockBackend() // grab the singleton lock if needed + o := NewOptions(opts...) // Return earlier if we have a model already loaded @@ -520,7 +536,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { return m.GRPC(o.parallelRequests, ml.wd), nil } - ml.stopActiveBackends(o.modelID, o.singleActiveBackend) + ml.stopActiveBackends(o.modelID, ml.singletonMode) // if a backend is defined, return the loader directly if o.backendString != "" { @@ -533,6 +549,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { // get backends embedded in the binary autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir) if err != nil { + ml.Close() // we failed, release the lock return nil, err } @@ -564,5 +581,7 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { } } + ml.Close() // make sure to release the lock in case of failure + return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error()) } diff --git a/pkg/model/loader.go b/pkg/model/loader.go index c25662d3..e74ea97b 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -18,16 +18,19 @@ import ( // TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. 
Would split if we seperate directories for .bin/.yaml and .tmpl type ModelLoader struct { - ModelPath string - mu sync.Mutex - models map[string]*Model - wd *WatchDog + ModelPath string + mu sync.Mutex + singletonLock sync.Mutex + singletonMode bool + models map[string]*Model + wd *WatchDog } -func NewModelLoader(modelPath string) *ModelLoader { +func NewModelLoader(modelPath string, singleActiveBackend bool) *ModelLoader { nml := &ModelLoader{ - ModelPath: modelPath, - models: make(map[string]*Model), + ModelPath: modelPath, + models: make(map[string]*Model), + singletonMode: singleActiveBackend, } return nml diff --git a/pkg/model/loader_options.go b/pkg/model/loader_options.go index c151d53b..28a7c598 100644 --- a/pkg/model/loader_options.go +++ b/pkg/model/loader_options.go @@ -17,10 +17,9 @@ type Options struct { externalBackends map[string]string - grpcAttempts int - grpcAttemptsDelay int - singleActiveBackend bool - parallelRequests bool + grpcAttempts int + grpcAttemptsDelay int + parallelRequests bool } type Option func(*Options) @@ -88,12 +87,6 @@ func WithContext(ctx context.Context) Option { } } -func WithSingleActiveBackend() Option { - return func(o *Options) { - o.singleActiveBackend = true - } -} - func WithModelID(id string) Option { return func(o *Options) { o.modelID = id diff --git a/pkg/model/loader_test.go b/pkg/model/loader_test.go index 83e47ec6..a8e77bd2 100644 --- a/pkg/model/loader_test.go +++ b/pkg/model/loader_test.go @@ -21,7 +21,7 @@ var _ = Describe("ModelLoader", func() { // Setup the model loader with a test directory modelPath = "/tmp/test_model_path" os.Mkdir(modelPath, 0755) - modelLoader = model.NewModelLoader(modelPath) + modelLoader = model.NewModelLoader(modelPath, false) }) AfterEach(func() { diff --git a/tests/integration/stores_test.go b/tests/integration/stores_test.go index 9612bec0..5484a79c 100644 --- a/tests/integration/stores_test.go +++ b/tests/integration/stores_test.go @@ -70,7 +70,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs" model.WithModel("test"), } - sl = model.NewModelLoader("") + sl = model.NewModelLoader("", false) sc, err = sl.Load(storeOpts...) 
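As the loader.go and loader_options.go hunks above show, the per-call WithSingleActiveBackend option is replaced by a constructor flag. A small usage sketch under the same assumption about the import path:

package main

import (
	model "github.com/mudler/LocalAI/pkg/model" // assumed import path
)

func main() {
	// The second argument is the new singleActiveBackend flag: when true,
	// the loader behaves as a singleton. Load takes an internal lock and
	// stops every other backend before loading a new one.
	ml := model.NewModelLoader("/models", true)

	backend, err := ml.Load(model.WithModel("test"))
	if err != nil {
		panic(err)
	}
	// Close releases the singleton lock so the next Load can proceed.
	defer ml.Close()
	_ = backend
}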
Expect(err).ToNot(HaveOccurred()) Expect(sc).ToNot(BeNil()) @@ -235,7 +235,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs" keys := [][]float32{{1.0, 0.0, 0.0}, {0.0, 1.0, 0.0}, {0.0, 0.0, 1.0}, {-1.0, 0.0, 0.0}} vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")} - err := store.SetCols(context.Background(), sc, keys, vals); + err := store.SetCols(context.Background(), sc, keys, vals) Expect(err).ToNot(HaveOccurred()) _, _, sims, err := store.Find(context.Background(), sc, keys[0], 4) @@ -247,7 +247,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs" keys := [][]float32{{1.0, 0.0, 1.0}, {0.0, 2.0, 0.0}, {0.0, 0.0, -1.0}, {-1.0, 0.0, -1.0}} vals := [][]byte{[]byte("x"), []byte("y"), []byte("z"), []byte("-z")} - err := store.SetCols(context.Background(), sc, keys, vals); + err := store.SetCols(context.Background(), sc, keys, vals) Expect(err).ToNot(HaveOccurred()) _, _, sims, err := store.Find(context.Background(), sc, keys[0], 4) @@ -314,7 +314,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs" normalize(keys[6:]) - err := store.SetCols(context.Background(), sc, keys, vals); + err := store.SetCols(context.Background(), sc, keys, vals) Expect(err).ToNot(HaveOccurred()) expectTriangleEq(keys, vals) @@ -341,7 +341,7 @@ var _ = Describe("Integration tests for the stores backend(s) and internal APIs" c += 1 } - err := store.SetCols(context.Background(), sc, keys, vals); + err := store.SetCols(context.Background(), sc, keys, vals) Expect(err).ToNot(HaveOccurred()) expectTriangleEq(keys, vals) From cbbc954a8ca2da311eb79641ccce18a590205076 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Wed, 2 Apr 2025 09:22:53 +0200 Subject: [PATCH 22/59] chore: :arrow_up: Update ggml-org/llama.cpp to `f423981ac806bf031d83784bcb47d2721bc70f97` (#5108) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1e36c123..13f7bb18 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=c80a7759dab10657b9b6c3e87eef988a133b9b6a +CPPLLAMA_VERSION?=f423981ac806bf031d83784bcb47d2721bc70f97 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 7ee32884600ae98b1fec690dc74373b671aa8f01 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 3 Apr 2025 10:15:57 +0200 Subject: [PATCH 23/59] chore(model gallery): add all-hands_openhands-lm-32b-v0.1 (#5111) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 340943e1..924144a3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5410,6 +5410,36 @@ - filename: hammer2.0-7b-q5_k_m.gguf sha256: 3682843c857595765f0786cf24b3d501af96fe5d99a9fb2526bc7707e28bae1e uri: huggingface://Nekuromento/Hammer2.0-7b-Q5_K_M-GGUF/hammer2.0-7b-q5_k_m.gguf +- !!merge <<: *qwen25 + icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true + name: "all-hands_openhands-lm-32b-v0.1" + urls: + - https://huggingface.co/all-hands/openhands-lm-32b-v0.1 + - 
https://huggingface.co/bartowski/all-hands_openhands-lm-32b-v0.1-GGUF + description: | + Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service. + + Today, we are excited to introduce OpenHands LM, a new open coding model that: + + Is open and available on Hugging Face, so you can download it and run it locally + Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU + Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified + + Read below for more details and our future plans! + What is OpenHands LM? + + OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. What sets OpenHands LM apart is our specialized fine-tuning process: + + We used training data generated by OpenHands itself on a diverse set of open-source repositories + Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully + It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks + overrides: + parameters: + model: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf + files: + - filename: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf + sha256: f7c2311d3264cc1e021a21a319748a9c75b74ddebe38551786aa4053448e5e74 + uri: huggingface://bartowski/all-hands_openhands-lm-32b-v0.1-GGUF/all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 22060f641082ed3d121a94a112b3c9f1aadb47d6 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 3 Apr 2025 10:17:57 +0200 Subject: [PATCH 24/59] chore(model gallery): add burtenshaw_gemmacoder3-12b (#5112) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 924144a3..a646908f 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -386,6 +386,21 @@ - filename: Gemma-3-Starshine-12B.i1-Q4_K_M.gguf sha256: 4c35a678e3784e20a8d85d4e7045d965509a1a71305a0da105fc5991ba7d6dc4 uri: huggingface://mradermacher/Gemma-3-Starshine-12B-i1-GGUF/Gemma-3-Starshine-12B.i1-Q4_K_M.gguf +- !!merge <<: *gemma3 + name: "burtenshaw_gemmacoder3-12b" + icon: https://cdn-uploads.huggingface.co/production/uploads/62d648291fa3e4e7ae3fa6e8/zkcBr2UZFDpALAsMdgbze.gif + urls: + - https://huggingface.co/burtenshaw/GemmaCoder3-12B + - https://huggingface.co/bartowski/burtenshaw_GemmaCoder3-12B-GGUF + description: | + This model is a fine-tuned version of google/gemma-3-12b-it on the open-r1/codeforces-cots dataset. It has been trained using TRL. 
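Once installed, gallery entries like this one are served under their gallery name through LocalAI's OpenAI-compatible API. A minimal request sketch in Go; the localhost:8080 address assumes a default local instance:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumes a LocalAI instance on localhost:8080 with the model
	// installed under the gallery name declared above.
	body := []byte(`{
		"model": "burtenshaw_gemmacoder3-12b",
		"messages": [{"role": "user", "content": "Write a binary search in Go."}]
	}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}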
+ overrides: + parameters: + model: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf + files: + - filename: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf + sha256: 47f0a2848eeed783cb03336afd8cc69f6ee0e088e3cec11ab6d9fe16457dc3d4 + uri: huggingface://bartowski/burtenshaw_GemmaCoder3-12B-GGUF/burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf - &eurollm name: "eurollm-9b-instruct" icon: https://openeurollm.eu/_next/static/media/logo-dark.e7001867.svg From 89e151f035f1fb9bf845579377fde9bd14a7961c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 3 Apr 2025 10:20:20 +0200 Subject: [PATCH 25/59] chore(model gallery): add all-hands_openhands-lm-7b-v0.1 (#5113) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index a646908f..c2779b76 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5455,6 +5455,36 @@ - filename: all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf sha256: f7c2311d3264cc1e021a21a319748a9c75b74ddebe38551786aa4053448e5e74 uri: huggingface://bartowski/all-hands_openhands-lm-32b-v0.1-GGUF/all-hands_openhands-lm-32b-v0.1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "all-hands_openhands-lm-7b-v0.1" + icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true + urls: + - https://huggingface.co/all-hands/openhands-lm-7b-v0.1 + - https://huggingface.co/bartowski/all-hands_openhands-lm-7b-v0.1-GGUF + description: | + This is a smaller 7B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service. + + Today, we are excited to introduce OpenHands LM, a new open coding model that: + + Is open and available on Hugging Face, so you can download it and run it locally + Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU + Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified + + Read below for more details and our future plans! + What is OpenHands LM? + + OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. 
What sets OpenHands LM apart is our specialized fine-tuning process: + + We used training data generated by OpenHands itself on a diverse set of open-source repositories + Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully + It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks + overrides: + parameters: + model: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf + files: + - filename: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf + sha256: d50031b04bbdad714c004a0dc117c18d26a026297c236cda36089c20279b2ec1 + uri: huggingface://bartowski/all-hands_openhands-lm-7b-v0.1-GGUF/all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 18b320d577d610b806ac19f30bcb29278bcc1325 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 3 Apr 2025 10:23:14 +0200 Subject: [PATCH 26/59] chore(deps): bump llama.cpp to 'f01bd02376f919b05ee635f438311be8dfc91d7c (#5110) chore(deps): bump llama.cpp to 'f01bd02376f919b05ee635f438311be8dfc91d7c' Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/grpc-server.cpp | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 13f7bb18..299a9b1d 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=f423981ac806bf031d83784bcb47d2721bc70f97 +CPPLLAMA_VERSION?=f01bd02376f919b05ee635f438311be8dfc91d7c # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index edd22c5a..ded46b1c 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -509,15 +509,15 @@ struct llama_server_context bool load_model(const common_params ¶ms_) { params = params_; - if (!params.mmproj.empty()) { + if (!params.mmproj.path.empty()) { multimodal = true; LOG_INFO("Multi Modal Mode Enabled", {}); - clp_ctx = clip_init(params.mmproj.c_str(), clip_context_params { + clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params { /* use_gpu */ has_gpu, /*verbosity=*/ 1, }); if(clp_ctx == nullptr) { - LOG_ERR("unable to load clip model: %s", params.mmproj.c_str()); + LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str()); return false; } @@ -531,7 +531,7 @@ struct llama_server_context ctx = common_init.context.release(); if (model == nullptr) { - LOG_ERR("unable to load model: %s", params.model.c_str()); + LOG_ERR("unable to load model: %s", params.model.path.c_str()); return false; } @@ -2326,11 +2326,11 @@ static void params_parse(const backend::ModelOptions* request, // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809 - params.model = request->modelfile(); + params.model.path = request->modelfile(); if (!request->mmproj().empty()) { // get the directory of modelfile - std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); - params.mmproj = model_dir + "/"+ request->mmproj(); + std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\")); + params.mmproj.path = model_dir + "/"+ 
request->mmproj(); } // params.model_alias ?? params.model_alias = request->modelfile(); @@ -2405,7 +2405,7 @@ static void params_parse(const backend::ModelOptions* request, scale_factor = request->lorascale(); } // get the directory of modelfile - std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\")); + std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\")); params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor }); } params.use_mlock = request->mlock(); From 259ad3cfe61bd3e13fb12941875843416ea6d2c9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 3 Apr 2025 10:25:46 +0200 Subject: [PATCH 27/59] chore(model gallery): add all-hands_openhands-lm-1.5b-v0.1 (#5114) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index c2779b76..feccdb10 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5485,6 +5485,36 @@ - filename: all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf sha256: d50031b04bbdad714c004a0dc117c18d26a026297c236cda36089c20279b2ec1 uri: huggingface://bartowski/all-hands_openhands-lm-7b-v0.1-GGUF/all-hands_openhands-lm-7b-v0.1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "all-hands_openhands-lm-1.5b-v0.1" + icon: https://github.com/All-Hands-AI/OpenHands/blob/main/docs/static/img/logo.png?raw=true + urls: + - https://huggingface.co/all-hands/openhands-lm-1.5b-v0.1 + - https://huggingface.co/bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF + description: | + This is a smaller 1.5B model trained following the recipe of all-hands/openhands-lm-32b-v0.1. It is intended to be used for speculative decoding. Autonomous agents for software development are already contributing to a wide range of software development tasks. But up to this point, strong coding agents have relied on proprietary models, which means that even if you use an open-source agent like OpenHands, you are still reliant on API calls to an external service. + + Today, we are excited to introduce OpenHands LM, a new open coding model that: + + Is open and available on Hugging Face, so you can download it and run it locally + Is a reasonable size, 32B, so it can be run locally on hardware such as a single 3090 GPU + Achieves strong performance on software engineering tasks, including 37.2% resolve rate on SWE-Bench Verified + + Read below for more details and our future plans! + What is OpenHands LM? + + OpenHands LM is built on the foundation of Qwen Coder 2.5 Instruct 32B, leveraging its powerful base capabilities for coding tasks. 
What sets OpenHands LM apart is our specialized fine-tuning process: + + We used training data generated by OpenHands itself on a diverse set of open-source repositories + Specifically, we use an RL-based framework outlined in SWE-Gym, where we set up a training environment, generate training data using an existing agent, and then fine-tune the model on examples that were resolved successfully + It features a 128K token context window, ideal for handling large codebases and long-horizon software engineering tasks + overrides: + parameters: + model: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf + files: + - filename: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf + sha256: 30abd7860c4eb5f2f51546389407b0064360862f64ea55cdf95f97c6e155b3c6 + uri: huggingface://bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF/all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.ggu - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From d2cf8ef0708592f633a49b2a14bd224aa3e0687e Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Thu, 3 Apr 2025 15:22:59 +0100 Subject: [PATCH 28/59] fix(sycl): kernel not found error by forcing -fsycl (#5115) * chore(sycl): Update oneapi to 2025:1 Signed-off-by: Richard Palethorpe * fix(sycl): Pass -fsycl flag as workaround -fsycl should be set by llama.cpp's cmake file, but something goes wrong and it doesn't appear to get added Signed-off-by: Richard Palethorpe * fix(build): Speed up llama build by using all CPUs Signed-off-by: Richard Palethorpe --------- Signed-off-by: Richard Palethorpe --- .github/workflows/generate_intel_image.yaml | 2 +- Makefile | 4 ++-- backend/cpp/llama/Makefile | 17 ++++++++++++----- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/generate_intel_image.yaml b/.github/workflows/generate_intel_image.yaml index 8283964c..20ce1d5a 100644 --- a/.github/workflows/generate_intel_image.yaml +++ b/.github/workflows/generate_intel_image.yaml @@ -15,7 +15,7 @@ jobs: strategy: matrix: include: - - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 + - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 runs-on: 'ubuntu-latest' platforms: 'linux/amd64' runs-on: ${{matrix.runs-on}} diff --git a/Makefile b/Makefile index 299a9b1d..3c61d6f4 100644 --- a/Makefile +++ b/Makefile @@ -809,7 +809,7 @@ docker-aio-all: docker-image-intel: docker build \ - --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \ + --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg GO_TAGS="none" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ @@ -817,7 +817,7 @@ docker-image-intel: docker-image-intel-xpu: docker build \ - --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \ + --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg GO_TAGS="none" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile index 17f55003..e36dc7c2 100644 --- a/backend/cpp/llama/Makefile +++ b/backend/cpp/llama/Makefile @@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin) endif ifeq ($(BUILD_TYPE),sycl_f16) - CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" \ + -DGGML_SYCL_F16=ON endif ifeq 
($(BUILD_TYPE),sycl_f32) - CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DCMAKE_CXX_FLAGS="-fsycl" endif llama.cpp: @@ -73,8 +80,8 @@ grpc-server: llama.cpp llama.cpp/examples/grpc-server @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" ifneq (,$(findstring sycl,$(BUILD_TYPE))) +bash -c "source $(ONEAPI_VARS); \ - cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)" + cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc)" else - +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) + +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) -j$(nproc) endif - cp llama.cpp/build/bin/grpc-server . \ No newline at end of file + cp llama.cpp/build/bin/grpc-server . From 6af3f46bc37c2d1e3fc623008bf4dbf56d114111 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 4 Apr 2025 00:59:49 +0200 Subject: [PATCH 29/59] chore: :arrow_up: Update ggml-org/llama.cpp to `c262beddf29f3f3be5bbbf167b56029a19876956` (#5116) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3c61d6f4..d7f8b3e2 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=f01bd02376f919b05ee635f438311be8dfc91d7c +CPPLLAMA_VERSION?=c262beddf29f3f3be5bbbf167b56029a19876956 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 128612a6fc70c37aeb45fd4cdafe2310f65fb2d7 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:21:45 +0200 Subject: [PATCH 30/59] chore(model gallery): add gemma-3-12b-it-qat (#5117) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index feccdb10..804d5651 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -78,6 +78,24 @@ - filename: gemma-3-1b-it-Q4_K_M.gguf sha256: 8ccc5cd1f1b3602548715ae25a66ed73fd5dc68a210412eea643eb20eb75a135 uri: huggingface://ggml-org/gemma-3-1b-it-GGUF/gemma-3-1b-it-Q4_K_M.gguf +- !!merge <<: *gemma3 + name: "gemma-3-12b-it-qat" + urls: + - https://huggingface.co/google/gemma-3-12b-it + - https://huggingface.co/vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf + description: | + This model corresponds to the 12B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization. + + Thanks to QAT, the model is able to preserve similar quality as bfloat16 while significantly reducing the memory requirements to load the model. + + You can find the half-precision version here. 
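A back-of-the-envelope sketch of the memory claim, assuming GGML's standard Q4_0 layout (blocks of 32 weights stored as a float16 scale plus 32 four-bit values, 18 bytes per block) and an approximate 12B parameter count:

package main

import "fmt"

func main() {
	const params = 12e9 // approximate parameter count of the 12B model

	bf16 := params * 2 // bfloat16: 2 bytes per weight
	// Q4_0: 18-byte blocks of 32 weights, i.e. 4.5 bits per weight.
	q40 := params * 18 / 32

	fmt.Printf("bf16 weights: ~%.1f GB\n", bf16/1e9) // ~24.0 GB
	fmt.Printf("Q4_0 weights: ~%.1f GB\n", q40/1e9)  // ~6.8 GB
	// KV cache and activations come on top of these weight-only figures.
}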
+ overrides: + parameters: + model: gemma-3-12b-it-q4_0.gguf + files: + - filename: gemma-3-12b-it-q4_0.gguf + sha256: 6f1bb5f455414f7b46482bda51cbfdbf19786e21a5498c4403fdfc03d09b045c + uri: huggingface://vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf/gemma-3-12b-it-q4_0.gguf - !!merge <<: *gemma3 name: "qgallouedec_gemma-3-27b-it-codeforces-sft" urls: From 31a7084c759cd771594eaee55946e6b09664fc13 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:23:56 +0200 Subject: [PATCH 31/59] chore(model gallery): add gemma-3-4b-it-qat (#5118) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 804d5651..ac5022d0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -96,6 +96,24 @@ - filename: gemma-3-12b-it-q4_0.gguf sha256: 6f1bb5f455414f7b46482bda51cbfdbf19786e21a5498c4403fdfc03d09b045c uri: huggingface://vinimuchulski/gemma-3-12b-it-qat-q4_0-gguf/gemma-3-12b-it-q4_0.gguf +- !!merge <<: *gemma3 + name: "gemma-3-4b-it-qat" + urls: + - https://huggingface.co/google/gemma-3-4b-it + - https://huggingface.co/vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf + description: | + This model corresponds to the 4B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization. + + Thanks to QAT, the model is able to preserve similar quality as bfloat16 while significantly reducing the memory requirements to load the model. + + You can find the half-precision version here. + overrides: + parameters: + model: gemma-3-4b-it-q4_0.gguf + files: + - filename: gemma-3-4b-it-q4_0.gguf + sha256: 2ca493d426ffcb43db27132f183a0230eda4a3621e58b328d55b665f1937a317 + uri: huggingface://vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf/gemma-3-4b-it-q4_0.gguf - !!merge <<: *gemma3 name: "qgallouedec_gemma-3-27b-it-codeforces-sft" urls: From d26e61388b19a1950bfdcdb11d4e78cf01845710 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:27:52 +0200 Subject: [PATCH 32/59] chore(model gallery): add tesslate_synthia-s1-27b (#5119) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index ac5022d0..d02b0778 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -437,6 +437,21 @@ - filename: burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf sha256: 47f0a2848eeed783cb03336afd8cc69f6ee0e088e3cec11ab6d9fe16457dc3d4 uri: huggingface://bartowski/burtenshaw_GemmaCoder3-12B-GGUF/burtenshaw_GemmaCoder3-12B-Q4_K_M.gguf +- !!merge <<: *gemma3 + name: "tesslate_synthia-s1-27b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64d1129297ca59bcf7458d07/zgFDl7UvWhiPYqdote7XT.png + urls: + - https://huggingface.co/Tesslate/Synthia-S1-27b + - https://huggingface.co/bartowski/Tesslate_Synthia-S1-27b-GGUF + description: | + Synthia-S1-27b is a reasoning, AI model developed by Tesslate AI, fine-tuned specifically for advanced reasoning, coding, and RP usecases. Built upon the robust Gemma3 architecture, Synthia-S1-27b excels in logical reasoning, creative writing, and deep contextual understanding. It supports multimodal inputs (text and images) with a large 128K token context window, enabling complex analysis suitable for research, academic tasks, and enterprise-grade AI applications. 
+ overrides: + parameters: + model: Tesslate_Synthia-S1-27b-Q4_K_M.gguf + files: + - filename: Tesslate_Synthia-S1-27b-Q4_K_M.gguf + sha256: d953bf7f802dc68f85a35360deb24b9a8b446af051e82c77f2f0759065d2aa71 + uri: huggingface://bartowski/Tesslate_Synthia-S1-27b-GGUF/Tesslate_Synthia-S1-27b-Q4_K_M.gguf - &eurollm name: "eurollm-9b-instruct" icon: https://openeurollm.eu/_next/static/media/logo-dark.e7001867.svg From 9e6dbb0b5a1be44dcd82800c4bed89a1675ef260 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:29:47 +0200 Subject: [PATCH 33/59] chore(model gallery): add katanemo_arch-function-chat-7b (#5120) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 24 +++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/gallery/index.yaml b/gallery/index.yaml index d02b0778..2dd3d4e2 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5565,7 +5565,29 @@ files: - filename: all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf sha256: 30abd7860c4eb5f2f51546389407b0064360862f64ea55cdf95f97c6e155b3c6 - uri: huggingface://bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF/all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.ggu + uri: huggingface://bartowski/all-hands_openhands-lm-1.5b-v0.1-GGUF/all-hands_openhands-lm-1.5b-v0.1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "katanemo_arch-function-chat-7b" + urls: + - https://huggingface.co/katanemo/Arch-Function-Chat-7B + - https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-7B-GGUF + description: | + The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications. + + In addition to function calling capabilities, this collection now offers: + + Clarify & refine: Generates natural follow-up questions to collect missing information for function calling + Interpret & respond: Provides human-friendly responses based on function execution results + Context management: Maintains context in complex multi-turn interactions + + Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway - an AI-native proxy for agents. For more details about the project, check out the GitHub README.
+ overrides: + parameters: + model: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf + files: + - filename: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf + sha256: 6fd603511076ffea3697c8a76d82c054781c5e11f134b937a66cedfc49b3d2c5 + uri: huggingface://bartowski/katanemo_Arch-Function-Chat-7B-GGUF/katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 0064bec8f5bbced35705db6f90ca31ae6f748bf3 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:31:44 +0200 Subject: [PATCH 34/59] chore(model gallery): add katanemo_arch-function-chat-1.5b (#5121) --- gallery/index.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 2dd3d4e2..17e9b070 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5588,6 +5588,28 @@ - filename: katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf sha256: 6fd603511076ffea3697c8a76d82c054781c5e11f134b937a66cedfc49b3d2c5 uri: huggingface://bartowski/katanemo_Arch-Function-Chat-7B-GGUF/katanemo_Arch-Function-Chat-7B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "katanemo_arch-function-chat-1.5b" + urls: + - https://huggingface.co/katanemo/Arch-Function-Chat-1.5B + - https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF + description: | + The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications. + + In addition to function calling capabilities, this collection now offers: + + Clarify & refine: Generates natural follow-up questions to collect missing information for function calling + Interpret & respond: Provides human-friendly responses based on function execution results + Context management: Maintains context in complex multi-turn interactions + + Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway - an AI-native proxy for agents. For more details about the project, check out the GitHub README.
+ overrides: + parameters: + model: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf + files: + - filename: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf + sha256: 5bfcb72803745c374a90b0ceb60f347a8c7d1239960cce6a2d22cc1276236098 + uri: huggingface://bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF/katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 106e40845f2afa75d8d49366aae383924a850bb5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 4 Apr 2025 10:45:44 +0200 Subject: [PATCH 35/59] chore(model gallery): add katanemo_arch-function-chat-3b (#5122) --- gallery/index.yaml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 17e9b070..464eed52 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5610,6 +5610,28 @@ - filename: katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf sha256: 5bfcb72803745c374a90b0ceb60f347a8c7d1239960cce6a2d22cc1276236098 uri: huggingface://bartowski/katanemo_Arch-Function-Chat-1.5B-GGUF/katanemo_Arch-Function-Chat-1.5B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "katanemo_arch-function-chat-3b" + urls: + - https://huggingface.co/katanemo/Arch-Function-Chat-3B + - https://huggingface.co/bartowski/katanemo_Arch-Function-Chat-3B-GGUF + description: | + The Arch-Function-Chat collection builds upon Katanemo's Arch-Function collection by extending its capabilities beyond function calling. This new collection maintains the state-of-the-art (SOTA) function calling performance of the original collection while adding powerful new features that make it even more versatile in real-world applications. + + In addition to function calling capabilities, this collection now offers: + + Clarify & refine: Generates natural follow-up questions to collect missing information for function calling + Interpret & respond: Provides human-friendly responses based on function execution results + Context management: Maintains context in complex multi-turn interactions + + Note: Arch-Function-Chat is now the primary LLM used in the open source Arch Gateway - an AI-native proxy for agents. For more details about the project, check out the GitHub README.
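All three Arch-Function-Chat entries center on function calling, which LocalAI exposes through the OpenAI-compatible tools field on /v1/chat/completions. A minimal request sketch in Go; the localhost:8080 address assumes a default local instance, and get_weather is a made-up tool schema used only for illustration:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumes one of the Arch-Function-Chat models is installed under
	// its gallery name on a local LocalAI instance.
	body := []byte(`{
		"model": "katanemo_arch-function-chat-3b",
		"messages": [{"role": "user", "content": "What is the weather in Rome?"}],
		"tools": [{
			"type": "function",
			"function": {
				"name": "get_weather",
				"parameters": {
					"type": "object",
					"properties": {"city": {"type": "string"}},
					"required": ["city"]
				}
			}
		}]
	}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // expect a tool_calls entry naming get_weather
}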
+ overrides: + parameters: + model: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf + files: + - filename: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf + sha256: f59dbef397bf1364b5f0a2c23a7f67c40ec63cc666036c4e7615fa7d79d4e1a0 + uri: huggingface://bartowski/katanemo_Arch-Function-Chat-3B-GGUF/katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From b88a7a4550ccc292951c73f77793a41d522378a7 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 4 Apr 2025 23:49:53 +0200 Subject: [PATCH 36/59] chore: :arrow_up: Update ggml-org/llama.cpp to `3e1d29348b5d77269f6931500dd1c1a729d429c8` (#5123) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d7f8b3e2..220930b6 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=c262beddf29f3f3be5bbbf167b56029a19876956 +CPPLLAMA_VERSION?=3e1d29348b5d77269f6931500dd1c1a729d429c8 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 901dba60635db363537aeadfd297f54f3892facb Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 5 Apr 2025 08:46:49 +0200 Subject: [PATCH 37/59] chore(model gallery): add gemma-3-27b-it-qat (#5124) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 464eed52..80fc5755 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -114,6 +114,24 @@ - filename: gemma-3-4b-it-q4_0.gguf sha256: 2ca493d426ffcb43db27132f183a0230eda4a3621e58b328d55b665f1937a317 uri: huggingface://vinimuchulski/gemma-3-4b-it-qat-q4_0-gguf/gemma-3-4b-it-q4_0.gguf +- !!merge <<: *gemma3 + name: "gemma-3-27b-it-qat" + urls: + - https://huggingface.co/google/gemma-3-27b-it + - https://huggingface.co/vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf + description: | + This model corresponds to the 27B instruction-tuned version of the Gemma 3 model in GGUF format using Quantization Aware Training (QAT). The GGUF corresponds to Q4_0 quantization. + + Thanks to QAT, the model is able to preserve similar quality as bfloat16 while significantly reducing the memory requirements to load the model. + + You can find the half-precision version here. 
+ overrides: + parameters: + model: gemma-3-27b-it-q4_0.gguf + files: + - filename: gemma-3-27b-it-q4_0.gguf + sha256: 45e586879bc5f5d7a5b6527e812952057ce916d9fc7ba16f7262ec9972c9e2a2 + uri: huggingface://vinimuchulski/gemma-3-27b-it-qat-q4_0-gguf/gemma-3-27b-it-q4_0.gguf - !!merge <<: *gemma3 name: "qgallouedec_gemma-3-27b-it-codeforces-sft" urls: From fc73b2b4307ff604b3583ae51368e5b90f2897fb Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 6 Apr 2025 10:48:21 +0200 Subject: [PATCH 38/59] chore(model gallery): add open-thoughts_openthinker2-32b (#5128) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 80fc5755..9bea9593 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5650,6 +5650,23 @@ - filename: katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf sha256: f59dbef397bf1364b5f0a2c23a7f67c40ec63cc666036c4e7615fa7d79d4e1a0 uri: huggingface://bartowski/katanemo_Arch-Function-Chat-3B-GGUF/katanemo_Arch-Function-Chat-3B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "open-thoughts_openthinker2-32b" + icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png + urls: + - https://huggingface.co/open-thoughts/OpenThinker2-32B + - https://huggingface.co/bartowski/open-thoughts_OpenThinker2-32B-GGUF + description: | + This model is a fine-tuned version of Qwen/Qwen2.5-32B-Instruct on the OpenThoughts2-1M dataset. + + The OpenThinker2-32B model is the highest performing open-data model. This model improves upon our previous OpenThinker-32B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy. + overrides: + parameters: + model: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf + files: + - filename: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf + sha256: e9c7bf7cb349cfe07b4550759a3b4d7005834d0fa7580b23e483cbfeecd7a982 + uri: huggingface://bartowski/open-thoughts_OpenThinker2-32B-GGUF/open-thoughts_OpenThinker2-32B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 3b8bc7e64c6daa767f607305f9467d6369189dbe Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 6 Apr 2025 10:53:22 +0200 Subject: [PATCH 39/59] chore(model gallery): add open-thoughts_openthinker2-7b (#5129) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 9bea9593..36ca62d0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5667,6 +5667,23 @@ - filename: open-thoughts_OpenThinker2-32B-Q4_K_M.gguf sha256: e9c7bf7cb349cfe07b4550759a3b4d7005834d0fa7580b23e483cbfeecd7a982 uri: huggingface://bartowski/open-thoughts_OpenThinker2-32B-GGUF/open-thoughts_OpenThinker2-32B-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "open-thoughts_openthinker2-7b" + icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png + urls: + - https://huggingface.co/open-thoughts/OpenThinker2-7B + - https://huggingface.co/bartowski/open-thoughts_OpenThinker2-7B-GGUF + description: | + This model is a fine-tuned version of Qwen/Qwen2.5-7B-Instruct on the OpenThoughts2-1M dataset.
+ + The OpenThinker2-7B model is the top 7B open-data reasoning model. It delivers performance comparable to state of the art 7B models like DeepSeek-R1-Distill-7B across a suite of tasks. This model improves upon our previous OpenThinker-7B model, which was trained on 114k examples from OpenThoughts-114k. The numbers reported in the table below are evaluated with our open-source tool Evalchemy. + overrides: + parameters: + model: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf + files: + - filename: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf + sha256: 481d785047d66ae2eeaf14650a9e659ec4f7766a6414b6c7e92854c944201734 + uri: huggingface://bartowski/open-thoughts_OpenThinker2-7B-GGUF/open-thoughts_OpenThinker2-7B-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From ece239966f0a6029dfbf0eb75f55d3cda7380467 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 6 Apr 2025 14:01:51 +0200 Subject: [PATCH 40/59] chore: :arrow_up: Update ggml-org/llama.cpp to `6bf28f0111ff9f21b3c1b1eace20c590281e7ba6` (#5127) Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/CMakeLists.txt | 2 +- backend/cpp/llama/grpc-server.cpp | 2 +- backend/cpp/llama/prepare.sh | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 220930b6..41ce34e7 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=3e1d29348b5d77269f6931500dd1c1a729d429c8 +CPPLLAMA_VERSION?=6bf28f0111ff9f21b3c1b1eace20c590281e7ba6 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/CMakeLists.txt b/backend/cpp/llama/CMakeLists.txt index 031e4964..2cd5ffd7 100644 --- a/backend/cpp/llama/CMakeLists.txt +++ b/backend/cpp/llama/CMakeLists.txt @@ -2,7 +2,7 @@ ## XXX: In some versions of CMake clip wasn't being built before llama. ## This is an hack for now, but it should be fixed in the future. set(TARGET myclip) -add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h) +add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h) install(TARGETS ${TARGET} LIBRARY) target_include_directories(myclip PUBLIC .) target_include_directories(myclip PUBLIC ../..) diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index ded46b1c..18dfdc64 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -514,7 +514,7 @@ struct llama_server_context LOG_INFO("Multi Modal Mode Enabled", {}); clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params { /* use_gpu */ has_gpu, - /*verbosity=*/ 1, + /*verbosity=*/ GGML_LOG_LEVEL_INFO, }); if(clp_ctx == nullptr) { LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str()); diff --git a/backend/cpp/llama/prepare.sh b/backend/cpp/llama/prepare.sh index 4c8393b9..eabd93c5 100644 --- a/backend/cpp/llama/prepare.sh +++ b/backend/cpp/llama/prepare.sh @@ -21,6 +21,7 @@ fi ## XXX: In some versions of CMake clip wasn't being built before llama. ## This is an hack for now, but it should be fixed in the future. 
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h +cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h From 5018452be7f4e4cddbff1571b994f2e96aed63ae Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sun, 6 Apr 2025 23:51:51 +0200 Subject: [PATCH 41/59] chore: :arrow_up: Update ggml-org/llama.cpp to `916c83bfe7f8b08ada609c3b8e583cf5301e594b` (#5130) :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 41ce34e7..66532c43 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=6bf28f0111ff9f21b3c1b1eace20c590281e7ba6 +CPPLLAMA_VERSION?=916c83bfe7f8b08ada609c3b8e583cf5301e594b # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp From 710f624ecd7f86419359451f42871a5f7c7b5d7c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 7 Apr 2025 18:03:25 +0200 Subject: [PATCH 42/59] fix(webui): improve model display, do not block view (#5133) Signed-off-by: Ettore Di Giacinto --- core/http/elements/gallery.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/core/http/elements/gallery.go b/core/http/elements/gallery.go index 539627e4..589604cd 100644 --- a/core/http/elements/gallery.go +++ b/core/http/elements/gallery.go @@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node { "id": modalName(m), "tabindex": "-1", "aria-hidden": "true", - "class": "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full", + "class": "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50", }, elem.Div( attrs.Props{ - "class": "relative p-4 w-full max-w-2xl max-h-full", + "class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]", }, elem.Div( attrs.Props{ - "class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700", + "class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col", }, // header elem.Div( @@ -164,14 +164,13 @@ func modelModal(m *gallery.GalleryModel) elem.Node { // body elem.Div( attrs.Props{ - "class": "p-4 md:p-5 space-y-4", + "class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0", }, elem.Div( attrs.Props{ "class": "flex justify-center items-center", }, elem.Img(attrs.Props{ - // "class": "rounded-t-lg object-fit object-center h-96", "class": "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded", "src": m.Icon, "loading": "lazy", @@ -232,7 +231,6 @@ func modelModal(m *gallery.GalleryModel) elem.Node { ), ), ) - } func modelDescription(m *gallery.GalleryModel) elem.Node { From a6f0bb410fdee8060b00f1fc92ac63e5173c2c15 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 21:09:45 +0000 Subject: [PATCH 43/59] chore(deps): bump securego/gosec from 2.22.0 to 2.22.3 
(#5134)

Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.0 to 2.22.3.
- [Release notes](https://github.com/securego/gosec/releases)
- [Changelog](https://github.com/securego/gosec/blob/master/.goreleaser.yml)
- [Commits](https://github.com/securego/gosec/compare/v2.22.0...v2.22.3)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/secscan.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml
index 228ac1d9..2122fa76 100644
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@v2.22.3
         with:
           # we let the report trigger content trigger a failure using the GitHub Security features.
           args: '-no-fail -fmt sarif -out results.sarif ./...'

From 547d322b28f07e1b47d2b8cbe9fc1c682fa0bdcf Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 8 Apr 2025 09:40:26 +0200
Subject: [PATCH 44/59] chore(model gallery): add arliai_qwq-32b-arliai-rpr-v1
 (#5137)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 25 +++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 36ca62d0..063ad862 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -5684,6 +5684,31 @@
       - filename: open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
         sha256: 481d785047d66ae2eeaf14650a9e659ec4f7766a6414b6c7e92854c944201734
         uri: huggingface://bartowski/open-thoughts_OpenThinker2-7B-GGUF/open-thoughts_OpenThinker2-7B-Q4_K_M.gguf
+- !!merge <<: *qwen25
+  name: "arliai_qwq-32b-arliai-rpr-v1"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/6625f4a8a8d1362ebcc3851a/albSlnUy9dPVGVuLlsBua.jpeg
+  urls:
+    - https://huggingface.co/ArliAI/QwQ-32B-ArliAI-RpR-v1
+    - https://huggingface.co/bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF
+  description: |
+    RpR (RolePlay with Reasoning) is a new series of models from ArliAI. This series builds directly upon the successful dataset curation methodology and training methods developed for the RPMax series.
+
+    RpR models use the same curated, deduplicated RP and creative writing dataset used for RPMax, with a focus on variety to ensure high creativity and minimize cross-context repetition. Users familiar with RPMax will recognize the unique, non-repetitive writing style, unlike other finetuned-for-RP models.
+
+    With the release of QwQ as the first high-performing open-source reasoning model that can be easily trained, it was clear that the available instruct and creative-writing reasoning datasets contain only one response per example. Training reasoning models on this kind of single-response data degrades output quality in long multi-turn chats, which is why Arli AI decided to create a real RP model capable of long multi-turn chat with reasoning.
+
+    In order to create RpR, we first had to create the reasoning RP dataset by re-processing our existing known-good RPMax dataset into a reasoning dataset.
This was possible by using the base QwQ Instruct model itself to create the reasoning process for every turn in the RPMax dataset conversation examples, which was then further refined to make sure the reasoning is in line with the actual response examples from the dataset.
+
+    Another important thing to get right is to make sure the model is trained on examples that present reasoning blocks in the same way it encounters them during inference, that is, without ever seeing prior reasoning blocks in its context. To achieve this, the training run was completed using axolotl with a manual, template-free segments dataset, so that the model is never trained to see the reasoning block in the context, just as it will be used at inference time.
+
+    The result of training QwQ on this dataset with this method is consistently coherent and interesting output, even in long multi-turn RP chats. This is, as far as we know, the first reasoning model correctly trained for RP and creative writing.
+  overrides:
+    parameters:
+      model: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
+  files:
+    - filename: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
+      sha256: b0f2ca8f62a5d021e20db40608a109713e9d23e75b68b3b71b7654c04d596dcf
+      uri: huggingface://bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF/ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf
 - &llama31
   url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
   icon: https://avatars.githubusercontent.com/u/153379578

From c09d227647ab12912d432509597c274e21f2d23e Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 8 Apr 2025 09:42:49 +0200
Subject: [PATCH 45/59] chore(model gallery): add watt-ai_watt-tool-70b (#5138)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 25 +++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 063ad862..4d52b63c 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1466,6 +1466,31 @@
       - filename: Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
         sha256: a5f5e712e66b855f36ff45175f20c24441fa942ca8af47bd6f49107c6e0f025d
         uri: huggingface://mradermacher/Forgotten-Abomination-70B-v5.0-GGUF/Forgotten-Abomination-70B-v5.0.Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "watt-ai_watt-tool-70b"
+  urls:
+    - https://huggingface.co/watt-ai/watt-tool-70B
+    - https://huggingface.co/bartowski/watt-ai_watt-tool-70B-GGUF
+  description: |
+    watt-tool-70B is a fine-tuned language model based on LLaMa-3.3-70B-Instruct, optimized for tool usage and multi-turn dialogue. It achieves state-of-the-art performance on the Berkeley Function-Calling Leaderboard (BFCL).
+    Model Description
+
+    This model is specifically designed to excel at complex tool usage scenarios that require multi-turn interactions, making it ideal for empowering platforms like Lupan, an AI-powered workflow building tool. By leveraging a carefully curated and optimized dataset, watt-tool-70B demonstrates superior capabilities in understanding user requests, selecting appropriate tools, and effectively utilizing them across multiple turns of conversation.
+
+    Target Application: AI Workflow Building as in https://lupan.watt.chat/ and Coze.
+    Key Features
+
+      Enhanced Tool Usage: Fine-tuned for precise and efficient tool selection and execution.
+      Multi-Turn Dialogue: Optimized for maintaining context and effectively utilizing tools across multiple turns of conversation, enabling more complex task completion.
+      State-of-the-Art Performance: Achieves top performance on the BFCL, demonstrating its capabilities in function calling and tool usage.
+      Based on LLaMa-3.3-70B-Instruct: Inherits the strong language understanding and generation capabilities of the base model.
+  overrides:
+    parameters:
+      model: watt-ai_watt-tool-70B-Q4_K_M.gguf
+  files:
+    - filename: watt-ai_watt-tool-70B-Q4_K_M.gguf
+      sha256: 93806a5482b9e40e50ffca7a72abe3414d384749cc9e3d378eab5db8a8154b18
+      uri: huggingface://bartowski/watt-ai_watt-tool-70B-GGUF/watt-ai_watt-tool-70B-Q4_K_M.gguf
 - &rwkv
   url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
   name: "rwkv-6-world-7b"

From 59c37e67b21d3c3aa815fdf8109ead2658c2b936 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 8 Apr 2025 09:56:29 +0200
Subject: [PATCH 46/59] chore(model gallery): add eurydice-24b-v2-i1 (#5139)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 4d52b63c..0cef7459 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -9664,6 +9664,21 @@
       - filename: BlackSheep-24B.i1-Q4_K_M.gguf
         sha256: 95ae096eca05a95591254babf81b4d5617ceebbe8eda04c6cf8968ef4a69fc80
         uri: huggingface://mradermacher/BlackSheep-24B-i1-GGUF/BlackSheep-24B.i1-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "eurydice-24b-v2-i1"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/652c2a63d78452c4742cd3d3/Hm_tg4s0D6yWmtrTHII32.png
+  urls:
+    - https://huggingface.co/aixonlab/Eurydice-24b-v2
+    - https://huggingface.co/mradermacher/Eurydice-24b-v2-i1-GGUF
+  description: |
+    Eurydice 24b v2 is designed to be the perfect companion for multi-role conversations. It demonstrates exceptional contextual understanding and excels in creativity, natural conversation and storytelling. Built on Mistral 3.1, this model has been trained on a custom dataset specifically crafted to enhance its capabilities.
+ overrides: + parameters: + model: Eurydice-24b-v2.i1-Q4_K_M.gguf + files: + - filename: Eurydice-24b-v2.i1-Q4_K_M.gguf + sha256: fb4104a1b33dd860e1eca3b6906a10cacc5b91a2534db72d9749652a204fbcbf + uri: huggingface://mradermacher/Eurydice-24b-v2-i1-GGUF/Eurydice-24b-v2.i1-Q4_K_M.gguf - &mudler url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models name: "LocalAI-llama3-8b-function-call-v0.2" From 7387932f898b6780b10340b928052c4b984c479c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 8 Apr 2025 10:01:24 +0200 Subject: [PATCH 47/59] chore(model gallery): add mensa-beta-14b-instruct-i1 (#5140) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 0cef7459..22df7506 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -5734,6 +5734,21 @@ - filename: ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf sha256: b0f2ca8f62a5d021e20db40608a109713e9d23e75b68b3b71b7654c04d596dcf uri: huggingface://bartowski/ArliAI_QwQ-32B-ArliAI-RpR-v1-GGUF/ArliAI_QwQ-32B-ArliAI-RpR-v1-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "mensa-beta-14b-instruct-i1" + icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/DyO5Fvqwvee-UM9QqgWZS.png + urls: + - https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct + - https://huggingface.co/mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF + description: | + weighted/imatrix quants of https://huggingface.co/prithivMLmods/Mensa-Beta-14B-Instruct + overrides: + parameters: + model: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf + files: + - filename: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf + sha256: 86ccd640d72dcf3129fdd5b94381a733a684672b22487784e388b2ee9de57760 + uri: huggingface://mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF/Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From 4fbd6609f277a5c959e5110b823e663b5a875cee Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 8 Apr 2025 10:12:28 +0200 Subject: [PATCH 48/59] chore(model gallery): add meta-llama_llama-4-scout-17b-16e-instruct (#5141) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 22df7506..96caade1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -470,6 +470,31 @@ - filename: Tesslate_Synthia-S1-27b-Q4_K_M.gguf sha256: d953bf7f802dc68f85a35360deb24b9a8b446af051e82c77f2f0759065d2aa71 uri: huggingface://bartowski/Tesslate_Synthia-S1-27b-GGUF/Tesslate_Synthia-S1-27b-Q4_K_M.gguf +- &llama4 + url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" + icon: https://avatars.githubusercontent.com/u/153379578 + license: llama4 + tags: + - llm + - gguf + - gpu + - cpu + - llama3.3 + name: "meta-llama_llama-4-scout-17b-16e-instruct" + urls: + - https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct + - https://huggingface.co/bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF + description: | + The Llama 4 collection of models are natively multimodal AI models that enable text and multimodal experiences. These models leverage a mixture-of-experts architecture to offer industry-leading performance in text and image understanding. + + These Llama 4 models mark the beginning of a new era for the Llama ecosystem. 
We are launching two efficient models in the Llama 4 series, Llama 4 Scout, a 17 billion parameter model with 16 experts, and Llama 4 Maverick, a 17 billion parameter model with 128 experts. + overrides: + parameters: + model: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf + files: + - filename: meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf + sha256: 48dfc18d40691b4190b7fecf1f89b78cadc758c3a27a9e2a1cabd686fdb822e3 + uri: huggingface://bartowski/meta-llama_Llama-4-Scout-17B-16E-Instruct-GGUF/meta-llama_Llama-4-Scout-17B-16E-Instruct-Q3_K_S.gguf - &eurollm name: "eurollm-9b-instruct" icon: https://openeurollm.eu/_next/static/media/logo-dark.e7001867.svg From b4df1c9cf3488a395365ff4be2ac68e650556587 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 8 Apr 2025 10:12:42 +0200 Subject: [PATCH 49/59] fix(gemma): improve prompt for tool calls (#5142) Signed-off-by: Ettore Di Giacinto --- gallery/gemma.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gallery/gemma.yaml b/gallery/gemma.yaml index 812e254a..ed69795f 100644 --- a/gallery/gemma.yaml +++ b/gallery/gemma.yaml @@ -8,9 +8,7 @@ config_file: | chat_message: |- {{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}} {{ if .FunctionCall -}} - Function call: {{ else if eq .RoleName "tool" -}} - Function response: {{ end -}} {{ if .Content -}} {{.Content -}} @@ -25,11 +23,14 @@ config_file: | {{.Input}} function: | system - You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: + You have access to functions. If you decide to invoke any of the function(s), + you MUST put it in the format of + {"name": function name, "parameters": dictionary of argument name and its value} + + You SHOULD NOT include any other text in the response if you call a function {{range .Functions}} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {{end}} - For each function call return a json object with function name and arguments {{.Input -}} model From 25e6f21322983b2b33206900eae55141fcf4fbe2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 8 Apr 2025 11:26:06 +0200 Subject: [PATCH 50/59] chore(deps): bump llama.cpp to `4ccea213bc629c4eef7b520f7f6c59ce9bbdaca0` (#5143) Signed-off-by: Ettore Di Giacinto --- Makefile | 2 +- backend/cpp/llama/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 66532c43..be58e0c4 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ BINARY_NAME=local-ai DETECT_LIBS?=true # llama.cpp versions -CPPLLAMA_VERSION?=916c83bfe7f8b08ada609c3b8e583cf5301e594b +CPPLLAMA_VERSION?=4ccea213bc629c4eef7b520f7f6c59ce9bbdaca0 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/Makefile b/backend/cpp/llama/Makefile index e36dc7c2..24db9356 100644 --- a/backend/cpp/llama/Makefile +++ b/backend/cpp/llama/Makefile @@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh TARGET?=--target grpc-server # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static -CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF +CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically ifeq ($(BUILD_TYPE),cublas) From 
081be3ba7dfd8d59b490c312c3426084a4a0994d Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 8 Apr 2025 22:04:14 +0200
Subject: [PATCH 51/59] chore(model gallery): add cogito-v1-preview-qwen-14b
 (#5145)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 96caade1..4fa84b1d 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -5774,6 +5774,26 @@
       - filename: Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
         sha256: 86ccd640d72dcf3129fdd5b94381a733a684672b22487784e388b2ee9de57760
         uri: huggingface://mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF/Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
+- !!merge <<: *qwen25
+  name: "mensa-beta-14b-instruct-i1"
+  icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B/resolve/main/images/deep-cogito-logo.png
+  urls:
+    - https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B
+    - https://huggingface.co/NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF
+  description: |
+    The Cogito LLMs are instruction-tuned generative models (text in/text out). All models are released under an open license for commercial use.
+    Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+    The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+    The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool-calling capabilities than size-equivalent counterparts.
+    In both standard and reasoning modes, Cogito v1-preview models outperform their size-equivalent counterparts on common industry benchmarks.
+    Each model is trained in over 30 languages and supports a context length of 128k.
+  overrides:
+    parameters:
+      model: cogito-v1-preview-qwen-14b-q4_k_m.gguf
+  files:
+    - filename: cogito-v1-preview-qwen-14b-q4_k_m.gguf
+      sha256: 42ddd667bac3e5f0989f52b3dca5767ed15d0e5077c6f537e4b3873862ff7096
+      uri: huggingface://NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF/cogito-v1-preview-qwen-14b-q4_k_m.gguf
 - &llama31
   url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
   icon: https://avatars.githubusercontent.com/u/153379578

From 2bab9b5fe23fd176fe6c3fcd443fcb98e9a47466 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 8 Apr 2025 22:15:32 +0200
Subject: [PATCH 52/59] fix: fix gallery name for cogito-v1-preview-qwen-14B

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 4fa84b1d..377a8d03 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -5775,7 +5775,7 @@
         sha256: 86ccd640d72dcf3129fdd5b94381a733a684672b22487784e388b2ee9de57760
         uri: huggingface://mradermacher/Mensa-Beta-14B-Instruct-i1-GGUF/Mensa-Beta-14B-Instruct.i1-Q4_K_M.gguf
 - !!merge <<: *qwen25
-  name: "mensa-beta-14b-instruct-i1"
+  name: "cogito-v1-preview-qwen-14B"
   icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B/resolve/main/images/deep-cogito-logo.png
   urls:
     - https://huggingface.co/deepcogito/cogito-v1-preview-qwen-14B

From a7317d23bf469e58f15a6b702ee200f918691b4d Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 9 Apr 2025 10:02:09 +0200
Subject: [PATCH 53/59] chore(model gallery): add deepcogito_cogito-v1-preview-llama-8b
 (#5147)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 377a8d03..573df8bf 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -7932,6 +7932,27 @@
       - filename: TextSynth-8B.i1-Q4_K_M.gguf
         sha256: 9186a8cb3a797cd2cd5b2eeaee99808674d96731824a9ee45685bbf480ba56c3
         uri: huggingface://mradermacher/TextSynth-8B-i1-GGUF/TextSynth-8B.i1-Q4_K_M.gguf
+- !!merge <<: *llama31
+  name: "deepcogito_cogito-v1-preview-llama-8b"
+  icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B/resolve/main/images/deep-cogito-logo.png
+  urls:
+    - https://huggingface.co/deepcogito/cogito-v1-preview-llama-8B
+    - https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF
+  description: |
+    The Cogito LLMs are instruction-tuned generative models (text in/text out). All models are released under an open license for commercial use.
+
+    Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+    The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+    The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool-calling capabilities than size-equivalent counterparts.
+    In both standard and reasoning modes, Cogito v1-preview models outperform their size-equivalent counterparts on common industry benchmarks.
+    Each model is trained in over 30 languages and supports a context length of 128k.
+  overrides:
+    parameters:
+      model: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
+  files:
+    - filename: deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
+      sha256: 445173fb1dacef3fa0be49ebb4512b948fdb1434d86732de198424695b017b50
+      uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF/deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf
 - !!merge <<: *llama33
   name: "llama-3.3-magicalgirl-2.5-i1"
   icon: https://cdn-uploads.huggingface.co/production/uploads/633e85093a17ab61de8d9073/FGK0qBGmELj6DEUxbbrdR.png

From 5a8a2adb440731874f9f0f2cc254de4e1b9ade1f Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Wed, 9 Apr 2025 15:39:04 +0200
Subject: [PATCH 54/59] chore: :arrow_up: Update ggml-org/llama.cpp to
 `b32efad2bc42460637c3a364c9554ea8217b3d7f` (#5146)

:arrow_up: Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index be58e0c4..40c47320 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=4ccea213bc629c4eef7b520f7f6c59ce9bbdaca0
+CPPLLAMA_VERSION?=b32efad2bc42460637c3a364c9554ea8217b3d7f

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp

From 673e59e76c52edfa7d53ec71c78db8473560f3da Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 9 Apr 2025 16:42:53 +0200
Subject: [PATCH 55/59] chore(model gallery): add deepcogito_cogito-v1-preview-llama-3b
 (#5148)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 573df8bf..f194d1f1 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -2629,6 +2629,27 @@
       - filename: Eximius_Persona_5B.Q4_K_M.gguf
         sha256: 8a8e7a0fa1068755322c51900e53423d795e57976b4d95982242cbec41141c7b
         uri: huggingface://mradermacher/Eximius_Persona_5B-GGUF/Eximius_Persona_5B.Q4_K_M.gguf
+- !!merge <<: *llama32
+  name: "deepcogito_cogito-v1-preview-llama-3b"
+  icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B/resolve/main/images/deep-cogito-logo.png
+  urls:
+    - https://huggingface.co/deepcogito/cogito-v1-preview-llama-3B
+    - https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF
+  description: |
+    The Cogito LLMs are instruction-tuned generative models (text in/text out). All models are released under an open license for commercial use.
+
+    Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+    The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+    The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool-calling capabilities than size-equivalent counterparts.
+    In both standard and reasoning modes, Cogito v1-preview models outperform their size-equivalent counterparts on common industry benchmarks.
+    Each model is trained in over 30 languages and supports a context length of 128k.
+  overrides:
+    parameters:
+      model: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
+  files:
+    - filename: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
+      sha256: 726a0ef5f818b8d238f2844f3204848bea66fb9c172b8ae0f6dc51b7bc081dd5
+      uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF/deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
 - &qwen25
   name: "qwen2.5-14b-instruct" ## Qwen2.5
   icon: https://avatars.githubusercontent.com/u/141221163

From 270f0e21575d31d939822fd87a1390e68ae846bb Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 9 Apr 2025 16:48:15 +0200
Subject: [PATCH 56/59] chore(model gallery): add deepcogito_cogito-v1-preview-qwen-32b
 (#5149)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index f194d1f1..23364fb1 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -5815,6 +5815,27 @@
       - filename: cogito-v1-preview-qwen-14b-q4_k_m.gguf
         sha256: 42ddd667bac3e5f0989f52b3dca5767ed15d0e5077c6f537e4b3873862ff7096
         uri: huggingface://NikolayKozloff/cogito-v1-preview-qwen-14B-Q4_K_M-GGUF/cogito-v1-preview-qwen-14b-q4_k_m.gguf
+- !!merge <<: *qwen25
+  name: "deepcogito_cogito-v1-preview-qwen-32b"
+  icon: https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B/resolve/main/images/deep-cogito-logo.png
+  urls:
+    - https://huggingface.co/deepcogito/cogito-v1-preview-qwen-32B
+    - https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF
+  description: |
+    The Cogito LLMs are instruction-tuned generative models (text in/text out). All models are released under an open license for commercial use.
+
+    Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+    The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+    The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool-calling capabilities than size-equivalent counterparts.
+    In both standard and reasoning modes, Cogito v1-preview models outperform their size-equivalent counterparts on common industry benchmarks.
+    Each model is trained in over 30 languages and supports a context length of 128k.
+  overrides:
+    parameters:
+      model: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
+  files:
+    - filename: deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
+      sha256: 985f2d49330090e64603309f7eb61030769f25a5da027ac0b0a740858d087ad8
+      uri: huggingface://bartowski/deepcogito_cogito-v1-preview-qwen-32B-GGUF/deepcogito_cogito-v1-preview-qwen-32B-Q4_K_M.gguf
 - &llama31
   url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
   icon: https://avatars.githubusercontent.com/u/153379578

From 281e818047f3ae50b8218e59290fad7111512e55 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 9 Apr 2025 16:53:28 +0200
Subject: [PATCH 57/59] chore(model gallery): add deepcogito_cogito-v1-preview-llama-70b
 (#5150)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 23364fb1..417178eb 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1516,6 +1516,27 @@
       - filename: watt-ai_watt-tool-70B-Q4_K_M.gguf
         sha256: 93806a5482b9e40e50ffca7a72abe3414d384749cc9e3d378eab5db8a8154b18
         uri: huggingface://bartowski/watt-ai_watt-tool-70B-GGUF/watt-ai_watt-tool-70B-Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "deepcogito_cogito-v1-preview-llama-70b"
+  icon: https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B/resolve/main/images/deep-cogito-logo.png
+  urls:
+    - https://huggingface.co/deepcogito/cogito-v1-preview-llama-70B
+    - https://huggingface.co/bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF
+  description: |
+    The Cogito LLMs are instruction-tuned generative models (text in/text out). All models are released under an open license for commercial use.
+
+    Cogito models are hybrid reasoning models. Each model can answer directly (standard LLM), or self-reflect before answering (like reasoning models).
+    The LLMs are trained using Iterated Distillation and Amplification (IDA) - a scalable and efficient alignment strategy for superintelligence using iterative self-improvement.
+    The models have been optimized for coding, STEM, instruction following and general helpfulness, and have significantly higher multilingual, coding and tool-calling capabilities than size-equivalent counterparts.
+    In both standard and reasoning modes, Cogito v1-preview models outperform their size-equivalent counterparts on common industry benchmarks.
+    Each model is trained in over 30 languages and supports a context length of 128k.
+  overrides:
+    parameters:
+      model: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
+  files:
+    - filename: deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
+      sha256: d1deaf80c649e2a9446463cf5e1f7c026583647f46e3940d2b405a57cc685225
+      uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-70B-GGUF/deepcogito_cogito-v1-preview-llama-70B-Q4_K_M.gguf
 - &rwkv
   url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
   name: "rwkv-6-world-7b"

From a69e30e0c96313e5d57ec3c4e7507a086182eb7d Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 9 Apr 2025 16:55:47 +0200
Subject: [PATCH 59/59] chore(model gallery): add agentica-org_deepcoder-14b-preview
 (#5151)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 417178eb..4d307856 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -8494,6 +8494,20 @@
       - filename: Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
         sha256: aed6bd5bb03b7bd886939237bc10ea6331d4feb5a3b6712e0c5474a778acf817
         uri: huggingface://mradermacher/Fallen-Safeword-70B-R1-v4.1-GGUF/Fallen-Safeword-70B-R1-v4.1.Q4_K_M.gguf
+- !!merge <<: *deepseek-r1
+  name: "agentica-org_deepcoder-14b-preview"
+  urls:
+    - https://huggingface.co/agentica-org/DeepCoder-14B-Preview
+    - https://huggingface.co/bartowski/agentica-org_DeepCoder-14B-Preview-GGUF
+  description: |
+    DeepCoder-14B-Preview is a code reasoning LLM fine-tuned from DeepSeek-R1-Distilled-Qwen-14B using distributed reinforcement learning (RL) to scale up to long context lengths. The model achieves 60.6% Pass@1 accuracy on LiveCodeBench v5 (8/1/24-2/1/25), representing an 8% improvement over the base model (53%) and achieving performance similar to OpenAI's o3-mini with just 14B parameters.
+  overrides:
+    parameters:
+      model: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
+  files:
+    - filename: agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
+      sha256: 38f0f777de3116ca27d10ec84388b3290a1bf3f7db8c5bdc1f92d100e4231870
+      uri: huggingface://bartowski/agentica-org_DeepCoder-14B-Preview-GGUF/agentica-org_DeepCoder-14B-Preview-Q4_K_M.gguf
 - &qwen2
   url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ## Start QWEN2
   name: "qwen2-7b-instruct"
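
The gallery patches in this series all follow the same schema: an entry inherits defaults from a family anchor (for example *qwen25 or *llama33), then pins a concrete GGUF artifact through overrides.parameters.model plus a files list carrying the download uri and its sha256. The sketch below shows how such an entry is typically consumed at runtime. It assumes a LocalAI instance listening on localhost:8080 and uses the /models/apply gallery endpoint as described in the LocalAI documentation; treat the exact gallery id string and the response handling as illustrative rather than authoritative.

    // install_gallery_model.go - minimal sketch, not part of the patch series.
    package main

    import (
        "bytes"
        "fmt"
        "io"
        "net/http"
    )

    func main() {
        // The id names a gallery entry; "agentica-org_deepcoder-14b-preview"
        // is the entry added in PATCH 59/59. The "localai@" prefix selecting
        // the default gallery is an assumption borrowed from the docs.
        payload := []byte(`{"id": "localai@agentica-org_deepcoder-14b-preview"}`)

        resp, err := http.Post("http://localhost:8080/models/apply",
            "application/json", bytes.NewReader(payload))
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        // The server replies with a job handle; the download runs
        // asynchronously, and the fetched GGUF is checked against the sha256
        // recorded in the gallery entry before the model becomes usable.
        body, _ := io.ReadAll(resp.Body)
        fmt.Println(string(body))
    }

Once installed, the model is addressable by its gallery name through the usual OpenAI-compatible endpoints such as /v1/chat/completions.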
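PATCH 49/59 above rewrites the function-calling section of the Gemma chat template so that a model deciding to invoke a tool must answer with a bare JSON object, {"name": ..., "parameters": ...}, and nothing else. A client can therefore distinguish tool calls from plain content by attempting to decode the reply. The following is a hedged sketch of that dispatch logic; get_weather and its argument are invented for the example and are not defined anywhere in this series.

    // parse_function_call.go - minimal sketch, not part of the patch series.
    package main

    import (
        "encoding/json"
        "fmt"
    )

    // functionCall mirrors the reply shape requested by the template:
    // {"name": <function name>, "parameters": <map of argument name to value>}.
    type functionCall struct {
        Name       string         `json:"name"`
        Parameters map[string]any `json:"parameters"`
    }

    func main() {
        reply := `{"name": "get_weather", "parameters": {"location": "Rome"}}`

        var call functionCall
        if err := json.Unmarshal([]byte(reply), &call); err != nil || call.Name == "" {
            // Not a function call: per the template, the model answered
            // with plain content instead.
            fmt.Println("content:", reply)
            return
        }
        fmt.Printf("dispatch %q with args %v\n", call.Name, call.Parameters)
    }

The design choice in the template, forbidding any surrounding text when a function is called, is what makes this single json.Unmarshal check sufficient; no regex extraction of an embedded JSON fragment is needed.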