chore: drop embedded models (#4715)

Since the remote gallery was introduced this is now completely superseded by it. In order to keep the code clean and remove redudant parts let's simplify the usage. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-19 18:15:00 +00:00 · 2025-01-30 00:03:01 +01:00 · 2025-01-30 00:03:01 +01:00 · 72e52c4f6a
commit 72e52c4f6a
parent 1656e1a88e
39 changed files with 8 additions and 986 deletions
--- a/2
+++ b/2
@ -861,7 +861,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:
--- a/core/application/startup.go
+++ b/core/application/startup.go
@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}
 	}
-	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+	if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
 		log.Error().Err(err).Msg("error installing models")
 	}
--- a/core/cli/models.go
+++ b/core/cli/models.go
@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 			log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 		}
-		err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+		err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
 		if err != nil {
 			return err
 		}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@ -32,7 +32,6 @@ type RunCMD struct {
 	Galleries           string   `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 	AutoloadGalleries   bool     `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
 	RemoteLibrary       string   `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
 	PreloadModels       string   `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
 	Models              []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
 	PreloadModelsConfig string   `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@ -90,7 +89,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 		config.WithF16(r.F16),
 		config.WithStringGalleries(r.Galleries),
 		config.WithModelLibraryURL(r.RemoteLibrary),
 		config.WithCors(r.CORS),
 		config.WithCorsAllowOrigins(r.CORSAllowOrigins),
 		config.WithCsrf(r.CSRF),
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@ -44,8 +44,6 @@ type ApplicationConfig struct {
 	DisableGalleryEndpoint             bool
 	LoadToMemory                       []string
 	ModelLibraryURL string
 	Galleries []Gallery
 	BackendAssets     embed.FS
@ -126,12 +124,6 @@ func WithP2PToken(s string) AppOption {
 	}
 }
 func WithModelLibraryURL(url string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.ModelLibraryURL = url
 	}
 }
 func WithLibPath(path string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.LibPath = path
--- a/core/services/gallery.go
+++ b/core/services/gallery.go
@ -129,7 +129,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.BackendConfigLoader
 					if op.GalleryModelName != "" {
 						err = gallery.InstallModelFromGallery(op.Galleries, op.GalleryModelName, g.appConfig.ModelPath, op.Req, progressCallback, g.appConfig.EnforcePredownloadScans)
 					} else if op.ConfigURL != "" {
-						err = startup.InstallModels(op.Galleries, op.ConfigURL, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
+						err = startup.InstallModels(op.Galleries, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
 						if err != nil {
 							updateError(err)
 							continue
--- a/docs/content/docs/advanced/run-other-models.md
+++ b/docs/content/docs/advanced/run-other-models.md
@ -1,126 +0,0 @@
 +++
 disableToc = false
 title = "Run other Models"
 weight = 23
 icon = "rocket_launch"
 +++
 ## Running other models
 > _Do you have already a model file? Skip to [Run models manually]({{%relref "docs/getting-started/models" %}})_.
 To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/models" %}}) or configure LocalAI to pull the models from external sources, like Huggingface and configure the model.
 To do that, you can point LocalAI to an URL to a YAML configuration file - however - LocalAI does also have some popular model configuration embedded in the binary as well. Below you can find a list of the models configuration that LocalAI has pre-built, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
 There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
 {{% alert icon="💡" %}}
 To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI-examples/tree/main/configurations) and the configurations for the models below is available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
 {{% /alert %}}
 {{< tabs tabTotal="3" >}}
 {{% tab tabName="CPU-only" %}}
 > 💡Don't need GPU acceleration? use the CPU images which are lighter and do not have Nvidia dependencies
 | Model | Category | Docker command |
 | --- | --- | --- |
 | [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
 | 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
 | 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
 | 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
 | 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
 | [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
 | [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
 | [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
 | whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
 | rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
 | 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
 | 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
 | 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X)  | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
 | mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
 | [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
 | [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
 | 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
 | animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
 | transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
 | [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
 | [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
 | [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
 {{% /tab %}}
 {{% tab tabName="GPU (CUDA 11)" %}}
 > To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
 | Model | Category | Docker command |
 | --- | --- | --- |
 | [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
 | 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
 | 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
 | 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
 | 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
 | [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
 | [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
 | [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
 | whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
 | rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
 | 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
 | 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
 | 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
 | mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
 | [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
 | [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
 | 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
 | animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) |  ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
 | transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
 | [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
 | [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
 | [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
 {{% /tab %}}
 {{% tab tabName="GPU (CUDA 12)" %}}
 > To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
 | Model | Category | Docker command |
 | --- | --- | --- |
 | [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
 | 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
 | 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
 | 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
 | 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
 | [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
 | [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
 | [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
 | whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
 | rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
 | 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
 | 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
 | 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
 | mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
 | [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
 | [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
 | 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
 | animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
 | transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
 | [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
 | [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
 | [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
 {{% /tab %}}
 {{< /tabs >}}
 {{% alert icon="💡" %}}
 **Tip** You can actually specify multiple models to start an instance with the models loaded, for example to have both llava and phi-2 configured:
 ```bash
 docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
 ```
 {{% /alert %}}
--- a/docs/content/docs/getting-started/container-images.md
+++ b/docs/content/docs/getting-started/container-images.md
@ -143,7 +143,7 @@ The AIO Images are inheriting the same environment variables as the base images
 | Variable | Default | Description |
 | ---------------------| ------- | ----------- |
 | `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
-| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/advanced/run-other-models" %}})) |
+| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/getting-started/models" %}})) |
 ## Standard container images
--- a/embedded/embedded.go
+++ b/embedded/embedded.go
@ -1,72 +0,0 @@
 package embedded
 import (
 	"embed"
 	"fmt"
 	"slices"
 	"strings"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/pkg/assets"
 	"gopkg.in/yaml.v3"
 )
 var modelShorteners map[string]string
 //go:embed model_library.yaml
 var modelLibrary []byte
 //go:embed models/*
 var embeddedModels embed.FS
 func ModelShortURL(s string) string {
 	if _, ok := modelShorteners[s]; ok {
 		s = modelShorteners[s]
 	}
 	return s
 }
 func init() {
 	err := yaml.Unmarshal(modelLibrary, &modelShorteners)
 	if err != nil {
 		log.Error().Err(err).Msg("error while unmarshalling embedded modelLibrary")
 	}
 }
 func GetRemoteLibraryShorteners(url string, basePath string) (map[string]string, error) {
 	remoteLibrary := map[string]string{}
 	uri := downloader.URI(url)
 	err := uri.DownloadWithCallback(basePath, func(_ string, i []byte) error {
 		return yaml.Unmarshal(i, &remoteLibrary)
 	})
 	if err != nil {
 		return nil, fmt.Errorf("error downloading remote library: %s", err.Error())
 	}
 	return remoteLibrary, err
 }
 // ExistsInModelsLibrary checks if a model exists in the embedded models library
 func ExistsInModelsLibrary(s string) bool {
 	f := fmt.Sprintf("%s.yaml", s)
 	a := []string{}
 	for _, j := range assets.ListFiles(embeddedModels) {
 		a = append(a, strings.TrimPrefix(j, "models/"))
 	}
 	return slices.Contains(a, f)
 }
 // ResolveContent returns the content in the embedded model library
 func ResolveContent(s string) ([]byte, error) {
 	if ExistsInModelsLibrary(s) {
 		return embeddedModels.ReadFile(fmt.Sprintf("models/%s.yaml", s))
 	}
 	return nil, fmt.Errorf("cannot find model %s", s)
 }
--- a/embedded/model_library.yaml
+++ b/embedded/model_library.yaml
@ -1,9 +0,0 @@
 ### 
 ###
 ### This file contains the list of models that are available in the library
 ### The URLs are automatically expanded when local-ai is being called with the key as argument
 ###
 ### For models with an entire YAML file to be embededd, put the file inside the `models`
 ### directory, it will be automatically available with the file name as key (without the .yaml extension)
 phi-2:  "github://mudler/LocalAI-examples/configurations/phi-2.yaml@main"
--- a/embedded/models/all-minilm-l6-v2.yaml
+++ b/embedded/models/all-minilm-l6-v2.yaml
@ -1,13 +0,0 @@
 name: all-minilm-l6-v2
 backend: sentencetransformers
 embeddings: true
 parameters:
  model: all-MiniLM-L6-v2
 usage: |
    You can test this model with curl like this:
    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
      "input": "Your text string goes here",
      "model": "all-minilm-l6-v2"
    }'
--- a/embedded/models/animagine-xl.yaml
+++ b/embedded/models/animagine-xl.yaml
@ -1,17 +0,0 @@
 name: animagine-xl
 parameters:
  model: Linaqruf/animagine-xl
 backend: diffusers
 f16: true
 diffusers:
  scheduler_type: euler_a
 usage: |
        curl http://localhost:8080/v1/images/generations \
          -H "Content-Type: application/json" \
          -d '{
            "prompt": "<positive prompt>|<negative prompt>",
            "model": "animagine-xl",
            "step": 51,
            "size": "1024x1024"
          }'
--- a/embedded/models/bakllava.yaml
+++ b/embedded/models/bakllava.yaml
@ -1,40 +0,0 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 gpu_layers: 90
 mmap: true
 name: bakllava
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: bakllava-mmproj.gguf
 parameters:
  model: bakllava.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
  seed: -1
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 template:
  chat: |
    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
    ASSISTANT:
 download_files:
 - filename: bakllava.gguf
  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
 - filename: bakllava-mmproj.gguf
  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "bakllava",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/embedded/models/bark.yaml
+++ b/embedded/models/bark.yaml
@ -1,8 +0,0 @@
 usage: |
    bark works without any configuration, to test it, you can run the following curl command:
    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{         
     "backend": "bark",
     "input":"Hello, this is a test!"
    }' | aplay
 # TODO: This is a placeholder until we manage to pre-load HF/Transformers models
--- a/embedded/models/cerbero.yaml
+++ b/embedded/models/cerbero.yaml
@ -1,24 +0,0 @@
 backend: llama
 context_size: 8192
 f16: false
 gpu_layers: 90
 name: cerbero
 mmap: false
 parameters:
  model: huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q8_0.gguf
  top_k: 80
  temperature: 0.2
  top_p: 0.7
 template:
  completion: "{{.Input}}"
  chat: "Questa è una conversazione tra un umano ed un assistente AI.\n{{.Input}}\n[|Assistente|]  "
 roles:
  user: "[|Umano|] "
  system: "[|Umano|] "
  assistant: "[|Assistente|] "
 stopwords:
 - "[|Umano|]"
 trimsuffix: 
 - "\n"
--- a/embedded/models/codellama-7b-gguf.yaml
+++ b/embedded/models/codellama-7b-gguf.yaml
@ -1,20 +0,0 @@
 name: codellama-7b-gguf
 backend: transformers
 parameters:
  model: huggingface://TheBloke/CodeLlama-7B-GGUF/codellama-7b.Q4_K_M.gguf
  temperature: 0.5
  top_k: 40
  seed: -1
  top_p: 0.95
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 context_size: 4096
 f16: true
 gpu_layers: 90
 usage: |
      curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
          "model": "codellama-7b-gguf",
          "prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
      }'
--- a/embedded/models/codellama-7b.yaml
+++ b/embedded/models/codellama-7b.yaml
@ -1,14 +0,0 @@
 name: codellama-7b
 backend: transformers
 type: AutoModelForCausalLM
 parameters:
  model: codellama/CodeLlama-7b-hf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
 usage: |
      curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
          "model": "codellama-7b",
          "prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
      }'
--- a/embedded/models/coqui.yaml
+++ b/embedded/models/coqui.yaml
@ -1,9 +0,0 @@
 usage: |
    coqui works without any configuration, to test it, you can run the following curl command:
    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{         
        "backend": "coqui",
        "model": "tts_models/en/ljspeech/glow-tts",
        "input":"Hello, this is a test!"
        }'
 # TODO: This is a placeholder until we manage to pre-load HF/Transformers models
--- a/embedded/models/dolphin-2.5-mixtral-8x7b.yaml
+++ b/embedded/models/dolphin-2.5-mixtral-8x7b.yaml
@ -1,31 +0,0 @@
 name: dolphin-mixtral-8x7b
 mmap: true
 parameters:
  model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
  temperature: 0.5
  top_k: 40
  top_p: 0.95
  seed: -1
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}<|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 gpu_layers: 90
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "dolphin-mixtral-8x7b",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/hermes-2-pro-mistral.yaml
+++ b/embedded/models/hermes-2-pro-mistral.yaml
@ -1,59 +0,0 @@
 name: hermes-2-pro-mistral
 mmap: true
 parameters:
  model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
    {{- if .FunctionCall }}
    <tool_call>
    {{- else if eq .RoleName "tool" }}
    <tool_response>
    {{- end }}
    {{- if .Content}}
    {{.Content }}
    {{- end }}
    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  # https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    </tools>
    Use the following pydantic model json schema for each tool call you will make:
    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
    For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
    <tool_call>
    {'arguments': <args-dict>, 'name': <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
    <tool_call>
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 - "\n</tool_call>"
 - "\n\n\n"
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "hermes-2-pro-mistral",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/llama3-instruct.yaml
+++ b/embedded/models/llama3-instruct.yaml
@ -1,48 +0,0 @@
 name: llama3-8b-instruct
 mmap: true
 parameters:
  model: huggingface://second-state/Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf
 template:
  chat_message: |
    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content -}}
    {{ else if .FunctionCall -}}
    {{ toJson .FunctionCall -}}
    {{ end -}}
    <|eot_id|>
  function: |
    <|start_header_id|>system<|end_header_id|>
    You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    </tools>
    Use the following pydantic model json schema for each tool call you will make:
    {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
    Function call:
  chat: |
    <|begin_of_text|>{{.Input }}
    <|start_header_id|>assistant<|end_header_id|>
  completion: |
    {{.Input}}
 context_size: 8192
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 - "<|eot_id|>"
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "llama3-8b-instruct",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/llava-1.5.yaml
+++ b/embedded/models/llava-1.5.yaml
@ -1,33 +0,0 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 gpu_layers: 90
 mmap: true
 name: llava-1.5
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.5-7b-mmproj-Q8_0.gguf
 parameters:
  model: llava-v1.5-7b-Q4_K.gguf
 template:
  chat: |
    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
    ASSISTANT:
 download_files:
 - filename: llava-v1.5-7b-Q4_K.gguf
  uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-Q4_K.gguf
 - filename: llava-v1.5-7b-mmproj-Q8_0.gguf
  uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-Q8_0.gguf
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "llava-1.5",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/embedded/models/llava-1.6-mistral.yaml
+++ b/embedded/models/llava-1.6-mistral.yaml
@ -1,33 +0,0 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 gpu_layers: 90
 mmap: true
 name: llava-1.6-mistral
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
  model: llava-v1.6-mistral-7b.gguf
 template:
  chat: |
    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
    ASSISTANT:
 download_files:
 - filename: llava-v1.6-mistral-7b.gguf
  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q6_K.gguf
 - filename: llava-v1.6-7b-mmproj-f16.gguf
  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "llava-1.6-mistral",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/embedded/models/llava-1.6-vicuna.yaml
+++ b/embedded/models/llava-1.6-vicuna.yaml
@ -1,37 +0,0 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 gpu_layers: 90
 mmap: true
 name: llava-1.6-vicuna
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: mmproj-vicuna7b-f16.gguf
 parameters:
  model: vicuna-7b-q5_k.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
  seed: -1
 template:
  chat: |
    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
    ASSISTANT:
 download_files:
 - filename: vicuna-7b-q5_k.gguf
  uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
 - filename: mmproj-vicuna7b-f16.gguf
  uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "llava-1.6-vicuna",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/embedded/models/llava.yaml
+++ b/embedded/models/llava.yaml
@ -1,40 +0,0 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 gpu_layers: 90
 mmap: true
 name: llava
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: bakllava-mmproj.gguf
 parameters:
  model: bakllava.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
  seed: -1
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 template:
  chat: |
    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    {{.Input}}
    ASSISTANT:
 download_files:
 - filename: bakllava.gguf
  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
 - filename: bakllava-mmproj.gguf
  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "llava",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/embedded/models/mamba-bagel.yaml
+++ b/embedded/models/mamba-bagel.yaml
@ -1,21 +0,0 @@
 name: bagel
 backend: mamba
 parameters:
  model: "jondurbin/bagel-dpo-2.8b-v0.2"
 systemPrompt: "You are a helpful, unbiased, uncensored assistant."
 template:
  chat_message: |
    {{if eq .RoleName "assistant"}}{{.Content}}{{else}}
    [INST]
    {{if .SystemPrompt}}{{.SystemPrompt}}{{else if eq .RoleName "system"}}<<SYS>>{{.Content}}<</SYS>>
    {{else if .Content}}{{.Content}}{{end}}
    [/INST]
    {{end}}
  completion: |
    {{.Input}}
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
      "model": "bagel",
      "messages": [{"role": "user", "content": "how are you doing"}],
    }'
--- a/embedded/models/mamba-chat.yaml
+++ b/embedded/models/mamba-chat.yaml
@ -1,28 +0,0 @@
 name: mamba-chat
 backend: mamba
 parameters:
  model: "havenhq/mamba-chat"
 trimsuffix: 
 - <|endoftext|>
 # https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json
 #   "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
 template:
  chat_message: |
    {{if eq .RoleName "assistant"}}<|assistant|>{{else if eq .RoleName "system"}}<|system|>{{else if eq .RoleName "user"}}<|user|>{{end}}
    {{if .Content}}{{.Content}}{{end}}
    </s>
  chat: |
    {{.Input}}
    <|assistant|>
  completion: |
    {{.Input}}
 usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
      "model": "mamba-chat",
      "messages": [{"role": "user", "content": "how are you doing"}],
      "temperature": 0.7
    }'
--- a/embedded/models/mistral-openorca.yaml
+++ b/embedded/models/mistral-openorca.yaml
@ -1,32 +0,0 @@
 name: mistral-openorca
 mmap: true
 parameters:
  model: huggingface://TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q6_K.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
  seed: -1
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}
    <|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "mistral-openorca",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/mixtral-instruct.yaml
+++ b/embedded/models/mixtral-instruct.yaml
@ -1,25 +0,0 @@
 name: mixtral-instruct
 mmap: true
 parameters:
  model: huggingface://TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
  temperature: 0.2
  top_k: 40
  seed: -1
  top_p: 0.95
 mirostat: 2
 mirostat_eta: 1.0
 mirostat_tau: 1.0
 template:
  chat: &chat |
    [INST] {{.Input}} [/INST]    
  completion: *chat
 context_size: 4096
 f16: true
 gpu_layers: 90
 usage: |
      curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
          "model": "mixtral-instruct",
          "prompt": "How are you doing?"
      }'
--- a/embedded/models/phi-2-chat.yaml
+++ b/embedded/models/phi-2-chat.yaml
@ -1,25 +0,0 @@
 name: phi-2-chat
 mmap: true
 parameters:
  model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}
    <|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "phi-2-chat",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/phi-2-orange.yaml
+++ b/embedded/models/phi-2-orange.yaml
@ -1,30 +0,0 @@
 name: phi-2-orange
 mmap: true
 parameters:
  model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}
    <|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 - <dummy32000>
 description: |
  This model is a chatbot that can be used for general conversation.
  [Model card](https://huggingface.co/TheBloke/phi-2-orange-GGUF)
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "phi-2-orange",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/rhasspy-voice-en-us-amy.yaml
+++ b/embedded/models/rhasspy-voice-en-us-amy.yaml
@ -1,13 +0,0 @@
 name: voice-en-us-amy-low
 download_files:
  - filename: voice-en-us-amy-low.tar.gz
    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
 usage: |
    To test if this model works as expected, you can use the following curl command:
    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
      "model":"en-us-amy-low.onnx",
      "input": "Hi, this is a test."
    }'
--- a/embedded/models/tinyllama-chat.yaml
+++ b/embedded/models/tinyllama-chat.yaml
@ -1,29 +0,0 @@
 name: tinyllama-chat
 mmap: true
 parameters:
  model: huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q8_0.gguf
  temperature: 0.2
  top_k: 40
  seed: -1
  top_p: 0.95
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}<|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 context_size: 4096
 f16: true
 stopwords:
 - <|im_end|>
 gpu_layers: 90
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
          "model": "tinyllama-chat",
          "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
      }'
--- a/embedded/models/transformers-tinyllama.yaml
+++ b/embedded/models/transformers-tinyllama.yaml
@ -1,31 +0,0 @@
 name: tinyllama-chat
 backend: transformers
 type: AutoModelForCausalLM
 parameters:
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
  temperature: 0.2
  top_k: 40
  top_p: 0.95
  max_tokens: 4096
 template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}<|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
 stopwords:
 - <|im_end|>
 usage: |
      curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "tinyllama-chat",
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7
      }'
--- a/embedded/models/vall-e-x.yaml
+++ b/embedded/models/vall-e-x.yaml
@ -1,8 +0,0 @@
 usage: |
    Vall-e-x works without any configuration, to test it, you can run the following curl command:
    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{         
     "backend": "vall-e-x",
     "input":"Hello, this is a test!"
    }' | aplay
 # TODO: This is a placeholder until we manage to pre-load HF/Transformers models
--- a/embedded/models/whisper-base.yaml
+++ b/embedded/models/whisper-base.yaml
@ -1,18 +0,0 @@
 name: whisper
 backend: whisper
 parameters:
  model: ggml-whisper-base.bin
 usage: |
    ## example audio file
    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
    ## Send the example audio file to the transcriptions endpoint
    curl http://localhost:8080/v1/audio/transcriptions \
         -H "Content-Type: multipart/form-data" \
         -F file="@$PWD/gb1.ogg" -F model="whisper"
 download_files:
 - filename: "ggml-whisper-base.bin"
  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/pkg/startup/model_preload.go
+++ b/pkg/startup/model_preload.go
@ -9,7 +9,6 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/embedded"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
@ -18,42 +17,17 @@ import (
 // InstallModels will preload models from the given list of URLs and galleries
 // It will download the model if it is not already present in the model path
 // It will also try to resolve if the model is an embedded model YAML configuration
-func InstallModels(galleries []config.Gallery, modelLibraryURL string, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
+func InstallModels(galleries []config.Gallery, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
 	// create an error that groups all errors
 	var err error
 	lib, _ := embedded.GetRemoteLibraryShorteners(modelLibraryURL, modelPath)
 	for _, url := range models {
 		// As a best effort, try to resolve the model from the remote library
 		// if it's not resolved we try with the other method below
 		if modelLibraryURL != "" {
 			if lib[url] != "" {
 				log.Debug().Msgf("[startup] model configuration is defined remotely: %s (%s)", url, lib[url])
 				url = lib[url]
 			}
 		}
 		url = embedded.ModelShortURL(url)
 		uri := downloader.URI(url)
 		switch {
 		case embedded.ExistsInModelsLibrary(url):
 			modelYAML, e := embedded.ResolveContent(url)
 			// If we resolve something, just save it to disk and continue
 			if e != nil {
 				log.Error().Err(e).Msg("error resolving model content")
 				err = errors.Join(err, e)
 				continue
 			}
 			log.Debug().Msgf("[startup] resolved embedded model: %s", url)
 			md5Name := utils.MD5(url)
 			modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
 			if e := os.WriteFile(modelDefinitionFilePath, modelYAML, 0600); err != nil {
 				log.Error().Err(e).Str("filepath", modelDefinitionFilePath).Msg("error writing model definition")
 				err = errors.Join(err, e)
 			}
 		case uri.LooksLikeOCI():
 			log.Debug().Msgf("[startup] resolved OCI model to download: %s", url)
--- a/pkg/startup/model_preload_test.go
+++ b/pkg/startup/model_preload_test.go
@ -7,7 +7,6 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	. "github.com/mudler/LocalAI/pkg/startup"
 	"github.com/mudler/LocalAI/pkg/utils"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@ -16,29 +15,13 @@ import (
 var _ = Describe("Preload test", func() {
 	Context("Preloading from strings", func() {
 		It("loads from remote url", func() {
 			tmpdir, err := os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
 			libraryURL := "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml"
 			fileName := fmt.Sprintf("%s.yaml", "phi-2")
 			InstallModels([]config.Gallery{}, libraryURL, tmpdir, true, nil, "phi-2")
 			resultFile := filepath.Join(tmpdir, fileName)
 			content, err := os.ReadFile(resultFile)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(string(content)).To(ContainSubstring("name: phi-2"))
 		})
 		It("loads from embedded full-urls", func() {
 			tmpdir, err := os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
 			url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml"
 			fileName := fmt.Sprintf("%s.yaml", "phi-2")
-			InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
+			InstallModels([]config.Gallery{}, tmpdir, true, nil, url)
 			resultFile := filepath.Join(tmpdir, fileName)
@ -47,45 +30,13 @@ var _ = Describe("Preload test", func() {
 			Expect(string(content)).To(ContainSubstring("name: phi-2"))
 		})
 		It("loads from embedded short-urls", func() {
 			tmpdir, err := os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
 			url := "phi-2"
 			InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
 			entry, err := os.ReadDir(tmpdir)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(entry).To(HaveLen(1))
 			resultFile := entry[0].Name()
 			content, err := os.ReadFile(filepath.Join(tmpdir, resultFile))
 			Expect(err).ToNot(HaveOccurred())
 			Expect(string(content)).To(ContainSubstring("name: phi-2"))
 		})
 		It("loads from embedded models", func() {
 			tmpdir, err := os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
 			url := "mistral-openorca"
 			fileName := fmt.Sprintf("%s.yaml", utils.MD5(url))
 			InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
 			resultFile := filepath.Join(tmpdir, fileName)
 			content, err := os.ReadFile(resultFile)
 			Expect(err).ToNot(HaveOccurred())
 			Expect(string(content)).To(ContainSubstring("name: mistral-openorca"))
 		})
 		It("downloads from urls", func() {
 			tmpdir, err := os.MkdirTemp("", "")
 			Expect(err).ToNot(HaveOccurred())
 			url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
 			fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K")
-			err = InstallModels([]config.Gallery{}, "", tmpdir, false, nil, url)
+			err = InstallModels([]config.Gallery{}, tmpdir, false, nil, url)
 			Expect(err).ToNot(HaveOccurred())
 			resultFile := filepath.Join(tmpdir, fileName)
--- a/embedded/webui_static.yaml
+++ b/embedded/webui_static.yaml