docs: update to include installer and update advanced YAML options (#2631)

* docs: update quickstart and advanced sections Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs: improvements Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * examples(kubernete): add nvidia example Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-22 11:35:00 +00:00 · 2024-06-22 12:00:38 +02:00 · 2024-06-22 12:00:38 +02:00 · 9a7ad75bff
commit 9a7ad75bff
parent 9fb3e4040b
11 changed files with 667 additions and 447 deletions
--- a/docs/content/docs/advanced/advanced-usage.md
+++ b/docs/content/docs/advanced/advanced-usage.md
@ -106,118 +106,202 @@ local-ai github://mudler/LocalAI/examples/configurations/phi-2.yaml@master
 ### Full config model file reference

 ```yaml
-# Model name.
-# The model name is used to identify the model in the API calls.
-name: gpt-3.5-turbo
+# Main configuration of the model, template, and system features.
+name: "" # Model name, used to identify the model in API calls.

-# Default model parameters.
-# These options can also be specified in the API calls
-parameters:
-  # Relative to the models path
-  model: luna-ai-llama2-uncensored.ggmlv3.q5_K_M.bin
-  # temperature
-  temperature: 0.3
-  # all the OpenAI request options here..
-  top_k: 
-  top_p: 
-  max_tokens:
-  ignore_eos: true
-  n_keep: 10
-  seed: 
-  mode: 
-  step:
-  negative_prompt:
-  typical_p:
-  tfz:
-  frequency_penalty:
+# Precision settings for the model, reducing precision can enhance performance on some hardware.
+f16: null # Whether to use 16-bit floating-point precision.

-  rope_freq_base:
-  rope_freq_scale:
-  negative_prompt_scale:
+# Concurrency settings for the application.
+threads: null # Number of threads to use for processing.

-mirostat_eta:
-mirostat_tau:
-mirostat: 
-# Default context size
-context_size: 512
-# Default number of threads
-threads: 10
-# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
-backend: llama-stable # available: llama, stablelm, gpt2, gptj rwkv
-# stopwords (if supported by the backend)
-stopwords:
- "HUMAN:"
- "### Response:"
-# string to trim space to
-trimspace:
- string
-# Strings to cut from the response
-cutstrings:
- "string"
+# Roles define how different entities interact in a conversational model.
+# It can be used to map roles to specific parts of the conversation.
+roles: {} # Roles for entities like user, system, assistant, etc.

-# Directory used to store additional assets
-asset_dir: ""
+# Backend to use for computation (like llama-cpp, diffusers, whisper).
+backend: "" # Backend for AI computations.

-# define chat roles
-roles:
-  user: "HUMAN:"
-  system: "GPT:"
-  assistant: "ASSISTANT:"
+# Templates for various types of model interactions.
 template:
-  # template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
-  completion: completion
-  chat: chat
-  edit: edit_template
-  function: function_template
+    chat: "" # Template for chat interactions. Uses golang templates with Sprig functions.
+    chat_message: "" # Template for individual chat messages.  Uses golang templates with Sprig functions.
+    completion: "" # Template for generating text completions. Uses golang templates with Sprig functions.
+    edit: "" # Template for edit operations. Uses golang templates with Sprig functions.
+    function: "" # Template for function calls. Uses golang templates with Sprig functions.
+    use_tokenizer_template: false # Whether to use a specific tokenizer template. (vLLM)
+    join_chat_messages_by_character: null # Character to join chat messages, if applicable. Defaults to newline.

+# Function-related settings to control behavior of specific function calls.
 function:
-   disable_no_action: true
-   no_action_function_name: "reply"
-   no_action_description_name: "Reply to the AI assistant"
+    disable_no_action: false # Whether to disable the no-action behavior.
+    grammar:
+        parallel_calls: false # Allow the model to return parallel tool calls.
+        disable_parallel_new_lines: false # Disable parallel processing for new lines in grammar checks.
+        mixed_mode: false # Allow mixed-mode grammar enforcing
+        no_mixed_free_string: false # Disallow free strings in mixed mode.
+        disable: false # Completely disable grammar enforcing functionality.
+        prefix: "" # Prefix to add before grammar rules.
+        expect_strings_after_json: false # Expect string after JSON data.
+    no_action_function_name: "" # Function name to call when no action is determined.
+    no_action_description_name: "" # Description name for no-action functions.
+    response_regex: [] # Regular expressions to match the response from the model.
+    json_regex_match: [] # Regular expressions to match JSON data when in tool mode
+    replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
+    replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
+    capture_llm_results: [] # Capture language model results as text result, among JSON, in function calls. For instance, if a model returns a block for "thinking" and a block for "response", this will allow you to capture the thinking block.
+    return_name_in_function_response: false # Some models might prefer to use "name" rather than "function" when returning JSON data. This allows "name" to be used as a key in the JSON response.

-system_prompt:
-rms_norm_eps:
-# Set it to 8 for llama2 70b
-ngqa: 1
-## LLAMA specific options
-# Enable F16 if backend supports it
-f16: true
-# Enable debugging
-debug: true
-# Enable embeddings
-embeddings: true
-# Mirostat configuration (llama.cpp only)
-mirostat_eta: 0.8
-mirostat_tau: 0.9
-mirostat: 1
-# GPU Layers (only used when built with cublas)
-gpu_layers: 22
-# Enable memory lock
-mmlock: true
-# GPU setting to split the tensor in multiple parts and define a main GPU
-# see llama.cpp for usage
+# Feature gating flags to enable experimental or optional features.
+feature_flags: {}
+
+# System prompt to use by default.
+system_prompt: ""
+
+# Configuration for splitting tensors across GPUs.
 tensor_split: ""
-main_gpu: ""
-# Define a prompt cache path (relative to the models)
-prompt_cache_path: "prompt-cache"
-# Cache all the prompts
-prompt_cache_all: true
-# Read only
-prompt_cache_ro: false
-# Enable mmap
-mmap: true
-# Enable low vram mode (GPU only)
-low_vram: true
-# Set NUMA mode (CPU only)
-numa: true
-# Lora settings
-lora_adapter: "/path/to/lora/adapter"
-lora_base: "/path/to/lora/base"
-# Disable mulmatq (CUDA)
-no_mulmatq: true

-# Diffusers/transformers
-cuda: true
+# Identifier for the main GPU used in multi-GPU setups.
+main_gpu: ""
+
+# Small value added to the denominator in RMS normalization to prevent division by zero.
+rms_norm_eps: 0
+
+# Grouped-query attention factor (e.g. set it to 8 for llama2 70b).
+ngqa: 0
+
+# Path where prompt cache is stored.
+prompt_cache_path: ""
+
+# Whether to cache all prompts.
+prompt_cache_all: false
+
+# Whether the prompt cache is read-only.
+prompt_cache_ro: false
+
+# Mirostat sampling settings.
+mirostat_eta: null
+mirostat_tau: null
+mirostat: null
+
+# GPU-specific layers configuration.
+gpu_layers: null
+
+# Memory mapping for efficient I/O operations.
+mmap: null
+
+# Memory locking to ensure data remains in RAM.
+mmlock: null
+
+# Mode to use minimal VRAM for GPU operations.
+low_vram: null
+
+# Words or phrases that halt processing.
+stopwords: []
+
+# Strings to cut from responses to maintain context or relevance.
+cutstrings: []
+
+# Strings to trim from responses for cleaner outputs.
+trimspace: []
+trimsuffix: []
+
+# Default context size for the model's understanding of the conversation or text.
+context_size: null
+
+# Non-uniform memory access settings, useful for systems with multiple CPUs.
+numa: false
+
+# Configuration for LoRA
+lora_adapter: ""
+lora_base: ""
+lora_scale: 0
+
+# Disable matrix multiplication queuing in GPU operations.
+no_mulmatq: false
+
+# Model for generating draft responses.
+draft_model: ""
+n_draft: 0
+
+# Quantization settings for the model, impacting memory and processing speed.
+quantization: ""
+
+# Fraction (between 0 and 1) of GPU memory to allocate for the model. (vLLM)
+gpu_memory_utilization: 0
+
+# Whether to trust and execute remote code.
+trust_remote_code: false
+
+# Force eager-mode execution (PyTorch), disabling CUDA graphs. (vLLM)
+enforce_eager: false
+
+# Space allocated for swapping data in and out of memory. (vLLM)
+swap_space: 0
+
+# Maximum model context length, in tokens. (vLLM)
+max_model_len: 0
+
+# Size of the tensor parallelism in distributed computing environments. (vLLM)
+tensor_parallel_size: 0
+
+# vision model to use for multimodal
+mmproj: ""
+
+# Disables offloading of key/value pairs in transformer models to save memory.
+no_kv_offloading: false
+
+# RoPE (rotary position embedding) scaling type.
+rope_scaling: ""
+
+# Type of configuration, often related to the type of task or model architecture.
+type: ""
+
+# YARN settings
+yarn_ext_factor: 0
+yarn_attn_factor: 0
+yarn_beta_fast: 0
+yarn_beta_slow: 0
+
+# AutoGPTQ settings, for GPTQ-quantized models.
+autogptq:
+    model_base_name: "" # Base name of the model.
+    device: "" # Device to run the model on.
+    triton: false # Whether to use Triton kernels.
+    use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
+
+# configuration for diffusers model
+diffusers:
+    cuda: false # Whether to use CUDA
+    pipeline_type: "" # Type of pipeline to use.
+    scheduler_type: "" # Type of scheduler for controlling operations.
+    enable_parameters: "" # Parameters to enable in the diffuser.
+    cfg_scale: 0 # Scale for CFG in the diffuser setup.
+    img2img: false # Whether image-to-image transformation is supported.
+    clip_skip: 0 # Number of steps to skip in CLIP operations.
+    clip_model: "" # Model to use for CLIP operations.
+    clip_subfolder: "" # Subfolder for storing CLIP-related data.
+    control_net: "" # Control net to use
+
+# Step count, usually for image processing models
+step: 0
+
+# Configuration for gRPC communication.
+grpc:
+    attempts: 0 # Number of retry attempts for gRPC calls.
+    attempts_sleep_time: 0 # Sleep time between retries.
+
+# Text-to-Speech (TTS) configuration.
+tts:
+    voice: "" # Voice setting for TTS.
+    vall-e:
+        audio_path: "" # Path to audio files for Vall-E.
+
+# Whether to use CUDA for GPU-based operations.
+cuda: false
+
+# List of files to download as part of the setup or operations.
+download_files: []
 ```

 ### Prompt templates 
--- a/docs/content/docs/advanced/installer.md
+++ b/docs/content/docs/advanced/installer.md
@ -0,0 +1,33 @@
+
+++
+disableToc = false
+title = "Installer options"
+weight = 24
+++
+
+An installation script is available for quick and hassle-free installations, streamlining the setup process for new users.
+
+It can be used with the following command:
+```bash
+curl https://localai.io/install.sh | sh
+```
+
+Installation can be configured with environment variables, for example:
+
+```bash
+curl https://localai.io/install.sh | VAR=value sh
+```
+
+List of the Environment Variables:
+| Environment Variable | Description                                                  |
+|----------------------|--------------------------------------------------------------|
+| **DOCKER_INSTALL**       | Set to "true" to enable the installation of Docker images.    |
+| **USE_AIO**              | Set to "true" to use the all-in-one LocalAI Docker image.    |
+| **API_KEY**              | Specify an API key for accessing LocalAI, if required.       |
+| **CORE_IMAGES**          | Set to "true" to download core LocalAI images.                |
+| **PORT**                 | Specifies the port on which LocalAI will run (default is 8080). |
+| **THREADS**              | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
+| **VERSION**              | Specifies the version of LocalAI to install. Defaults to the latest available version. |
+| **MODELS_PATH**          | Directory path where LocalAI models are stored (default is /usr/share/local-ai/models). |
+
+We are looking into improving the installer, and as this is a first iteration any feedback is welcome! Open up an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!
--- a/docs/content/docs/advanced/run-other-models.md
+++ b/docs/content/docs/advanced/run-other-models.md
@ -0,0 +1,126 @@
+++
+disableToc = false
+title = "Run other Models"
+weight = 23
+icon = "rocket_launch"
+
+++
+
+## Running other models
+
+> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}})_.
+
+To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/manual" %}}) or configure LocalAI to pull the models from external sources, like Hugging Face, and configure the model.
+
+To do that, you can point LocalAI to the URL of a YAML configuration file; however, LocalAI also has some popular model configurations embedded in the binary. Below you can find a list of the model configurations that LocalAI has pre-built; see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) for how to configure models from URLs.
+
+There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
+
+{{% alert icon="💡" %}}
+
+To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations); the configurations for the models below are available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
+{{% /alert %}}
+
+{{< tabs tabTotal="3" >}}
+{{% tab tabName="CPU-only" %}}
+
+> 💡 Don't need GPU acceleration? Use the CPU images, which are lighter and do not have Nvidia dependencies.
+
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
+| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
+| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
+| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
+| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
+| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
+| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
+| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
+| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
+| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
+| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
+| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X)  | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
+| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
+| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
+| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
+| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
+| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
+| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
+| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
+| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
+| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
+{{% /tab %}}
+
+{{% tab tabName="GPU (CUDA 11)" %}}
+
+
+> To find out which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`. See also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
+
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
+| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
+| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
+| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
+| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
+| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
+| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
+| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
+| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
+| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
+| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
+| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
+| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
+| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
+| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
+| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
+| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) |  ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
+| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
+| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
+| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
+| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
+{{% /tab %}}
+
+
+{{% tab tabName="GPU (CUDA 12)" %}}
+
+> To find out which version of CUDA you have available, check with `nvidia-smi` or `nvcc --version`. See also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
+
+| Model | Category | Docker command |
+| --- | --- | --- |
+| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
+| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
+| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
+| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
+| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
+| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
+| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
+| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
+| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
+| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
+| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
+| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
+| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
+| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
+| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
+| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
+| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
+| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
+| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
+| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
+| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}})  | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
+| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
+{{% /tab %}}
+
+{{< /tabs >}}
+
+{{% alert icon="💡" %}}
+**Tip** You can actually specify multiple models to start an instance with the models loaded, for example to have both llava and phi-2 configured:
+
+```bash
+docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
+```
+
+{{% /alert %}}