mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-22 11:35:00 +00:00
docs: update to include installer and update advanced YAML options (#2631)
* docs: update quickstart and advanced sections Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs: improvements Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * examples(kubernete): add nvidia example Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
9fb3e4040b
commit
9a7ad75bff
11 changed files with 667 additions and 447 deletions
|
@ -106,118 +106,202 @@ local-ai github://mudler/LocalAI/examples/configurations/phi-2.yaml@master
|
|||
### Full config model file reference
|
||||
|
||||
```yaml
|
||||
# Model name.
|
||||
# The model name is used to identify the model in the API calls.
|
||||
name: gpt-3.5-turbo
|
||||
# Main configuration of the model, template, and system features.
|
||||
name: "" # Model name, used to identify the model in API calls.
|
||||
|
||||
# Default model parameters.
|
||||
# These options can also be specified in the API calls
|
||||
parameters:
|
||||
# Relative to the models path
|
||||
model: luna-ai-llama2-uncensored.ggmlv3.q5_K_M.bin
|
||||
# temperature
|
||||
temperature: 0.3
|
||||
# all the OpenAI request options here..
|
||||
top_k:
|
||||
top_p:
|
||||
max_tokens:
|
||||
ignore_eos: true
|
||||
n_keep: 10
|
||||
seed:
|
||||
mode:
|
||||
step:
|
||||
negative_prompt:
|
||||
typical_p:
|
||||
tfz:
|
||||
frequency_penalty:
|
||||
# Precision settings for the model, reducing precision can enhance performance on some hardware.
|
||||
f16: null # Whether to use 16-bit floating-point precision.
|
||||
|
||||
rope_freq_base:
|
||||
rope_freq_scale:
|
||||
negative_prompt_scale:
|
||||
# Concurrency settings for the application.
|
||||
threads: null # Number of threads to use for processing.
|
||||
|
||||
mirostat_eta:
|
||||
mirostat_tau:
|
||||
mirostat:
|
||||
# Default context size
|
||||
context_size: 512
|
||||
# Default number of threads
|
||||
threads: 10
|
||||
# Define a backend (optional). By default it will try to guess the backend the first time the model is interacted with.
|
||||
backend: llama-stable # available: llama, stablelm, gpt2, gptj rwkv
|
||||
# stopwords (if supported by the backend)
|
||||
stopwords:
|
||||
- "HUMAN:"
|
||||
- "### Response:"
|
||||
# string to trim space to
|
||||
trimspace:
|
||||
- string
|
||||
# Strings to cut from the response
|
||||
cutstrings:
|
||||
- "string"
|
||||
# Roles define how different entities interact in a conversational model.
|
||||
# It can be used to map roles to specific parts of the conversation.
|
||||
roles: {} # Roles for entities like user, system, assistant, etc.
|
||||
|
||||
# Directory used to store additional assets
|
||||
asset_dir: ""
|
||||
# Backend to use for computation (like llama-cpp, diffusers, whisper).
|
||||
backend: "" # Backend for AI computations.
|
||||
|
||||
# define chat roles
|
||||
roles:
|
||||
user: "HUMAN:"
|
||||
system: "GPT:"
|
||||
assistant: "ASSISTANT:"
|
||||
# Templates for various types of model interactions.
|
||||
template:
|
||||
# template file ".tmpl" with the prompt template to use by default on the endpoint call. Note there is no extension in the files
|
||||
completion: completion
|
||||
chat: chat
|
||||
edit: edit_template
|
||||
function: function_template
|
||||
chat: "" # Template for chat interactions. Uses golang templates with Sprig functions.
|
||||
chat_message: "" # Template for individual chat messages. Uses golang templates with Sprig functions.
|
||||
completion: "" # Template for generating text completions. Uses golang templates with Sprig functions.
|
||||
edit: "" # Template for edit operations. Uses golang templates with Sprig functions.
|
||||
function: "" # Template for function calls. Uses golang templates with Sprig functions.
|
||||
use_tokenizer_template: false # Whether to use a specific tokenizer template. (vLLM)
|
||||
join_chat_messages_by_character: null # Character to join chat messages, if applicable. Defaults to newline.
|
||||
|
||||
# Function-related settings to control behavior of specific function calls.
|
||||
function:
|
||||
disable_no_action: true
|
||||
no_action_function_name: "reply"
|
||||
no_action_description_name: "Reply to the AI assistant"
|
||||
disable_no_action: false # Whether to disable the no-action behavior.
|
||||
grammar:
|
||||
parallel_calls: false # Allow to return parallel tools
|
||||
disable_parallel_new_lines: false # Disable parallel processing for new lines in grammar checks.
|
||||
mixed_mode: false # Allow mixed-mode grammar enforcing
|
||||
no_mixed_free_string: false # Disallow free strings in mixed mode.
|
||||
disable: false # Completely disable grammar enforcing functionality.
|
||||
prefix: "" # Prefix to add before grammar rules.
|
||||
expect_strings_after_json: false # Expect string after JSON data.
|
||||
no_action_function_name: "" # Function name to call when no action is determined.
|
||||
no_action_description_name: "" # Description name for no-action functions.
|
||||
response_regex: [] # Regular expressions to match the response from the model.
|
||||
json_regex_match: [] # Regular expressions to match JSON data when in tool mode
|
||||
replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
|
||||
replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.
|
||||
capture_llm_results: [] # Capture language model results as text result, among JSON, in function calls. For instance, if a model returns a block for "thinking" and a block for "response", this will allow you to capture the thinking block.
|
||||
return_name_in_function_response: false # Some models might prefer to use "name" rather than "function" when returning JSON data. This allows "name" to be used as a key in the JSON response.
|
||||
|
||||
system_prompt:
|
||||
rms_norm_eps:
|
||||
# Set it to 8 for llama2 70b
|
||||
ngqa: 1
|
||||
## LLAMA specific options
|
||||
# Enable F16 if backend supports it
|
||||
f16: true
|
||||
# Enable debugging
|
||||
debug: true
|
||||
# Enable embeddings
|
||||
embeddings: true
|
||||
# Mirostat configuration (llama.cpp only)
|
||||
mirostat_eta: 0.8
|
||||
mirostat_tau: 0.9
|
||||
mirostat: 1
|
||||
# GPU Layers (only used when built with cublas)
|
||||
gpu_layers: 22
|
||||
# Enable memory lock
|
||||
mmlock: true
|
||||
# GPU setting to split the tensor in multiple parts and define a main GPU
|
||||
# see llama.cpp for usage
|
||||
# Feature gating flags to enable experimental or optional features.
|
||||
feature_flags: {}
|
||||
|
||||
# System prompt to use by default.
|
||||
system_prompt: ""
|
||||
|
||||
# Configuration for splitting tensors across GPUs.
|
||||
tensor_split: ""
|
||||
main_gpu: ""
|
||||
# Define a prompt cache path (relative to the models)
|
||||
prompt_cache_path: "prompt-cache"
|
||||
# Cache all the prompts
|
||||
prompt_cache_all: true
|
||||
# Read only
|
||||
prompt_cache_ro: false
|
||||
# Enable mmap
|
||||
mmap: true
|
||||
# Enable low vram mode (GPU only)
|
||||
low_vram: true
|
||||
# Set NUMA mode (CPU only)
|
||||
numa: true
|
||||
# Lora settings
|
||||
lora_adapter: "/path/to/lora/adapter"
|
||||
lora_base: "/path/to/lora/base"
|
||||
# Disable mulmatq (CUDA)
|
||||
no_mulmatq: true
|
||||
|
||||
# Diffusers/transformers
|
||||
cuda: true
|
||||
# Identifier for the main GPU used in multi-GPU setups.
|
||||
main_gpu: ""
|
||||
|
||||
# Small value added to the denominator in RMS normalization to prevent division by zero.
|
||||
rms_norm_eps: 0
|
||||
|
||||
# Grouped-query attention factor (llama.cpp). Set it to 8 for llama2 70b.
|
||||
ngqa: 0
|
||||
|
||||
# Path where prompt cache is stored.
|
||||
prompt_cache_path: ""
|
||||
|
||||
# Whether to cache all prompts.
|
||||
prompt_cache_all: false
|
||||
|
||||
# Whether the prompt cache is read-only.
|
||||
prompt_cache_ro: false
|
||||
|
||||
# Mirostat sampling settings.
|
||||
mirostat_eta: null
|
||||
mirostat_tau: null
|
||||
mirostat: null
|
||||
|
||||
# GPU-specific layers configuration.
|
||||
gpu_layers: null
|
||||
|
||||
# Memory mapping for efficient I/O operations.
|
||||
mmap: null
|
||||
|
||||
# Memory locking to ensure data remains in RAM.
|
||||
mmlock: null
|
||||
|
||||
# Mode to use minimal VRAM for GPU operations.
|
||||
low_vram: null
|
||||
|
||||
# Words or phrases that halt processing.
|
||||
stopwords: []
|
||||
|
||||
# Strings to cut from responses to maintain context or relevance.
|
||||
cutstrings: []
|
||||
|
||||
# Strings to trim from responses for cleaner outputs.
|
||||
trimspace: []
|
||||
trimsuffix: []
|
||||
|
||||
# Default context size for the model's understanding of the conversation or text.
|
||||
context_size: null
|
||||
|
||||
# Non-uniform memory access settings, useful for systems with multiple CPUs.
|
||||
numa: false
|
||||
|
||||
# Configuration for LoRA
|
||||
lora_adapter: ""
|
||||
lora_base: ""
|
||||
lora_scale: 0
|
||||
|
||||
# Disable the quantized matrix multiplication (mul_mat_q) kernels in GPU operations.
|
||||
no_mulmatq: false
|
||||
|
||||
# Model for generating draft responses.
|
||||
draft_model: ""
|
||||
n_draft: 0
|
||||
|
||||
# Quantization settings for the model, impacting memory and processing speed.
|
||||
quantization: ""
|
||||
|
||||
# Utilization percentage of GPU memory to allocate for the model. (vLLM)
|
||||
gpu_memory_utilization: 0
|
||||
|
||||
# Whether to trust and execute remote code.
|
||||
trust_remote_code: false
|
||||
|
||||
# Force eager-mode PyTorch execution instead of using CUDA graphs. (vLLM)
|
||||
enforce_eager: false
|
||||
|
||||
# Space allocated for swapping data in and out of memory. (vLLM)
|
||||
swap_space: 0
|
||||
|
||||
# Maximum model context length, in number of tokens. (vLLM)
|
||||
max_model_len: 0
|
||||
|
||||
# Size of the tensor parallelism in distributed computing environments. (vLLM)
|
||||
tensor_parallel_size: 0
|
||||
|
||||
# vision model to use for multimodal
|
||||
mmproj: ""
|
||||
|
||||
# Disables offloading of key/value pairs in transformer models to save memory.
|
||||
no_kv_offloading: false
|
||||
|
||||
# RoPE scaling method to use (for example "linear" or "yarn").
|
||||
rope_scaling: ""
|
||||
|
||||
# Type of configuration, often related to the type of task or model architecture.
|
||||
type: ""
|
||||
|
||||
# YARN settings
|
||||
yarn_ext_factor: 0
|
||||
yarn_attn_factor: 0
|
||||
yarn_beta_fast: 0
|
||||
yarn_beta_slow: 0
|
||||
|
||||
# AutoGPTQ settings, for configurations specific to GPTQ-quantized models.
|
||||
autogptq:
|
||||
model_base_name: "" # Base name of the model.
|
||||
device: "" # Device to run the model on.
|
||||
triton: false # Whether to use the Triton kernels.
|
||||
use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
|
||||
|
||||
# configuration for diffusers model
|
||||
diffusers:
|
||||
cuda: false # Whether to use CUDA
|
||||
pipeline_type: "" # Type of pipeline to use.
|
||||
scheduler_type: "" # Type of scheduler for controlling operations.
|
||||
enable_parameters: "" # Parameters to enable in the diffuser.
|
||||
cfg_scale: 0 # Scale for CFG in the diffuser setup.
|
||||
img2img: false # Whether image-to-image transformation is supported.
|
||||
clip_skip: 0 # Number of steps to skip in CLIP operations.
|
||||
clip_model: "" # Model to use for CLIP operations.
|
||||
clip_subfolder: "" # Subfolder for storing CLIP-related data.
|
||||
control_net: "" # Control net to use
|
||||
|
||||
# Step count, usually for image processing models
|
||||
step: 0
|
||||
|
||||
# Configuration for gRPC communication.
|
||||
grpc:
|
||||
attempts: 0 # Number of retry attempts for gRPC calls.
|
||||
attempts_sleep_time: 0 # Sleep time between retries.
|
||||
|
||||
# Text-to-Speech (TTS) configuration.
|
||||
tts:
|
||||
voice: "" # Voice setting for TTS.
|
||||
vall-e:
|
||||
audio_path: "" # Path to audio files for Vall-E.
|
||||
|
||||
# Whether to use CUDA for GPU-based operations.
|
||||
cuda: false
|
||||
|
||||
# List of files to download as part of the setup or operations.
|
||||
download_files: []
|
||||
```
|
||||
|
||||
### Prompt templates
|
||||
|
|
33
docs/content/docs/advanced/installer.md
Normal file
33
docs/content/docs/advanced/installer.md
Normal file
|
@ -0,0 +1,33 @@
|
|||
|
||||
+++
|
||||
disableToc = false
|
||||
title = "Installer options"
|
||||
weight = 24
|
||||
+++
|
||||
|
||||
An installation script is available for quick and hassle-free installations, streamlining the setup process for new users.
|
||||
|
||||
It can be used with the following command:
|
||||
```bash
|
||||
curl https://localai.io/install.sh | sh
|
||||
```
|
||||
|
||||
Installation can be configured with environment variables, for example:
|
||||
|
||||
```bash
|
||||
curl https://localai.io/install.sh | VAR=value sh
|
||||
```
|
||||
|
||||
List of the Environment Variables:
|
||||
| Environment Variable | Description |
|
||||
|----------------------|--------------------------------------------------------------|
|
||||
| **DOCKER_INSTALL** | Set to "true" to enable the installation of Docker images. |
|
||||
| **USE_AIO** | Set to "true" to use the all-in-one LocalAI Docker image. |
|
||||
| **API_KEY** | Specify an API key for accessing LocalAI, if required. |
|
||||
| **CORE_IMAGES** | Set to "true" to download core LocalAI images. |
|
||||
| **PORT** | Specifies the port on which LocalAI will run (default is 8080). |
|
||||
| **THREADS** | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
|
||||
| **VERSION** | Specifies the version of LocalAI to install. Defaults to the latest available version. |
|
||||
| **MODELS_PATH** | Directory path where LocalAI models are stored (default is /usr/share/local-ai/models). |
|
||||
|
||||
We are looking into improving the installer, and as this is a first iteration any feedback is welcome! Open up an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!
|
126
docs/content/docs/advanced/run-other-models.md
Normal file
126
docs/content/docs/advanced/run-other-models.md
Normal file
|
@ -0,0 +1,126 @@
|
|||
+++
|
||||
disableToc = false
|
||||
title = "Run other Models"
|
||||
weight = 23
|
||||
icon = "rocket_launch"
|
||||
|
||||
+++
|
||||
|
||||
## Running other models
|
||||
|
||||
> _Do you already have a model file? Skip to [Run models manually]({{%relref "docs/getting-started/manual" %}})_.
|
||||
|
||||
To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/manual" %}}) or configure LocalAI to pull the models from external sources, such as Hugging Face, and configure them.
|
||||
|
||||
To do that, you can point LocalAI to the URL of a YAML configuration file. LocalAI also has some popular model configurations embedded in the binary. Below you can find a list of the model configurations that LocalAI has pre-built; see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
|
||||
|
||||
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}), [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}), depending on the backend being used and the model architecture.
|
||||
|
||||
{{% alert icon="💡" %}}
|
||||
|
||||
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI/tree/master/examples/configurations). The configurations for the models below are available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
|
||||
{{% /alert %}}
|
||||
|
||||
{{< tabs tabTotal="3" >}}
|
||||
{{% tab tabName="CPU-only" %}}
|
||||
|
||||
> 💡 Don't need GPU acceleration? Use the CPU images, which are lighter and do not have Nvidia dependencies.
|
||||
|
||||
| Model | Category | Docker command |
|
||||
| --- | --- | --- |
|
||||
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
|
||||
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
|
||||
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
|
||||
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
|
||||
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
|
||||
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
|
||||
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
|
||||
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
|
||||
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
|
||||
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
|
||||
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
|
||||
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
|
||||
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
|
||||
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
|
||||
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
|
||||
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
|
||||
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
|
||||
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
|
||||
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
|
||||
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
|
||||
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
|
||||
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
|
||||
{{% /tab %}}
|
||||
|
||||
{{% tab tabName="GPU (CUDA 11)" %}}
|
||||
|
||||
|
||||
> To find out which version of CUDA you have available, you can check with `nvidia-smi` or `nvcc --version`. See also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
|
||||
|
||||
| Model | Category | Docker command |
|
||||
| --- | --- | --- |
|
||||
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
|
||||
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
|
||||
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
|
||||
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
|
||||
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
|
||||
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
|
||||
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
|
||||
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
|
||||
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
|
||||
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
|
||||
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
|
||||
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
|
||||
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
|
||||
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
|
||||
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
|
||||
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
|
||||
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
|
||||
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
|
||||
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
|
||||
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
|
||||
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
|
||||
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
|
||||
{{% /tab %}}
|
||||
|
||||
|
||||
{{% tab tabName="GPU (CUDA 12)" %}}
|
||||
|
||||
> To find out which version of CUDA you have available, you can check with `nvidia-smi` or `nvcc --version`. See also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
|
||||
|
||||
| Model | Category | Docker command |
|
||||
| --- | --- | --- |
|
||||
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
|
||||
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
|
||||
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
|
||||
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
|
||||
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
|
||||
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
|
||||
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
|
||||
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
|
||||
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
|
||||
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
|
||||
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
|
||||
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
|
||||
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
|
||||
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
|
||||
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
|
||||
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
|
||||
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
|
||||
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
|
||||
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
|
||||
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
|
||||
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
|
||||
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
|
||||
{{% /tab %}}
|
||||
|
||||
{{< /tabs >}}
|
||||
|
||||
{{% alert icon="💡" %}}
|
||||
**Tip** You can actually specify multiple models to start an instance with the models loaded, for example to have both llava and phi-2 configured:
|
||||
|
||||
```bash
|
||||
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
|
||||
```
|
||||
|
||||
{{% /alert %}}
|
Loading…
Add table
Add a link
Reference in a new issue