From e19d7226f8116a45ae143712d112b692f99f8e95 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 8 Jan 2024 00:37:02 +0100 Subject: [PATCH] feat: more embedded models, coqui fixes, add model usage and description (#1556) * feat: add model descriptions and usage * remove default model gallery * models: add embeddings and tts * docs: update table * docs: updates * images: cleanup pip cache after install * images: always run apt-get clean * ux: improve gRPC connection errors * ux: improve some messages * fix: fix coqui when no AudioPath is passed by * embedded: add more models * Add usage * Reorder table --- Dockerfile | 8 +-- api/config/config.go | 12 ++++ .../python/common-env/transformers/install.sh | 9 +++ backend/python/coqui/coqui_server.py | 3 +- backend/python/exllama/install.sh | 6 +- backend/python/exllama2/install.sh | 6 +- backend/python/vall-e-x/install.sh | 6 +- docs/content/getting_started/_index.en.md | 61 +++++++++++++------ docs/content/model-compatibility/_index.en.md | 7 ++- embedded/models/all-minilm-l6-v2.yaml | 13 ++++ embedded/models/bark.yaml | 8 +++ embedded/models/bert-cpp.yaml | 23 +++++++ embedded/models/coqui.yaml | 9 +++ embedded/models/llava.yaml | 7 ++- embedded/models/mistral-openorca.yaml | 6 ++ embedded/models/rhasspy-voice-en-us-amy.yaml | 13 ++++ embedded/models/vall-e-x.yaml | 8 +++ embedded/models/whisper-base.yaml | 18 ++++++ pkg/grpc/client.go | 14 ++--- pkg/model/initializers.go | 17 +++++- pkg/model/loader.go | 7 ++- 21 files changed, 216 insertions(+), 45 deletions(-) create mode 100644 embedded/models/all-minilm-l6-v2.yaml create mode 100644 embedded/models/bark.yaml create mode 100644 embedded/models/bert-cpp.yaml create mode 100644 embedded/models/coqui.yaml create mode 100644 embedded/models/rhasspy-voice-en-us-amy.yaml create mode 100644 embedded/models/vall-e-x.yaml create mode 100644 embedded/models/whisper-base.yaml diff --git a/Dockerfile b/Dockerfile index 7f7ee817..4a980ef8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,6 @@ ENV BUILD_TYPE=${BUILD_TYPE} ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh" -ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]' ARG GO_TAGS="stablediffusion tinydream tts" RUN apt-get update && \ @@ -64,12 +63,12 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \ echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \ apt-get update && \ - apt-get install -y conda + apt-get install -y conda && 
apt-get clean ENV PATH="/root/.cargo/bin:${PATH}" RUN pip install --upgrade pip RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -RUN apt-get install -y espeak-ng espeak +RUN apt-get install -y espeak-ng espeak && apt-get clean ################################### ################################### @@ -127,10 +126,11 @@ ARG CUDA_MAJOR_VERSION=11 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0" ENV NVIDIA_VISIBLE_DEVICES=all +ENV PIP_CACHE_PURGE=true # Add FFmpeg RUN if [ "${FFMPEG}" = "true" ]; then \ - apt-get install -y ffmpeg \ + apt-get install -y ffmpeg && apt-get clean \ ; fi WORKDIR /build diff --git a/api/config/config.go b/api/config/config.go index 6aeb48d1..fed83d7a 100644 --- a/api/config/config.go +++ b/api/config/config.go @@ -55,6 +55,9 @@ type Config struct { CUDA bool `yaml:"cuda"` DownloadFiles []File `yaml:"download_files"` + + Description string `yaml:"description"` + Usage string `yaml:"usage"` } type File struct { @@ -326,6 +329,15 @@ func (cm *ConfigLoader) Preload(modelPath string) error { c.PredictionOptions.Model = md5Name cm.configs[i] = *c } + if cm.configs[i].Name != "" { + log.Info().Msgf("Model name: %s", cm.configs[i].Name) + } + if cm.configs[i].Description != "" { + log.Info().Msgf("Model description: %s", cm.configs[i].Description) + } + if cm.configs[i].Usage != "" { + log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage) + } } return nil } diff --git a/backend/python/common-env/transformers/install.sh b/backend/python/common-env/transformers/install.sh index b2fbd54c..42965bdb 100644 --- a/backend/python/common-env/transformers/install.sh +++ b/backend/python/common-env/transformers/install.sh @@ -13,3 +13,12 @@ if conda_env_exists "transformers" ; then else echo "Virtual environment already exists." 
fi + +if [ "$PIP_CACHE_PURGE" = true ] ; then + export PATH=$PATH:/opt/conda/bin + + # Activate conda environment + source activate transformers + + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/coqui/coqui_server.py b/backend/python/coqui/coqui_server.py index 70c76021..1c83c4ed 100644 --- a/backend/python/coqui/coqui_server.py +++ b/backend/python/coqui/coqui_server.py @@ -21,7 +21,7 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) -COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', 'en') +COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None) # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): @@ -38,6 +38,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") + self.AudioPath = None # List available 🐸TTS models print(TTS().list_models()) if os.path.isabs(request.AudioPath): diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh index fea582f0..1be2d05c 100755 --- a/backend/python/exllama/install.sh +++ b/backend/python/exllama/install.sh @@ -12,4 +12,8 @@ echo $CONDA_PREFIX git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/exllama/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/exllama/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/exllama2/install.sh b/backend/python/exllama2/install.sh index 11c9fa51..44d45364 100755 --- a/backend/python/exllama2/install.sh +++ b/backend/python/exllama2/install.sh @@ -11,4 +11,8 @@ echo $CONDA_PREFIX git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/exllamav2/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/exllamav2/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/vall-e-x/install.sh b/backend/python/vall-e-x/install.sh index 29f23684..2fe29d19 100644 --- a/backend/python/vall-e-x/install.sh +++ b/backend/python/vall-e-x/install.sh @@ -12,4 +12,8 @@ echo $CONDA_PREFIX git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/vall-e-x/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/vall-e-x/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/docs/content/getting_started/_index.en.md b/docs/content/getting_started/_index.en.md index b5543829..2f0062b5 100644 --- a/docs/content/getting_started/_index.en.md +++ b/docs/content/getting_started/_index.en.md @@ -143,39 +143,60 @@ Note: this feature currently is available only on master builds. You can run `local-ai` directly with a model name, and it will download the model and start the API with the model loaded. > Don't need GPU acceleration? 
use the CPU images which are lighter and do not have Nvidia dependencies +> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` + {{< tabs >}} {{% tab name="CPU-only" %}} -| Model | Docker command | -| --- | --- | -| phi2 | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` | - +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` | +| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` | +| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` | {{% /tab %}} {{% tab name="GPU (CUDA 11)" %}} -> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` -| Model | Docker command | -| --- | --- | -| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` | + +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` | +| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` | +| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 
all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` | {{% /tab %}} + {{% tab name="GPU (CUDA 12)" %}} -> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` - -| Model | Docker command | -| --- | --- | -| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` | +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` | +| bert-cpp | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` | +| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` | {{% /tab %}} @@ -201,7 +222,7 @@ For example, to start localai with phi-2, it's possible for instance to also use docker run -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml ``` -The file should be a valid YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}). +The file should be a valid LocalAI YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}). 
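A minimal configuration of this shape might look like the sketch below; the model name, GGUF filename and `huggingface://` URI are illustrative placeholders (adapt them to the model you actually want to serve), and the same fields are used by the `embedded/models/*.yaml` files added in this patch:

```yaml
# Sketch of a LocalAI model configuration file (values are placeholders).
name: my-model                # model name to use in the "model" field of API requests
context_size: 2048
f16: true

parameters:
  model: my-model.Q4_K_M.gguf # file loaded from the models directory

download_files:               # downloaded automatically if not already present
  - filename: my-model.Q4_K_M.gguf
    uri: huggingface://SomeOrg/SomeModel-GGUF/my-model.Q4_K_M.gguf

description: |
  Short human-readable description, logged when the model is preloaded.
usage: |
  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "my-model",
    "messages": [{"role": "user", "content": "How are you doing?"}]
  }'
```

If no `backend` is set, LocalAI tries the available backends in turn; a `backend:` field can be added to pin a specific one (see the model compatibility table).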
{{% /notice %}} ### Container images diff --git a/docs/content/model-compatibility/_index.en.md b/docs/content/model-compatibility/_index.en.md index 9f95d4e6..53daa60b 100644 --- a/docs/content/model-compatibility/_index.en.md +++ b/docs/content/model-compatibility/_index.en.md @@ -43,15 +43,18 @@ Besides llama based models, LocalAI is compatible also with other architectures. | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A | | [piper](https://github.com/rhasspy/piper) ([binding](https://github.com/mudler/go-piper)) | Any piper onnx model | no | Text to voice | no | no | N/A | | [falcon](https://github.com/cmp-nct/ggllm.cpp/tree/c12b2d65f732a0d8846db2244e070f0f3e73505c) ([binding](https://github.com/mudler/go-ggllm.cpp)) | Falcon *** | yes | GPT | no | yes | CUDA | -| `huggingface-embeddings` [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A | +| [sentencetransformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A | | `bark` | bark | no | Audio generation | no | no | yes | -| `AutoGPTQ` | GPTQ | yes | GPT | yes | no | N/A | +| `autogptq` | GPTQ | yes | GPT | yes | no | N/A | | `exllama` | GPTQ | yes | GPT only | no | no | N/A | | `diffusers` | SD,... | no | Image generation | no | no | N/A | | `vall-e-x` | Vall-E | no | Audio generation and Voice cloning | no | no | CPU/CUDA | | `vllm` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | | `exllama2` | GPTQ | yes | GPT only | no | no | N/A | | `transformers-musicgen` | | no | Audio generation | no | no | N/A | +| [tinydream](https://github.com/symisc/tiny-dream#tiny-dreaman-embedded-header-only-stable-diffusion-inference-c-librarypixlabiotiny-dream) | stablediffusion | no | Image | no | no | N/A | +| `coqui` | Coqui | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `petals` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "advanced" %}})). diff --git a/embedded/models/all-minilm-l6-v2.yaml b/embedded/models/all-minilm-l6-v2.yaml new file mode 100644 index 00000000..512d63a4 --- /dev/null +++ b/embedded/models/all-minilm-l6-v2.yaml @@ -0,0 +1,13 @@ +name: all-minilm-l6-v2 +backend: sentencetransformers +embeddings: true +parameters: + model: all-MiniLM-L6-v2 + +usage: | + You can test this model with curl like this: + + curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ + "input": "Your text string goes here", + "model": "all-minilm-l6-v2" + }' \ No newline at end of file diff --git a/embedded/models/bark.yaml b/embedded/models/bark.yaml new file mode 100644 index 00000000..da1b1db4 --- /dev/null +++ b/embedded/models/bark.yaml @@ -0,0 +1,8 @@ +usage: | + bark works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "bark", + "input":"Hello, this is a test!" 
+ }' | aplay +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/bert-cpp.yaml b/embedded/models/bert-cpp.yaml new file mode 100644 index 00000000..63d3c7b6 --- /dev/null +++ b/embedded/models/bert-cpp.yaml @@ -0,0 +1,23 @@ +backend: bert-embeddings +embeddings: true +f16: true + +gpu_layers: 90 +mmap: true +name: bert-cpp-minilm-v6 + +parameters: + model: bert-MiniLM-L6-v2q4_0.bin + +download_files: +- filename: "bert-MiniLM-L6-v2q4_0.bin" + sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad" + uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin" + +usage: | + You can test this model with curl like this: + + curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ + "input": "Your text string goes here", + "model": "bert-cpp-minilm-v6" + }' \ No newline at end of file diff --git a/embedded/models/coqui.yaml b/embedded/models/coqui.yaml new file mode 100644 index 00000000..5d67f241 --- /dev/null +++ b/embedded/models/coqui.yaml @@ -0,0 +1,9 @@ +usage: | + coqui works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "coqui", + "model": "tts_models/en/ljspeech/glow-tts", + "input":"Hello, this is a test!" + }' +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/llava.yaml b/embedded/models/llava.yaml index 662cac83..551eb26b 100644 --- a/embedded/models/llava.yaml +++ b/embedded/models/llava.yaml @@ -28,4 +28,9 @@ download_files: - filename: bakllava.gguf uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf - filename: bakllava-mmproj.gguf - uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf \ No newline at end of file + uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf + +usage: | + curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "llava", + "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}' diff --git a/embedded/models/mistral-openorca.yaml b/embedded/models/mistral-openorca.yaml index 66a42ab1..3a41c766 100644 --- a/embedded/models/mistral-openorca.yaml +++ b/embedded/models/mistral-openorca.yaml @@ -21,3 +21,9 @@ context_size: 4096 f16: true stopwords: - <|im_end|> + +usage: | + curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "mistral-openorca", + "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}] + }' \ No newline at end of file diff --git a/embedded/models/rhasspy-voice-en-us-amy.yaml b/embedded/models/rhasspy-voice-en-us-amy.yaml new file mode 100644 index 00000000..911293ca --- /dev/null +++ b/embedded/models/rhasspy-voice-en-us-amy.yaml @@ -0,0 +1,13 @@ +name: voice-en-us-amy-low +download_files: + - filename: voice-en-us-amy-low.tar.gz + uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz + + +usage: | + To test if this model works as expected, you can use the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: 
application/json" -d '{ + "model":"en-us-amy-low.onnx", + "input": "Hi, this is a test." + }' \ No newline at end of file diff --git a/embedded/models/vall-e-x.yaml b/embedded/models/vall-e-x.yaml new file mode 100644 index 00000000..b97015f6 --- /dev/null +++ b/embedded/models/vall-e-x.yaml @@ -0,0 +1,8 @@ +usage: | + Vall-e-x works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "vall-e-x", + "input":"Hello, this is a test!" + }' | aplay +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/whisper-base.yaml b/embedded/models/whisper-base.yaml new file mode 100644 index 00000000..f7ebd217 --- /dev/null +++ b/embedded/models/whisper-base.yaml @@ -0,0 +1,18 @@ +name: whisper +backend: whisper +parameters: + model: ggml-whisper-base.bin + +usage: | + ## example audio file + wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg + + ## Send the example audio file to the transcriptions endpoint + curl http://localhost:8080/v1/audio/transcriptions \ + -H "Content-Type: multipart/form-data" \ + -F file="@$PWD/gb1.ogg" -F model="whisper" + +download_files: +- filename: "ggml-whisper-base.bin" + sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe" + uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin" \ No newline at end of file diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index 9eab356d..6f7f83bd 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -50,7 +50,7 @@ func (c *Client) setBusy(v bool) { c.Unlock() } -func (c *Client) HealthCheck(ctx context.Context) bool { +func (c *Client) HealthCheck(ctx context.Context) (bool, error) { if !c.parallel { c.opMutex.Lock() defer c.opMutex.Unlock() @@ -59,8 +59,7 @@ func (c *Client) HealthCheck(ctx context.Context) bool { defer c.setBusy(false) conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { - fmt.Println(err) - return false + return false, err } defer conn.Close() client := pb.NewBackendClient(conn) @@ -71,15 +70,14 @@ func (c *Client) HealthCheck(ctx context.Context) bool { res, err := client.Health(ctx, &pb.HealthMessage{}) if err != nil { - fmt.Println(err) - - return false + return false, err } if string(res.Message) == "OK" { - return true + return true, nil } - return false + + return false, fmt.Errorf("health check failed: %s", res.Message) } func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) { diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index c2182918..e17fc27f 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -131,11 +131,15 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string // Wait for the service to start up ready := false for i := 0; i < o.grpcAttempts; i++ { - if client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) { + alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) + if alive { log.Debug().Msgf("GRPC Service Ready") ready = true break } + if err != nil && i == o.grpcAttempts-1 { + log.Error().Msgf("Failed starting/connecting to the gRPC service: %s", err.Error()) + } time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second) } @@ 
-176,7 +180,11 @@ func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (*grpc.C func (ml *ModelLoader) BackendLoader(opts ...Option) (client *grpc.Client, err error) { o := NewOptions(opts...) - log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString) + if o.model != "" { + log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString) + } else { + log.Info().Msgf("Loading model with backend %s", o.backendString) + } backend := strings.ToLower(o.backendString) if realBackend, exists := Aliases[backend]; exists { @@ -239,7 +247,10 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) { for _, b := range o.externalBackends { allBackendsToAutoLoad = append(allBackendsToAutoLoad, b) } - log.Info().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", ")) + + if o.model != "" { + log.Info().Msgf("Trying to load the model '%s' with all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", ")) + } for _, b := range allBackendsToAutoLoad { log.Info().Msgf("[%s] Attempting to load", b) diff --git a/pkg/model/loader.go b/pkg/model/loader.go index d02f9e84..686b4298 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -171,9 +171,10 @@ func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress { } else { client = m.GRPC(false, ml.wd) } - - if !client.HealthCheck(context.Background()) { - log.Debug().Msgf("GRPC Model not responding: %s", s) + alive, err := client.HealthCheck(context.Background()) + if !alive { + log.Warn().Msgf("GRPC Model not responding: %s", err.Error()) + log.Warn().Msgf("Deleting the process in order to recreate it") if !ml.grpcProcesses[s].IsAlive() { log.Debug().Msgf("GRPC Process is not responding: %s", s) // stop and delete the process, this forces to re-load the model and re-create again the service