From e19d7226f8116a45ae143712d112b692f99f8e95 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 8 Jan 2024 00:37:02 +0100 Subject: [PATCH] feat: more embedded models, coqui fixes, add model usage and description (#1556) * feat: add model descriptions and usage * remove default model gallery * models: add embeddings and tts * docs: update table * docs: updates * images: cleanup pip cache after install * images: always run apt-get clean * ux: improve gRPC connection errors * ux: improve some messages * fix: fix coqui when no AudioPath is passed by * embedded: add more models * Add usage * Reorder table --- Dockerfile | 8 +-- api/config/config.go | 12 ++++ .../python/common-env/transformers/install.sh | 9 +++ backend/python/coqui/coqui_server.py | 3 +- backend/python/exllama/install.sh | 6 +- backend/python/exllama2/install.sh | 6 +- backend/python/vall-e-x/install.sh | 6 +- docs/content/getting_started/_index.en.md | 61 +++++++++++++------ docs/content/model-compatibility/_index.en.md | 7 ++- embedded/models/all-minilm-l6-v2.yaml | 13 ++++ embedded/models/bark.yaml | 8 +++ embedded/models/bert-cpp.yaml | 23 +++++++ embedded/models/coqui.yaml | 9 +++ embedded/models/llava.yaml | 7 ++- embedded/models/mistral-openorca.yaml | 6 ++ embedded/models/rhasspy-voice-en-us-amy.yaml | 13 ++++ embedded/models/vall-e-x.yaml | 8 +++ embedded/models/whisper-base.yaml | 18 ++++++ pkg/grpc/client.go | 14 ++--- pkg/model/initializers.go | 17 +++++- pkg/model/loader.go | 7 ++- 21 files changed, 216 insertions(+), 45 deletions(-) create mode 100644 embedded/models/all-minilm-l6-v2.yaml create mode 100644 embedded/models/bark.yaml create mode 100644 embedded/models/bert-cpp.yaml create mode 100644 embedded/models/coqui.yaml create mode 100644 embedded/models/rhasspy-voice-en-us-amy.yaml create mode 100644 embedded/models/vall-e-x.yaml create mode 100644 embedded/models/whisper-base.yaml diff --git a/Dockerfile b/Dockerfile index 7f7ee817..4a980ef8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,6 @@ ENV BUILD_TYPE=${BUILD_TYPE} ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh" -ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]' ARG GO_TAGS="stablediffusion tinydream tts" RUN apt-get update && \ @@ -64,12 +63,12 @@ RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmo echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \ echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \ apt-get update && \ - apt-get install -y conda + apt-get install -y conda && 
apt-get clean ENV PATH="/root/.cargo/bin:${PATH}" RUN pip install --upgrade pip RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -RUN apt-get install -y espeak-ng espeak +RUN apt-get install -y espeak-ng espeak && apt-get clean ################################### ################################### @@ -127,10 +126,11 @@ ARG CUDA_MAJOR_VERSION=11 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0" ENV NVIDIA_VISIBLE_DEVICES=all +ENV PIP_CACHE_PURGE=true # Add FFmpeg RUN if [ "${FFMPEG}" = "true" ]; then \ - apt-get install -y ffmpeg \ + apt-get install -y ffmpeg && apt-get clean \ ; fi WORKDIR /build diff --git a/api/config/config.go b/api/config/config.go index 6aeb48d1..fed83d7a 100644 --- a/api/config/config.go +++ b/api/config/config.go @@ -55,6 +55,9 @@ type Config struct { CUDA bool `yaml:"cuda"` DownloadFiles []File `yaml:"download_files"` + + Description string `yaml:"description"` + Usage string `yaml:"usage"` } type File struct { @@ -326,6 +329,15 @@ func (cm *ConfigLoader) Preload(modelPath string) error { c.PredictionOptions.Model = md5Name cm.configs[i] = *c } + if cm.configs[i].Name != "" { + log.Info().Msgf("Model name: %s", cm.configs[i].Name) + } + if cm.configs[i].Description != "" { + log.Info().Msgf("Model description: %s", cm.configs[i].Description) + } + if cm.configs[i].Usage != "" { + log.Info().Msgf("Model usage: \n%s", cm.configs[i].Usage) + } } return nil } diff --git a/backend/python/common-env/transformers/install.sh b/backend/python/common-env/transformers/install.sh index b2fbd54c..42965bdb 100644 --- a/backend/python/common-env/transformers/install.sh +++ b/backend/python/common-env/transformers/install.sh @@ -13,3 +13,12 @@ if conda_env_exists "transformers" ; then else echo "Virtual environment already exists." 
fi + +if [ "$PIP_CACHE_PURGE" = true ] ; then + export PATH=$PATH:/opt/conda/bin + + # Activate conda environment + source activate transformers + + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/coqui/coqui_server.py b/backend/python/coqui/coqui_server.py index 70c76021..1c83c4ed 100644 --- a/backend/python/coqui/coqui_server.py +++ b/backend/python/coqui/coqui_server.py @@ -21,7 +21,7 @@ _ONE_DAY_IN_SECONDS = 60 * 60 * 24 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) -COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', 'en') +COQUI_LANGUAGE = os.environ.get('COQUI_LANGUAGE', None) # Implement the BackendServicer class with the service methods class BackendServicer(backend_pb2_grpc.BackendServicer): @@ -38,6 +38,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if not torch.cuda.is_available() and request.CUDA: return backend_pb2.Result(success=False, message="CUDA is not available") + self.AudioPath = None # List available 🐸TTS models print(TTS().list_models()) if os.path.isabs(request.AudioPath): diff --git a/backend/python/exllama/install.sh b/backend/python/exllama/install.sh index fea582f0..1be2d05c 100755 --- a/backend/python/exllama/install.sh +++ b/backend/python/exllama/install.sh @@ -12,4 +12,8 @@ echo $CONDA_PREFIX git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/exllama/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/exllama/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/exllama2/install.sh b/backend/python/exllama2/install.sh index 11c9fa51..44d45364 100755 --- a/backend/python/exllama2/install.sh +++ b/backend/python/exllama2/install.sh @@ -11,4 +11,8 @@ echo $CONDA_PREFIX git clone https://github.com/turboderp/exllamav2 $CONDA_PREFIX/exllamav2 && pushd $CONDA_PREFIX/exllamav2 && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/exllamav2/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/exllamav2/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/backend/python/vall-e-x/install.sh b/backend/python/vall-e-x/install.sh index 29f23684..2fe29d19 100644 --- a/backend/python/vall-e-x/install.sh +++ b/backend/python/vall-e-x/install.sh @@ -12,4 +12,8 @@ echo $CONDA_PREFIX git clone https://github.com/Plachtaa/VALL-E-X.git $CONDA_PREFIX/vall-e-x && pushd $CONDA_PREFIX/vall-e-x && git checkout -b build $SHA && pip install -r requirements.txt && popd -cp -rfv $CONDA_PREFIX/vall-e-x/* ./ \ No newline at end of file +cp -rfv $CONDA_PREFIX/vall-e-x/* ./ + +if [ "$PIP_CACHE_PURGE" = true ] ; then + pip cache purge +fi \ No newline at end of file diff --git a/docs/content/getting_started/_index.en.md b/docs/content/getting_started/_index.en.md index b5543829..2f0062b5 100644 --- a/docs/content/getting_started/_index.en.md +++ b/docs/content/getting_started/_index.en.md @@ -143,39 +143,60 @@ Note: this feature currently is available only on master builds. You can run `local-ai` directly with a model name, and it will download the model and start the API with the model loaded. > Don't need GPU acceleration? 
use the CPU images which are lighter and do not have Nvidia dependencies +> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` + {{< tabs >}} {{% tab name="CPU-only" %}} -| Model | Docker command | -| --- | --- | -| phi2 | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` | - +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` | +| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` | +| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` | {{% /tab %}} {{% tab name="GPU (CUDA 11)" %}} -> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` -| Model | Docker command | -| --- | --- | -| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` | + +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` | +| [bert-cpp](https://github.com/skeskinen/bert.cpp) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` | +| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 
all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` | {{% /tab %}} + {{% tab name="GPU (CUDA 12)" %}} -> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` - -| Model | Docker command | -| --- | --- | -| phi-2 | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` | -| llava | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` | -| mistral-openorca | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` | +| Model | Category | Docker command | +| --- | --- | --- | +| [phi-2](https://huggingface.co/microsoft/phi-2) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` | +| [llava](https://github.com/SkunkworksAI/BakLLaVA) | Multimodal LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core llava``` | +| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | LLM | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` | +| bert-cpp | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` | +| all-minilm-l6-v2 | Embeddings | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` | +| whisper-base | Audio to Text | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` | +| rhasspy-voice-en-us-amy | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` | +| coqui | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` | +| bark | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` | +| vall-e-x | Text to Audio | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` | {{% /tab %}} @@ -201,7 +222,7 @@ For example, to start localai with phi-2, it's possible for instance to also use docker run -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml ``` -The file should be a valid YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}). +The file should be a valid LocalAI YAML configuration file, for the full syntax see [advanced]({{%relref "advanced" %}}). 
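A minimal configuration of this shape might look like the sketch below; the model name, GGUF filename and `huggingface://` URI are illustrative placeholders (adapt them to the model you actually want to serve), and the same fields are used by the `embedded/models/*.yaml` files added in this patch:

```yaml
# Sketch of a LocalAI model configuration file (values are placeholders).
name: my-model                # model name to use in the "model" field of API requests
context_size: 2048
f16: true

parameters:
  model: my-model.Q4_K_M.gguf # file loaded from the models directory

download_files:               # downloaded automatically if not already present
  - filename: my-model.Q4_K_M.gguf
    uri: huggingface://SomeOrg/SomeModel-GGUF/my-model.Q4_K_M.gguf

description: |
  Short human-readable description, logged when the model is preloaded.
usage: |
  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "my-model",
    "messages": [{"role": "user", "content": "How are you doing?"}]
  }'
```

If no `backend` is set, LocalAI tries the available backends in turn; a `backend:` field can be added to pin a specific one (see the model compatibility table).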
{{% /notice %}} ### Container images diff --git a/docs/content/model-compatibility/_index.en.md b/docs/content/model-compatibility/_index.en.md index 9f95d4e6..53daa60b 100644 --- a/docs/content/model-compatibility/_index.en.md +++ b/docs/content/model-compatibility/_index.en.md @@ -43,15 +43,18 @@ Besides llama based models, LocalAI is compatible also with other architectures. | [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A | | [piper](https://github.com/rhasspy/piper) ([binding](https://github.com/mudler/go-piper)) | Any piper onnx model | no | Text to voice | no | no | N/A | | [falcon](https://github.com/cmp-nct/ggllm.cpp/tree/c12b2d65f732a0d8846db2244e070f0f3e73505c) ([binding](https://github.com/mudler/go-ggllm.cpp)) | Falcon *** | yes | GPT | no | yes | CUDA | -| `huggingface-embeddings` [sentence-transformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A | +| [sentencetransformers](https://github.com/UKPLab/sentence-transformers) | BERT | no | Embeddings only | yes | no | N/A | | `bark` | bark | no | Audio generation | no | no | yes | -| `AutoGPTQ` | GPTQ | yes | GPT | yes | no | N/A | +| `autogptq` | GPTQ | yes | GPT | yes | no | N/A | | `exllama` | GPTQ | yes | GPT only | no | no | N/A | | `diffusers` | SD,... | no | Image generation | no | no | N/A | | `vall-e-x` | Vall-E | no | Audio generation and Voice cloning | no | no | CPU/CUDA | | `vllm` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | | `exllama2` | GPTQ | yes | GPT only | no | no | N/A | | `transformers-musicgen` | | no | Audio generation | no | no | N/A | +| [tinydream](https://github.com/symisc/tiny-dream#tiny-dreaman-embedded-header-only-stable-diffusion-inference-c-librarypixlabiotiny-dream) | stablediffusion | no | Image | no | no | N/A | +| `coqui` | Coqui | no | Audio generation and Voice cloning | no | no | CPU/CUDA | +| `petals` | Various GPTs and quantization formats | yes | GPT | no | no | CPU/CUDA | Note: any backend name listed above can be used in the `backend` field of the model configuration file (See [the advanced section]({{%relref "advanced" %}})). diff --git a/embedded/models/all-minilm-l6-v2.yaml b/embedded/models/all-minilm-l6-v2.yaml new file mode 100644 index 00000000..512d63a4 --- /dev/null +++ b/embedded/models/all-minilm-l6-v2.yaml @@ -0,0 +1,13 @@ +name: all-minilm-l6-v2 +backend: sentencetransformers +embeddings: true +parameters: + model: all-MiniLM-L6-v2 + +usage: | + You can test this model with curl like this: + + curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ + "input": "Your text string goes here", + "model": "all-minilm-l6-v2" + }' \ No newline at end of file diff --git a/embedded/models/bark.yaml b/embedded/models/bark.yaml new file mode 100644 index 00000000..da1b1db4 --- /dev/null +++ b/embedded/models/bark.yaml @@ -0,0 +1,8 @@ +usage: | + bark works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "bark", + "input":"Hello, this is a test!" 
+ }' | aplay +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/bert-cpp.yaml b/embedded/models/bert-cpp.yaml new file mode 100644 index 00000000..63d3c7b6 --- /dev/null +++ b/embedded/models/bert-cpp.yaml @@ -0,0 +1,23 @@ +backend: bert-embeddings +embeddings: true +f16: true + +gpu_layers: 90 +mmap: true +name: bert-cpp-minilm-v6 + +parameters: + model: bert-MiniLM-L6-v2q4_0.bin + +download_files: +- filename: "bert-MiniLM-L6-v2q4_0.bin" + sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad" + uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin" + +usage: | + You can test this model with curl like this: + + curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ + "input": "Your text string goes here", + "model": "bert-cpp-minilm-v6" + }' \ No newline at end of file diff --git a/embedded/models/coqui.yaml b/embedded/models/coqui.yaml new file mode 100644 index 00000000..5d67f241 --- /dev/null +++ b/embedded/models/coqui.yaml @@ -0,0 +1,9 @@ +usage: | + coqui works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "coqui", + "model": "tts_models/en/ljspeech/glow-tts", + "input":"Hello, this is a test!" + }' +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/llava.yaml b/embedded/models/llava.yaml index 662cac83..551eb26b 100644 --- a/embedded/models/llava.yaml +++ b/embedded/models/llava.yaml @@ -28,4 +28,9 @@ download_files: - filename: bakllava.gguf uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf - filename: bakllava-mmproj.gguf - uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf \ No newline at end of file + uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf + +usage: | + curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "llava", + "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}' diff --git a/embedded/models/mistral-openorca.yaml b/embedded/models/mistral-openorca.yaml index 66a42ab1..3a41c766 100644 --- a/embedded/models/mistral-openorca.yaml +++ b/embedded/models/mistral-openorca.yaml @@ -21,3 +21,9 @@ context_size: 4096 f16: true stopwords: - <|im_end|> + +usage: | + curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "mistral-openorca", + "messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}] + }' \ No newline at end of file diff --git a/embedded/models/rhasspy-voice-en-us-amy.yaml b/embedded/models/rhasspy-voice-en-us-amy.yaml new file mode 100644 index 00000000..911293ca --- /dev/null +++ b/embedded/models/rhasspy-voice-en-us-amy.yaml @@ -0,0 +1,13 @@ +name: voice-en-us-amy-low +download_files: + - filename: voice-en-us-amy-low.tar.gz + uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz + + +usage: | + To test if this model works as expected, you can use the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: 
application/json" -d '{ + "model":"en-us-amy-low.onnx", + "input": "Hi, this is a test." + }' \ No newline at end of file diff --git a/embedded/models/vall-e-x.yaml b/embedded/models/vall-e-x.yaml new file mode 100644 index 00000000..b97015f6 --- /dev/null +++ b/embedded/models/vall-e-x.yaml @@ -0,0 +1,8 @@ +usage: | + Vall-e-x works without any configuration, to test it, you can run the following curl command: + + curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "backend": "vall-e-x", + "input":"Hello, this is a test!" + }' | aplay +# TODO: This is a placeholder until we manage to pre-load HF/Transformers models \ No newline at end of file diff --git a/embedded/models/whisper-base.yaml b/embedded/models/whisper-base.yaml new file mode 100644 index 00000000..f7ebd217 --- /dev/null +++ b/embedded/models/whisper-base.yaml @@ -0,0 +1,18 @@ +name: whisper +backend: whisper +parameters: + model: ggml-whisper-base.bin + +usage: | + ## example audio file + wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg + + ## Send the example audio file to the transcriptions endpoint + curl http://localhost:8080/v1/audio/transcriptions \ + -H "Content-Type: multipart/form-data" \ + -F file="@$PWD/gb1.ogg" -F model="whisper" + +download_files: +- filename: "ggml-whisper-base.bin" + sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe" + uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin" \ No newline at end of file diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go index 9eab356d..6f7f83bd 100644 --- a/pkg/grpc/client.go +++ b/pkg/grpc/client.go @@ -50,7 +50,7 @@ func (c *Client) setBusy(v bool) { c.Unlock() } -func (c *Client) HealthCheck(ctx context.Context) bool { +func (c *Client) HealthCheck(ctx context.Context) (bool, error) { if !c.parallel { c.opMutex.Lock() defer c.opMutex.Unlock() @@ -59,8 +59,7 @@ func (c *Client) HealthCheck(ctx context.Context) bool { defer c.setBusy(false) conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { - fmt.Println(err) - return false + return false, err } defer conn.Close() client := pb.NewBackendClient(conn) @@ -71,15 +70,14 @@ func (c *Client) HealthCheck(ctx context.Context) bool { res, err := client.Health(ctx, &pb.HealthMessage{}) if err != nil { - fmt.Println(err) - - return false + return false, err } if string(res.Message) == "OK" { - return true + return true, nil } - return false + + return false, fmt.Errorf("health check failed: %s", res.Message) } func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.EmbeddingResult, error) { diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index c2182918..e17fc27f 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -131,11 +131,15 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string // Wait for the service to start up ready := false for i := 0; i < o.grpcAttempts; i++ { - if client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) { + alive, err := client.GRPC(o.parallelRequests, ml.wd).HealthCheck(context.Background()) + if alive { log.Debug().Msgf("GRPC Service Ready") ready = true break } + if err != nil && i == o.grpcAttempts-1 { + log.Error().Msgf("Failed starting/connecting to the gRPC service: %s", err.Error()) + } time.Sleep(time.Duration(o.grpcAttemptsDelay) * time.Second) } @@ 
-176,7 +180,11 @@ func (ml *ModelLoader) resolveAddress(addr ModelAddress, parallel bool) (*grpc.C func (ml *ModelLoader) BackendLoader(opts ...Option) (client *grpc.Client, err error) { o := NewOptions(opts...) - log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString) + if o.model != "" { + log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString) + } else { + log.Info().Msgf("Loading model with backend %s", o.backendString) + } backend := strings.ToLower(o.backendString) if realBackend, exists := Aliases[backend]; exists { @@ -239,7 +247,10 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) { for _, b := range o.externalBackends { allBackendsToAutoLoad = append(allBackendsToAutoLoad, b) } - log.Info().Msgf("Loading model '%s' greedly from all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", ")) + + if o.model != "" { + log.Info().Msgf("Trying to load the model '%s' with all the available backends: %s", o.model, strings.Join(allBackendsToAutoLoad, ", ")) + } for _, b := range allBackendsToAutoLoad { log.Info().Msgf("[%s] Attempting to load", b) diff --git a/pkg/model/loader.go b/pkg/model/loader.go index d02f9e84..686b4298 100644 --- a/pkg/model/loader.go +++ b/pkg/model/loader.go @@ -171,9 +171,10 @@ func (ml *ModelLoader) CheckIsLoaded(s string) ModelAddress { } else { client = m.GRPC(false, ml.wd) } - - if !client.HealthCheck(context.Background()) { - log.Debug().Msgf("GRPC Model not responding: %s", s) + alive, err := client.HealthCheck(context.Background()) + if !alive { + log.Warn().Msgf("GRPC Model not responding: %s", err.Error()) + log.Warn().Msgf("Deleting the process in order to recreate it") if !ml.grpcProcesses[s].IsAlive() { log.Debug().Msgf("GRPC Process is not responding: %s", s) // stop and delete the process, this forces to re-load the model and re-create again the service