Bump vLLM version + more options when loading models in vLLM (#1782)

* Bump vLLM version to 0.3.2
* Add vLLM model loading options
* Remove transformers-exllama
* Fix exllama install
2025-05-20 10:35:01 +00:00 · 2024-03-01 16:48:53 -05:00 · 2024-03-01 16:48:53 -05:00 · 939411300a
commit 939411300a
parent 1c312685aa
28 changed files with 736 additions and 641 deletions
--- a/backend/python/autogptq/autogptq.yml
+++ b/backend/python/autogptq/autogptq.yml
@ -71,7 +71,7 @@ dependencies:
      - regex==2023.10.3
      - requests==2.31.0
      - rouge==1.0.1
-      - safetensors==0.3.3
+      - safetensors>=0.3.3
      - six==1.16.0
      - sympy==1.12
      - tokenizers==0.14.0
--- a/backend/python/autogptq/backend_pb2.py
+++ b/backend/python/autogptq/backend_pb2.py
--- a/backend/python/bark/backend_pb2.py
+++ b/backend/python/bark/backend_pb2.py
--- a/backend/python/common-env/transformers/transformers-nvidia.yml
+++ b/backend/python/common-env/transformers/transformers-nvidia.yml
@ -81,7 +81,7 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
+      - safetensors>=0.4.1
      - scipy==1.11.3
      - six==1.16.0
      - sympy==1.12
@ -113,7 +113,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0  # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.0  # Required for Gemma.
      - xformers==0.0.23.post1  
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers-rocm.yml
+++ b/backend/python/common-env/transformers/transformers-rocm.yml
@ -71,7 +71,7 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
+      - safetensors>=0.4.1
      - scipy==1.11.3
      - six==1.16.0
      - sympy==1.12
@ -103,7 +103,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0 # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.0 # Required for Gemma.
      - xformers==0.0.23.post1
 prefix: /opt/conda/envs/transformers
--- a/backend/python/common-env/transformers/transformers.yml
+++ b/backend/python/common-env/transformers/transformers.yml
@ -69,7 +69,7 @@ dependencies:
      - requests==2.31.0
      - rouge==1.0.1
      - s3transfer==0.7.0
-      - safetensors==0.3.3
+      - safetensors>=0.4.1
      - scipy==1.11.3
      - six==1.16.0
      - sympy==1.12
@ -101,7 +101,7 @@ dependencies:
      - sudachipy
      - sudachidict_core
      - vocos
-      - vllm==0.2.7
-      - transformers>=4.36.0  # Required for Mixtral.
+      - vllm==0.3.2
+      - transformers>=4.38.0  # Required for Gemma.
      - xformers==0.0.23.post1  
 prefix: /opt/conda/envs/transformers
--- a/backend/python/coqui/backend_pb2.py
+++ b/backend/python/coqui/backend_pb2.py
--- a/backend/python/diffusers/backend_pb2.py
+++ b/backend/python/diffusers/backend_pb2.py
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@ -1,7 +1,8 @@
+export CONDA_ENV_PATH = "exllama.yml"
+
 .PHONY: exllama
 exllama:
-	$(MAKE) -C ../common-env/transformers
-	bash install.sh
+	bash install.sh ${CONDA_ENV_PATH}

 .PHONY: run
 run:
--- a/backend/python/exllama/backend_pb2.py
+++ b/backend/python/exllama/backend_pb2.py
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@ -1,14 +1,22 @@
 #!/bin/bash
+set -ex

-##
-## A bash script installs the required dependencies of VALL-E-X and prepares the environment
 export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
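+# NOTE: despite its name, this succeeds (returns 0) when `conda list` fails for the given environment name, i.e. when the environment is missing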
+conda_env_exists(){
+    ! conda list --name "${@}" >/dev/null 2>/dev/null
+}

-echo $CONDA_PREFIX
+if conda_env_exists "exllama" ; then
+    echo "Creating virtual environment..."
+    conda env create --name exllama --file $1
+    echo "Virtual environment created."
+else
+    echo "Virtual environment already exists."
+fi

+source activate exllama

 git clone https://github.com/turboderp/exllama $CONDA_PREFIX/exllama && pushd $CONDA_PREFIX/exllama && pip install -r requirements.txt && popd

--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@ -2,11 +2,10 @@

 ##
 ## A bash script wrapper that runs the exllama server with conda
-
 export PATH=$PATH:/opt/conda/bin

 # Activate conda environment
-source activate transformers
+source activate exllama

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/exllama2/backend_pb2.py
+++ b/backend/python/exllama2/backend_pb2.py
--- a/backend/python/mamba/backend_pb2.py
+++ b/backend/python/mamba/backend_pb2.py
--- a/backend/python/petals/backend_pb2.py
+++ b/backend/python/petals/backend_pb2.py
--- a/backend/python/sentencetransformers/backend_pb2.py
+++ b/backend/python/sentencetransformers/backend_pb2.py
--- a/backend/python/transformers-musicgen/backend_pb2.py
+++ b/backend/python/transformers-musicgen/backend_pb2.py
--- a/backend/python/transformers/backend_pb2.py
+++ b/backend/python/transformers/backend_pb2.py
--- a/backend/python/vall-e-x/backend_pb2.py
+++ b/backend/python/vall-e-x/backend_pb2.py
--- a/backend/python/vall-e-x/ttsvalle.yml
+++ b/backend/python/vall-e-x/ttsvalle.yml
@ -79,7 +79,7 @@ dependencies:
      - pypinyin==0.49.0
      - python-multipart==0.0.6
      - regex==2023.10.3
-      - safetensors==0.4.0
+      - safetensors>=0.4.0
      - semantic-version==2.10.0
      - soundfile==0.12.1
      - starlette==0.27.0
--- a/backend/python/vllm/backend_pb2.py
+++ b/backend/python/vllm/backend_pb2.py
--- a/backend/python/vllm/backend_vllm.py
+++ b/backend/python/vllm/backend_vllm.py
@ -88,6 +88,16 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        if request.Quantization != "":
            engine_args.quantization = request.Quantization
+        if request.GPUMemoryUtilization != 0:
+            engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
+        if request.TrustRemoteCode:
+            engine_args.trust_remote_code = request.TrustRemoteCode
+        if request.EnforceEager:
+            engine_args.enforce_eager = request.EnforceEager
+        if request.SwapSpace != 0:
+            engine_args.swap_space = request.SwapSpace
+        if request.MaxModelLen != 0:
+            engine_args.max_model_len = request.MaxModelLen

        try:
            self.llm = AsyncLLMEngine.from_engine_args(engine_args)