From ae1ec4e096b84ace92471f73305c3a1fcb3e02f8 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 23 Oct 2024 15:34:57 +0200
Subject: [PATCH] feat(vllm): expose 'load_format' (#3943)

Signed-off-by: Ettore Di Giacinto
---
 backend/python/vllm/backend.py | 2 ++
 core/backend/options.go        | 1 +
 core/config/backend_config.go  | 1 +
 3 files changed, 4 insertions(+)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 023a14bc..98ac5081 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         if request.Quantization != "":
             engine_args.quantization = request.Quantization
+        if request.LoadFormat != "":
+            engine_args.load_format = request.LoadFormat
         if request.GPUMemoryUtilization != 0:
             engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
         if request.TrustRemoteCode:
diff --git a/core/backend/options.go b/core/backend/options.go
index 90d563e0..82c582c8 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DraftModel:           c.DraftModel,
 		AudioPath:            c.VallE.AudioPath,
 		Quantization:         c.Quantization,
+		LoadFormat:           c.LoadFormat,
 		GPUMemoryUtilization: c.GPUMemoryUtilization,
 		TrustRemoteCode:      c.TrustRemoteCode,
 		EnforceEager:         c.EnforceEager,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index b386d096..c3d1063d 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -143,6 +143,7 @@ type LLMConfig struct {
 	DraftModel           string  `yaml:"draft_model"`
 	NDraft               int32   `yaml:"n_draft"`
 	Quantization         string  `yaml:"quantization"`
+	LoadFormat           string  `yaml:"load_format"`
 	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
 	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
 	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM