From ae1ec4e096b84ace92471f73305c3a1fcb3e02f8 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Wed, 23 Oct 2024 15:34:57 +0200
Subject: [PATCH] feat(vllm): expose 'load_format' (#3943)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/python/vllm/backend.py | 2 ++
 core/backend/options.go        | 1 +
 core/config/backend_config.go  | 1 +
 3 files changed, 4 insertions(+)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 023a14bc..98ac5081 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -95,6 +95,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 
         if request.Quantization != "":
             engine_args.quantization = request.Quantization
+        if request.LoadFormat != "":
+            engine_args.load_format = request.LoadFormat
         if request.GPUMemoryUtilization != 0:
             engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
         if request.TrustRemoteCode:
diff --git a/core/backend/options.go b/core/backend/options.go
index 90d563e0..82c582c8 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -139,6 +139,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DraftModel:           c.DraftModel,
 		AudioPath:            c.VallE.AudioPath,
 		Quantization:         c.Quantization,
+		LoadFormat:           c.LoadFormat,
 		GPUMemoryUtilization: c.GPUMemoryUtilization,
 		TrustRemoteCode:      c.TrustRemoteCode,
 		EnforceEager:         c.EnforceEager,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index b386d096..c3d1063d 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -143,6 +143,7 @@ type LLMConfig struct {
 	DraftModel           string  `yaml:"draft_model"`
 	NDraft               int32   `yaml:"n_draft"`
 	Quantization         string  `yaml:"quantization"`
+	LoadFormat           string  `yaml:"load_format"`
 	GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
 	TrustRemoteCode      bool    `yaml:"trust_remote_code"`      // vLLM
 	EnforceEager         bool    `yaml:"enforce_eager"`          // vLLM