feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)

* Adding the following vLLM config options: disable_log_stats, dtype, limit_mm_per_prompt Signed-off-by: TheDropZone <brandonbeiler@gmail.com> * Using double quotes (") in the config.yaml file Signed-off-by: TheDropZone <brandonbeiler@gmail.com> * Adding in a missing colon Signed-off-by: TheDropZone <brandonbeiler@gmail.com> --------- Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
2025-05-20 02:24:59 +00:00 · 2025-02-18 13:27:58 -05:00 · 2025-02-18 13:27:58 -05:00 · 6a6e1a0ea9
commit 6a6e1a0ea9
parent 5b19af99ff
5 changed files with 64 additions and 23 deletions
--- a/core/backend/options.go
+++ b/core/backend/options.go
@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
 		TensorParallelSize:   int32(c.TensorParallelSize),
+		DisableLogStatus:     c.DisableLogStatus,
+		DType:                c.DType,
+		// vLLM limit_mm_per_prompt: per-modality limits from c.LimitMMPerPrompt, converted to int32
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
 		CacheTypeKey:         c.CacheTypeK,
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@ -130,25 +130,28 @@ type LLMConfig struct {
 	TrimSpace       []string `yaml:"trimspace"`
 	TrimSuffix      []string `yaml:"trimsuffix"`

-	ContextSize          *int      `yaml:"context_size"`
-	NUMA                 bool      `yaml:"numa"`
-	LoraAdapter          string    `yaml:"lora_adapter"`
-	LoraBase             string    `yaml:"lora_base"`
-	LoraAdapters         []string  `yaml:"lora_adapters"`
-	LoraScales           []float32 `yaml:"lora_scales"`
-	LoraScale            float32   `yaml:"lora_scale"`
-	NoMulMatQ            bool      `yaml:"no_mulmatq"`
-	DraftModel           string    `yaml:"draft_model"`
-	NDraft               int32     `yaml:"n_draft"`
-	Quantization         string    `yaml:"quantization"`
-	LoadFormat           string    `yaml:"load_format"`
-	GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
-	TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
-	EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
-	SwapSpace            int       `yaml:"swap_space"`             // vLLM
-	MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
-	TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-	MMProj               string    `yaml:"mmproj"`
+	ContextSize          *int               `yaml:"context_size"`
+	NUMA                 bool               `yaml:"numa"`
+	LoraAdapter          string             `yaml:"lora_adapter"`
+	LoraBase             string             `yaml:"lora_base"`
+	LoraAdapters         []string           `yaml:"lora_adapters"`
+	LoraScales           []float32          `yaml:"lora_scales"`
+	LoraScale            float32            `yaml:"lora_scale"`
+	NoMulMatQ            bool               `yaml:"no_mulmatq"`
+	DraftModel           string             `yaml:"draft_model"`
+	NDraft               int32              `yaml:"n_draft"`
+	Quantization         string             `yaml:"quantization"`
+	LoadFormat           string             `yaml:"load_format"`
+	GPUMemoryUtilization float32            `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool               `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool               `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int                `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int                `yaml:"max_model_len"`          // vLLM
+	TensorParallelSize   int                `yaml:"tensor_parallel_size"`   // vLLM
+	DisableLogStatus     bool               `yaml:"disable_log_stats"`      // vLLM
+	DType                string             `yaml:"dtype"`                  // vLLM
+	LimitMMPerPrompt     LimitMMPerPrompt   `yaml:"limit_mm_per_prompt"`    // vLLM
+	MMProj               string             `yaml:"mmproj"`

 	FlashAttention bool   `yaml:"flash_attention"`
 	NoKVOffloading bool   `yaml:"no_kv_offloading"`
@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }

+// LimitMMPerPrompt holds the per-modality values of vLLM's limit-mm-per-prompt option, read from the `limit_mm_per_prompt` YAML key of LLMConfig.
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt   int   `yaml:"image"` // value of the "image" key; NOTE(review): how the backend treats 0 (key omitted) is not visible here
+	LimitVideoPerPrompt   int   `yaml:"video"` // value of the "video" key
+	LimitAudioPerPrompt   int   `yaml:"audio"` // value of the "audio" key
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName    string `yaml:"model_base_name"`