Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-20 02:24:59 +00:00
feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt
* Using " marks in the config.yaml file
* Adding in missing colon

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
This commit is contained in:
parent 5b19af99ff
commit 6a6e1a0ea9

5 changed files with 64 additions and 23 deletions
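For reference, a minimal sketch of a model config that enables the three new options together; the model name and the backend line are assumptions for illustration, while the keys and values mirror the example file updated at the end of this commit:

name: "my-vision-model"     # hypothetical model name
backend: "vllm"             # assumption: selects the vLLM backend
dtype: "float16"            # one of: auto, half, float16, bfloat16, float, float32
disable_log_stats: true     # disable vLLM's periodic log stats
limit_mm_per_prompt:        # per-prompt media limits, default 1 per modality
  image: 2
  video: 2
  audio: 2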
@@ -229,6 +229,11 @@ message ModelOptions {
   int32 MaxModelLen = 54;
   int32 TensorParallelSize = 55;
   string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;

   string MMProj = 41;
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }

         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
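One subtlety in the hunk above: because of the max(..., 1) clamp, setting any single modality populates all three keys, with unset modalities falling back to vLLM's documented default of 1. For example, a config that only raises the image limit (a sketch using the keys introduced in this commit):

limit_mm_per_prompt:
  image: 4

produces engine_args.limit_mm_per_prompt = {"image": 4, "video": 1, "audio": 1}.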
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:          int32(c.SwapSpace),
 		MaxModelLen:        int32(c.MaxModelLen),
 		TensorParallelSize: int32(c.TensorParallelSize),
+		DisableLogStatus:   c.DisableLogStatus,
+		DType:              c.DType,
+		// LimitMMPerPrompt vLLM
+		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:              c.MMProj,
 		FlashAttention:      c.FlashAttention,
 		CacheTypeKey:        c.CacheTypeK,
@@ -148,6 +148,9 @@ type LLMConfig struct {
 	SwapSpace          int    `yaml:"swap_space"`           // vLLM
 	MaxModelLen        int    `yaml:"max_model_len"`        // vLLM
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
+	DisableLogStatus   bool   `yaml:"disable_log_stats"`    // vLLM
+	DType              string `yaml:"dtype"`                // vLLM
+	LimitMMPerPrompt   LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
 	MMProj             string `yaml:"mmproj"`

 	FlashAttention bool `yaml:"flash_attention"`
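Worth noting: the DisableLogStatus field binds to the yaml key disable_log_stats (not disable_log_status), so that is the spelling a model config must use, matching the example file updated below:

disable_log_stats: true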
@@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }

+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt int `yaml:"image"`
+	LimitVideoPerPrompt int `yaml:"video"`
+	LimitAudioPerPrompt int `yaml:"audio"`
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
@@ -16,6 +16,8 @@ config_file: |
       use_tokenizer_template: true
       # Uncomment to specify a quantization method (optional)
       # quantization: "awq"
+      # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+      # dtype: "float16"
       # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
       # gpu_memory_utilization: 0.5
       # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
       # Allows you to partition and run large models. Performance gains are limited.
       # https://github.com/vllm-project/vllm/issues/1435
       # tensor_parallel_size: 2
+      # Uncomment to disable log stats
+      # disable_log_stats: true
+      # Uncomment to specify Multi-Modal limits per prompt, defaults to 1 per modality if not specified
+      # limit_mm_per_prompt:
+      #   image: 2
+      #   video: 2
+      #   audio: 2