Adding the following vLLM config options: disable_log_stats, dtype, limit_mm_per_prompt

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
TheDropZone 2025-02-17 16:15:55 -05:00
parent 6424f0666d
commit f0f2c87553
5 changed files with 64 additions and 23 deletions

@@ -16,6 +16,8 @@ config_file: |
 use_tokenizer_template: true
 # Uncomment to specify a quantization method (optional)
 # quantization: "awq"
+# Uncomment to set dtype, choices are: 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'. awq on vLLM does not support bfloat16
+# dtype: 'float16'
 # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
 # gpu_memory_utilization: 0.5
 # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
 # Allows you to partition and run large models. Performance gains are limited.
 # https://github.com/vllm-project/vllm/issues/1435
 # tensor_parallel_size: 2
+# Uncomment to disable log stats
+# disable_log_stats: true
+# Uncomment to specify multi-modal limits per prompt (defaults to 1 per modality if not specified)
+# limit_mm_per_prompt:
+#   image: 2
+#   video: 2
+#   audio: 2
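
Taken together, the three new options can be combined in a single config_file. The snippet below is a minimal illustrative sketch, not part of the commit: the values are examples only, and the surrounding fields mirror the commented examples already in the file.

config_file: |
  use_tokenizer_template: true
  # AWQ-quantized weights need a non-bfloat16 dtype on vLLM
  quantization: "awq"
  dtype: 'float16'
  # Silence vLLM's periodic throughput/stats log lines
  disable_log_stats: true
  # Allow up to two images per prompt for multi-modal models
  limit_mm_per_prompt:
    image: 2

For text-only models, limit_mm_per_prompt can simply be omitted, since each modality already defaults to a limit of 1.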