Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-20 02:24:59 +00:00
feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt
* Using " marks in the config.yaml file
* Adding in missing colon

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
This commit is contained in:
parent 5b19af99ff
commit 6a6e1a0ea9

5 changed files with 64 additions and 23 deletions
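For reference, a minimal sketch of a model config that enables the three new options together; the model name and the backend line are assumptions for illustration, while the keys and values mirror the example file updated at the end of this commit:

name: "my-vision-model"     # hypothetical model name
backend: "vllm"             # assumption: selects the vLLM backend
dtype: "float16"            # one of: auto, half, float16, bfloat16, float, float32
disable_log_stats: true     # disable vLLM's periodic log stats
limit_mm_per_prompt:        # per-prompt media limits, default 1 per modality
  image: 2
  video: 2
  audio: 2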
@@ -229,6 +229,11 @@ message ModelOptions {
   int32 MaxModelLen = 54;
   int32 TensorParallelSize = 55;
   string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;

   string MMProj = 41;
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }

         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
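One subtlety in the hunk above: because of the max(..., 1) clamp, setting any single modality populates all three keys, with unset modalities falling back to vLLM's documented default of 1. For example, a config that only raises the image limit (a sketch using the keys introduced in this commit):

limit_mm_per_prompt:
  image: 4

produces engine_args.limit_mm_per_prompt = {"image": 4, "video": 1, "audio": 1}.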
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:          int32(c.SwapSpace),
 		MaxModelLen:        int32(c.MaxModelLen),
 		TensorParallelSize: int32(c.TensorParallelSize),
+		DisableLogStatus:   c.DisableLogStatus,
+		DType:              c.DType,
+		// LimitMMPerPrompt vLLM
+		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:              c.MMProj,
 		FlashAttention:      c.FlashAttention,
 		CacheTypeKey:        c.CacheTypeK,
@@ -148,6 +148,9 @@ type LLMConfig struct {
 	SwapSpace          int    `yaml:"swap_space"`           // vLLM
 	MaxModelLen        int    `yaml:"max_model_len"`        // vLLM
 	TensorParallelSize int    `yaml:"tensor_parallel_size"` // vLLM
+	DisableLogStatus   bool   `yaml:"disable_log_stats"`    // vLLM
+	DType              string `yaml:"dtype"`                // vLLM
+	LimitMMPerPrompt   LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
 	MMProj             string `yaml:"mmproj"`

 	FlashAttention bool `yaml:"flash_attention"`
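Worth noting: the DisableLogStatus field binds to the yaml key disable_log_stats (not disable_log_status), so that is the spelling a model config must use, matching the example file updated below:

disable_log_stats: true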
@@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }

+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt int `yaml:"image"`
+	LimitVideoPerPrompt int `yaml:"video"`
+	LimitAudioPerPrompt int `yaml:"audio"`
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
@@ -16,6 +16,8 @@ config_file: |
       use_tokenizer_template: true
       # Uncomment to specify a quantization method (optional)
       # quantization: "awq"
+      # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+      # dtype: "float16"
       # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
       # gpu_memory_utilization: 0.5
       # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
       # Allows you to partition and run large models. Performance gains are limited.
       # https://github.com/vllm-project/vllm/issues/1435
       # tensor_parallel_size: 2
+      # Uncomment to disable log stats
+      # disable_log_stats: true
+      # Uncomment to specify Multi-Modal limits per prompt, defaults to 1 per modality if not specified
+      # limit_mm_per_prompt:
+      #   image: 2
+      #   video: 2
+      #   audio: 2