Bump vLLM version + more options when loading models in vLLM (#1782)

* Bump vLLM version to 0.3.2
* Add vLLM model loading options
* Remove transformers-exllama
* Fix install exllama
2025-05-20 10:35:01 +00:00 · 2024-03-01 16:48:53 -05:00 · 2024-03-01 16:48:53 -05:00 · 939411300a
commit 939411300a
parent 1c312685aa
28 changed files with 736 additions and 641 deletions
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,38 +40,43 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	}

 	return &pb.ModelOptions{
-		ContextSize:    int32(c.ContextSize),
-		Seed:           int32(c.Seed),
-		NBatch:         int32(b),
-		NoMulMatQ:      c.NoMulMatQ,
-		CUDA:           c.CUDA, // diffusers, transformers
-		DraftModel:     c.DraftModel,
-		AudioPath:      c.VallE.AudioPath,
-		Quantization:   c.Quantization,
-		MMProj:         c.MMProj,
-		YarnExtFactor:  c.YarnExtFactor,
-		YarnAttnFactor: c.YarnAttnFactor,
-		YarnBetaFast:   c.YarnBetaFast,
-		YarnBetaSlow:   c.YarnBetaSlow,
-		LoraAdapter:    c.LoraAdapter,
-		LoraBase:       c.LoraBase,
-		LoraScale:      c.LoraScale,
-		NGQA:           c.NGQA,
-		RMSNormEps:     c.RMSNormEps,
-		F16Memory:      c.F16,
-		MLock:          c.MMlock,
-		RopeFreqBase:   c.RopeFreqBase,
-		RopeScaling:    c.RopeScaling,
-		Type:           c.ModelType,
-		RopeFreqScale:  c.RopeFreqScale,
-		NUMA:           c.NUMA,
-		Embeddings:     c.Embeddings,
-		LowVRAM:        c.LowVRAM,
-		NGPULayers:     int32(c.NGPULayers),
-		MMap:           c.MMap,
-		MainGPU:        c.MainGPU,
-		Threads:        int32(c.Threads),
-		TensorSplit:    c.TensorSplit,
+		ContextSize:          int32(c.ContextSize),
+		Seed:                 int32(c.Seed),
+		NBatch:               int32(b),
+		NoMulMatQ:            c.NoMulMatQ,
+		CUDA:                 c.CUDA, // diffusers, transformers
+		DraftModel:           c.DraftModel,
+		AudioPath:            c.VallE.AudioPath,
+		Quantization:         c.Quantization,
+		GPUMemoryUtilization: c.GPUMemoryUtilization,
+		TrustRemoteCode:      c.TrustRemoteCode,
+		EnforceEager:         c.EnforceEager,
+		SwapSpace:            int32(c.SwapSpace),
+		MaxModelLen:          int32(c.MaxModelLen),
+		MMProj:               c.MMProj,
+		YarnExtFactor:        c.YarnExtFactor,
+		YarnAttnFactor:       c.YarnAttnFactor,
+		YarnBetaFast:         c.YarnBetaFast,
+		YarnBetaSlow:         c.YarnBetaSlow,
+		LoraAdapter:          c.LoraAdapter,
+		LoraBase:             c.LoraBase,
+		LoraScale:            c.LoraScale,
+		NGQA:                 c.NGQA,
+		RMSNormEps:           c.RMSNormEps,
+		F16Memory:            c.F16,
+		MLock:                c.MMlock,
+		RopeFreqBase:         c.RopeFreqBase,
+		RopeScaling:          c.RopeScaling,
+		Type:                 c.ModelType,
+		RopeFreqScale:        c.RopeFreqScale,
+		NUMA:                 c.NUMA,
+		Embeddings:           c.Embeddings,
+		LowVRAM:              c.LowVRAM,
+		NGPULayers:           int32(c.NGPULayers),
+		MMap:                 c.MMap,
+		MainGPU:              c.MainGPU,
+		Threads:              int32(c.Threads),
+		TensorSplit:          c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,