Bump vLLM version + more options when loading models in vLLM (#1782)

* Bump vLLM version to 0.3.2
* Add vLLM model loading options
* Remove transformers-exllama
* Fix exllama install
2025-05-20 02:24:59 +00:00 · 2024-03-01 16:48:53 -05:00 · 2024-03-01 16:48:53 -05:00 · 939411300a
commit 939411300a
parent 1c312685aa
28 changed files with 736 additions and 641 deletions
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -126,6 +126,11 @@ message ModelOptions {

  // vllm
  string Quantization = 40;
+  float  GPUMemoryUtilization = 50;
+  bool   TrustRemoteCode = 51;
+  bool   EnforceEager = 52;
+  int32  SwapSpace = 53;
+  int32  MaxModelLen = 54;

  string MMProj = 41;