Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-20 10:35:01 +00:00)
Bump vLLM version + more options when loading models in vLLM (#1782)

* Bump vLLM version to 0.3.2
* Add vLLM model loading options
* Remove transformers-exllama
* Fix install exllama
parent 1c312685aa
commit 939411300a

28 changed files with 736 additions and 641 deletions
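The version bump itself comes down to a one-line pin in the vLLM backend's Python requirements. A minimal sketch of that pin (the exact requirements file path is not shown on this page and is assumed):

vllm==0.3.2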
File diff suppressed because one or more lines are too long
@@ -88,6 +88,16 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         if request.Quantization != "":
             engine_args.quantization = request.Quantization
+        if request.GPUMemoryUtilization != 0:
+            engine_args.gpu_memory_utilization = request.GPUMemoryUtilization
+        if request.TrustRemoteCode:
+            engine_args.trust_remote_code = request.TrustRemoteCode
+        if request.EnforceEager:
+            engine_args.enforce_eager = request.EnforceEager
+        if request.SwapSpace != 0:
+            engine_args.swap_space = request.SwapSpace
+        if request.MaxModelLen != 0:
+            engine_args.max_model_len = request.MaxModelLen
 
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
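Taken together, the hunk forwards optional fields from the backend's gRPC LoadModel request onto vLLM's engine arguments, skipping any field left at its zero value. Below is a minimal, self-contained sketch (not LocalAI's actual backend code) of passing the same options to vLLM 0.3.2 directly; the model name and option values are illustrative assumptions:

# Minimal sketch: build a vLLM async engine with the options this commit
# exposes. All values below are illustrative assumptions, not defaults.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",   # assumed example model
    quantization=None,           # request.Quantization, e.g. "awq" or "gptq"
    gpu_memory_utilization=0.9,  # request.GPUMemoryUtilization, fraction of VRAM
    trust_remote_code=False,     # request.TrustRemoteCode, allow model repo code
    enforce_eager=True,          # request.EnforceEager, skip CUDA graph capture
    swap_space=4,                # request.SwapSpace, CPU swap space in GiB
    max_model_len=2048,          # request.MaxModelLen, context length cap
)
llm = AsyncLLMEngine.from_engine_args(engine_args)

Of these, enforce_eager trades CUDA-graph startup cost for lower memory use, while gpu_memory_utilization and swap_space bound how much GPU and CPU memory the engine may claim; forwarding them only when the request sets a non-default value leaves vLLM's own defaults in effect otherwise.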