feat(vllm): Allow to set quantization (#1094)

This is particularly useful to set AWQ. **Description** Follow-up to #1015. **Notes for Reviewers** **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)** - [ ] Yes, I signed my commits.  --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-23 20:14:59 +00:00 · 2023-09-22 15:52:38 +02:00 · 2023-09-22 15:52:38 +02:00 · a28ab18987
commit a28ab18987
parent 048b81373d
13 changed files with 357 additions and 332 deletions
--- a/extra/grpc/vllm/backend_vllm.py
+++ b/extra/grpc/vllm/backend_vllm.py
@@ -45,8 +45,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        try:
-            # https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py
-            self.llm = LLM(model=request.Model)
+            if request.Quantization != "":
+                self.llm = LLM(model=request.Model, quantization=request.Quantization)
+            else:
+                self.llm = LLM(model=request.Model)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)