feat(vllm): add support for embeddings (#3440)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-20 18:45:00 +00:00 · 2024-09-02 21:44:32 +02:00 · 2024-09-02 21:44:32 +02:00 · 68fc014c6d
commit 68fc014c6d
parent 56db715a91
2 changed files with 43 additions and 0 deletions
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@ -135,6 +135,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        res = await gen.__anext__()
        return res
    def Embedding(self, request, context):
        """
        gRPC handler that produces an embedding vector for the text in the request.

        The text is read from ``request.Embeddings``, logged to stderr, and handed
        to ``self.model.encode``. Only the first returned output is used.

        Args:
            request: The incoming request; its ``Embeddings`` field holds the text to embed.
            context: The grpc.ServicerContext for this call, used to report an error status.

        Returns:
            An EmbeddingResult carrying the embedding of the first output, or an
            empty EmbeddingResult (with status INVALID_ARGUMENT set on the context)
            when the encoder returned no outputs.
        """
        text = request.Embeddings
        print("Calculated embeddings for: " + text, file=sys.stderr)
        encoded = self.model.encode(text)

        # Success path first: at least one output came back, use the first one.
        if len(encoded) > 0:
            first = encoded[0]
            return backend_pb2.EmbeddingResult(embeddings=first.outputs.embedding)

        # Nothing was produced: report it through the gRPC status and reply empty.
        context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
        context.set_details("No embeddings were calculated.")
        return backend_pb2.EmbeddingResult()
    async def PredictStream(self, request, context):
        """
        Generates text based on the given prompt and sampling parameters, and streams the results.
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@ -74,3 +74,26 @@ class TestBackendServicer(unittest.TestCase):
            self.fail("text service failed")
        finally:
            self.tearDown()
    def test_embedding(self):
        """
        This method tests if the embeddings are generated successfully.

        It loads an embedding model through the backend, requests the embedding
        of a fixed sentence, and checks that a non-empty sequence of floats is
        returned.
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="intfloat/e5-mistral-7b-instruct"))
                self.assertTrue(response.success)
                embedding_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
                embedding_response = stub.Embedding(embedding_request)
                self.assertIsNotNone(embedding_response.embeddings)
                # A protobuf message exposes a repeated field as a sequence-like
                # container rather than a real `list`, so an isinstance(..., list)
                # check fails even for a valid response. Copy the values into a
                # list and validate the contents instead.
                embeddings = list(embedding_response.embeddings)
                # assert that the list is not empty
                self.assertGreater(len(embeddings), 0)
                # assert that every element is a float
                for value in embeddings:
                    self.assertIsInstance(value, float)
        except Exception as err:
            print(err)
            self.fail("Embedding service failed")
        finally:
            self.tearDown()