mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-20 10:35:01 +00:00
feat(transformers): merge sentencetransformers backend (#4624)
* merge sentencetransformers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add alias to silently redirect sentencetransformers to transformers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Add alias also for transformers-musicgen Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Drop from makefile Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Move tests from sentencetransformers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Remove sentencetransformers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Remove tests from CI (part of transformers) Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Do not always try to load the tokenizer Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Adapt tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fix typo Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Tiny adjustments Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
4bd8434ae0
commit
1e9bf19c8d
27 changed files with 104 additions and 354 deletions
|
@ -25,6 +25,8 @@ from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreame
|
|||
from transformers import AutoProcessor, MusicgenForConditionalGeneration
|
||||
from scipy.io import wavfile
|
||||
import outetts
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
|
@ -88,10 +90,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
self.CUDA = torch.cuda.is_available()
|
||||
self.OV=False
|
||||
self.OuteTTS=False
|
||||
self.SentenceTransformer = False
|
||||
|
||||
device_map="cpu"
|
||||
|
||||
quantization = None
|
||||
autoTokenizer = True
|
||||
|
||||
if self.CUDA:
|
||||
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
|
||||
|
@ -195,9 +199,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
device=device_map)
|
||||
self.OV = True
|
||||
elif request.Type == "MusicgenForConditionalGeneration":
|
||||
autoTokenizer = False
|
||||
self.processor = AutoProcessor.from_pretrained(model_name)
|
||||
self.model = MusicgenForConditionalGeneration.from_pretrained(model_name)
|
||||
elif request.Type == "OuteTTS":
|
||||
autoTokenizer = False
|
||||
options = request.Options
|
||||
MODELNAME = "OuteAI/OuteTTS-0.3-1B"
|
||||
TOKENIZER = "OuteAI/OuteTTS-0.3-1B"
|
||||
|
@ -235,6 +241,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
self.speaker = self.interface.create_speaker(audio_path=self.AudioPath)
|
||||
else:
|
||||
self.speaker = self.interface.load_default_speaker(name=SPEAKER)
|
||||
elif request.Type == "SentenceTransformer":
|
||||
autoTokenizer = False
|
||||
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
|
||||
self.SentenceTransformer = True
|
||||
else:
|
||||
print("Automodel", file=sys.stderr)
|
||||
self.model = AutoModel.from_pretrained(model_name,
|
||||
|
@ -250,7 +260,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
else:
|
||||
self.max_tokens = 512
|
||||
|
||||
if request.Type != "MusicgenForConditionalGeneration":
|
||||
if autoTokenizer:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_safetensors=True)
|
||||
self.XPU = False
|
||||
|
||||
|
@ -286,18 +296,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
max_length = 512
|
||||
if request.Tokens != 0:
|
||||
max_length = request.Tokens
|
||||
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
|
||||
|
||||
# Create word embeddings
|
||||
if self.CUDA:
|
||||
encoded_input = encoded_input.to("cuda")
|
||||
embeds = None
|
||||
|
||||
with torch.no_grad():
|
||||
model_output = self.model(**encoded_input)
|
||||
if self.SentenceTransformer:
|
||||
print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
|
||||
embeds = self.model.encode(request.Embeddings)
|
||||
else:
|
||||
encoded_input = self.tokenizer(request.Embeddings, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
|
||||
|
||||
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
|
||||
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
||||
return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])
|
||||
# Create word embeddings
|
||||
if self.CUDA:
|
||||
encoded_input = encoded_input.to("cuda")
|
||||
|
||||
with torch.no_grad():
|
||||
model_output = self.model(**encoded_input)
|
||||
|
||||
# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
|
||||
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
||||
embeds = sentence_embeddings[0]
|
||||
return backend_pb2.EmbeddingResult(embeddings=embeds)
|
||||
|
||||
async def _predict(self, request, context, streaming=False):
|
||||
set_seed(request.Seed)
|
||||
|
|
|
@ -3,4 +3,5 @@ llvmlite==0.43.0
|
|||
accelerate
|
||||
transformers
|
||||
bitsandbytes
|
||||
outetts
|
||||
outetts
|
||||
sentence-transformers==3.3.1
|
||||
|
|
|
@ -4,4 +4,5 @@ llvmlite==0.43.0
|
|||
accelerate
|
||||
transformers
|
||||
bitsandbytes
|
||||
outetts
|
||||
outetts
|
||||
sentence-transformers==3.3.1
|
||||
|
|
|
@ -3,4 +3,5 @@ accelerate
|
|||
llvmlite==0.43.0
|
||||
transformers
|
||||
bitsandbytes
|
||||
outetts
|
||||
outetts
|
||||
sentence-transformers==3.3.1
|
||||
|
|
|
@ -4,4 +4,6 @@ accelerate
|
|||
transformers
|
||||
llvmlite==0.43.0
|
||||
bitsandbytes
|
||||
outetts
|
||||
outetts
|
||||
bitsandbytes
|
||||
sentence-transformers==3.3.1
|
||||
|
|
|
@ -6,4 +6,5 @@ optimum[openvino]
|
|||
llvmlite==0.43.0
|
||||
intel-extension-for-transformers
|
||||
bitsandbytes
|
||||
outetts
|
||||
outetts
|
||||
sentence-transformers==3.3.1
|
||||
|
|
|
@ -133,5 +133,41 @@ class TestBackendServicer(unittest.TestCase):
|
|||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("SoundGeneration service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_embed_load_model(self):
    """
    Verify that a SentenceTransformer model can be loaded through the gRPC backend.
    """
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            result = stub.LoadModel(
                backend_pb2.ModelOptions(
                    Model="bert-base-nli-mean-tokens",
                    Type="SentenceTransformer",
                )
            )
            self.assertTrue(result.success)
            self.assertEqual(result.message, "Model loaded successfully")
    except Exception as err:
        print(err)
        self.fail("LoadModel service failed")
    finally:
        # Always stop the backend server, even when the assertions above fail.
        self.tearDown()
|
||||
|
||||
def test_sentencetransformers_embedding(self):
    """
    Verify that the backend produces embeddings for a sentence once a
    SentenceTransformer model has been loaded.
    """
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            load_reply = stub.LoadModel(
                backend_pb2.ModelOptions(
                    Model="bert-base-nli-mean-tokens",
                    Type="SentenceTransformer",
                )
            )
            self.assertTrue(load_reply.success)
            embed_request = backend_pb2.PredictOptions(Embeddings="This is a test sentence.")
            embed_reply = stub.Embedding(embed_request)
            self.assertIsNotNone(embed_reply.embeddings)
    except Exception as err:
        print(err)
        self.fail("Embedding service failed")
    finally:
        # Always stop the backend server, even when the assertions above fail.
        self.tearDown()
|
Loading…
Add table
Add a link
Reference in a new issue