Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-21 19:15:00 +00:00
Transformer Backend: Implementing use_tokenizer_template and stop_prompts options (#2090)
* fix regression #1971: fixes the regression introduced by intel_extension_for_transformers==1.4
* UseTokenizerTemplate and StopPrompt: implementation of the use_tokenizer_template and stopwords options
This commit is contained in: parent 39814cab32, commit 66b002458d
1 changed file with 18 additions and 4 deletions
@@ -148,7 +148,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             else:
                 device_map="CPU"
             self.model = OVModelForCausalLM.from_pretrained(model_name,
                                                             compile=True,
+                                                            ov_config={"PERFORMANCE_HINT": "LATENCY"},
                                                             device=device_map)
             self.OV = True
         else:
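Outside the diff, a minimal sketch of how that OpenVINO load path behaves with optimum-intel; the model id is a placeholder, and export=True is an assumption for the case where no OpenVINO IR is shipped with the model:

    # Sketch only: loading a causal LM through optimum-intel's OpenVINO runtime.
    from optimum.intel import OVModelForCausalLM

    model = OVModelForCausalLM.from_pretrained(
        "gpt2",                                       # placeholder model id
        export=True,                                  # convert PyTorch weights to OpenVINO IR on the fly (assumption)
        compile=True,                                 # compile for the target device at load time
        ov_config={"PERFORMANCE_HINT": "LATENCY"},    # prioritise single-request latency, as the diff does
        device="CPU",
    )

The LATENCY hint tells the OpenVINO runtime to optimise each individual request for low latency rather than batching for throughput.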
@@ -212,12 +213,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         set_seed(request.Seed)
         if request.TopP == 0:
             request.TopP = 0.9
 
+        if request.TopK == 0:
+            request.TopK = 40
+
         max_tokens = 200
         if request.Tokens > 0:
             max_tokens = request.Tokens
 
-        inputs = self.tokenizer(request.Prompt, return_tensors="pt")
+        prompt = request.Prompt
+        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
+        eos_token_id = self.tokenizer.eos_token_id
+        if request.StopPrompts:
+            eos_token_id = []
+            for word in request.StopPrompts:
+                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
+
+        inputs = self.tokenizer(prompt, return_tensors="pt")
         if self.CUDA:
             inputs = inputs.to("cuda")
         if XPU and self.OV == False:
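A self-contained sketch of the two behaviours this hunk adds; the model id, messages, and stop words are placeholders standing in for the gRPC request fields:

    # Sketch only: prompt templating and stop-word handling as in the hunk above.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder model id

    messages = [{"role": "user", "content": "Write a haiku about llamas."}]    # stands in for request.Messages
    stop_words = ["</s>"]                                                      # stands in for request.StopPrompts

    # use_tokenizer_template: let the tokenizer render the chat messages with its own template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # stop prompts: map each stop word to a token id so generate() can treat it as an end-of-sequence token
    eos_token_id = [tokenizer.convert_tokens_to_ids(w) for w in stop_words]

Note that convert_tokens_to_ids maps one token per stop word, mirroring the diff; multi-token stop sequences would need a stopping criterion instead.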
@@ -235,7 +249,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                           top_k=request.TopK,
                           do_sample=True,
                           attention_mask=inputs["attention_mask"],
-                          eos_token_id=self.tokenizer.eos_token_id,
+                          eos_token_id=eos_token_id,
                           pad_token_id=self.tokenizer.eos_token_id,
                           streamer=streamer)
             thread=Thread(target=self.model.generate, kwargs=config)
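The streaming branch runs generate() on a background thread and drains tokens from a streamer; continuing the sketch above (the model id, sampling values, and variable names are assumptions, not taken from the backend):

    # Sketch only: threaded streaming generation with transformers' TextIteratorStreamer.
    from threading import Thread
    from transformers import AutoModelForCausalLM, TextIteratorStreamer

    model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder model id
    inputs = tokenizer(prompt, return_tensors="pt")

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        top_k=40,
        eos_token_id=eos_token_id,                 # stop-word token ids from the previous sketch
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    for chunk in streamer:                         # text arrives incrementally as it is generated
        print(chunk, end="", flush=True)
    thread.join()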
@@ -264,7 +278,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                           top_k=request.TopK,
                           do_sample=True,
                           attention_mask=inputs["attention_mask"],
-                          eos_token_id=self.tokenizer.eos_token_id,
+                          eos_token_id=eos_token_id,
                           pad_token_id=self.tokenizer.eos_token_id)
         generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
 
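And the non-streaming path, continuing the same sketch: generate() returns the prompt plus the completion, so the prompt tokens are sliced off before decoding, exactly as the hunk above does:

    # Sketch only: blocking generation with the stop-word token ids, then decode only the new tokens.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        top_k=40,
        eos_token_id=eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt tokens from the front of each sequence before decoding.
    generated_text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]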