Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-20 10:35:01 +00:00)
feat(intel): add diffusers/transformers support (#1746)
* feat(intel): add diffusers support
* try to consume upstream container image
* Debug
* Manually install deps
* Map transformers/hf cache dir to modelpath if not specified (see the sketch after the change summary)
* fix(compel): update initialization, pass by all gRPC options
* fix: add dependencies, implement transformers for xpu
* base it from the oneapi image
* Add pillow
* set threads if specified when launching the API
* Skip conda install if intel
* defaults to non-intel
* ci: add to pipelines
* prepare compel only if enabled
* Skip conda install if intel
* fix cleanup
* Disable compel by default
* Install torch 2.1.0 with Intel
* Skip conda on some setups
* Detect python
* Quiet output
* Do not override system python with conda
* Prefer python3
* Fixups
* exllama2: do not install without conda (overrides pytorch version)
* exllama/exllama2: do not install if not using cuda
* Add missing dataset dependency
* Small fixups, symlink to python, add requirements
* Add neural_speed to the deps
* correctly handle model offloading
* fix: device_map == xpu
* go back at calling python, fixed at dockerfile level
* Exllama2 restricted to only nvidia gpus
* Tokenizer to xpu
parent ad6fd7a991
commit 5d1018495f
23 changed files with 250 additions and 81 deletions
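One of the steps above maps the transformers/HF cache dir to the model path when none is specified. A minimal sketch of that idea (the helper name and the exact environment variables are assumptions, not taken from this commit's diff):

import os

# Hypothetical helper: point the Hugging Face cache at the backend's model path
# when the user has not configured a cache location themselves.
def default_hf_cache_to_model_path(model_path: str) -> None:
    if "HF_HOME" not in os.environ and "TRANSFORMERS_CACHE" not in os.environ:
        # TRANSFORMERS_CACHE is honoured by transformers when downloading models.
        os.environ["TRANSFORMERS_CACHE"] = model_path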
@@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda
 
-export PATH=$PATH:/opt/conda/bin
-
-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi
 
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
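The wrapper keys everything off the presence of /opt/intel: on the oneAPI image it skips conda entirely and only exports XPU=1 for the Python server. A small sketch of the same convention seen from the Python side (purely illustrative; the backend itself only reads the XPU variable exported here):

import os

# Mirror of the shell check above: treat an Intel oneAPI install (or an explicit
# XPU=1) as the signal to take the XPU code path instead of the conda one.
def use_xpu() -> bool:
    return os.path.isdir("/opt/intel") or os.environ.get("XPU", "0") == "1"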
@@ -16,7 +16,15 @@ import backend_pb2_grpc
 import grpc
 import torch
 import torch.cuda
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
+XPU=os.environ.get("XPU", "0") == "1"
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
+    from transformers import AutoTokenizer, AutoModel, set_seed
+else:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
 
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
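The import switch assumes the Intel packages are installed whenever XPU=1. A slightly more defensive variant (a sketch, not part of the commit) fails early with an actionable message if they are missing:

import os
import sys

XPU = os.environ.get("XPU", "0") == "1"

if XPU:
    try:
        import intel_extension_for_pytorch as ipex  # noqa: F401
        from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
    except ImportError as err:
        print(f"XPU=1 but the Intel extensions are not installed: {err}", file=sys.stderr)
        sys.exit(1)
else:
    from transformers import AutoModelForCausalLM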
@@ -69,12 +77,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         model_name = request.Model
         try:
             if request.Type == "AutoModelForCausalLM":
-                self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                if XPU:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                                                      device_map="xpu", load_in_4bit=True)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
             else:
                 self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
 
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.CUDA = False
+            self.XPU = False
+
+            if XPU:
+                self.XPU = True
+                try:
+                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
+                except Exception as err:
+                    print("Not using XPU:", err, file=sys.stderr)
 
             if request.CUDA or torch.cuda.is_available():
                 try:
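LoadModel now tracks both self.CUDA and self.XPU, and later code picks the device from those flags. A tiny helper capturing that choice (hypothetical, not in the commit):

# Hypothetical helper: resolve the device string from the two flags the handler
# keeps; XPU wins over CUDA, CPU is the fallback.
def target_device(use_cuda: bool, use_xpu: bool) -> str:
    if use_xpu:
        return "xpu"
    if use_cuda:
        return "cuda"
    return "cpu"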
@@ -139,6 +160,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
         if self.CUDA:
             inputs = inputs.to("cuda")
+        if XPU:
+            inputs = inputs.to("xpu")
 
         outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)
 
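Put together, the XPU path loads the model in 4-bit on the Intel GPU, optimizes it with IPEX, and moves the tokenized prompt to the same device before calling generate. A self-contained sketch of that flow (model id and prompt are placeholders; it assumes the Intel packages used in the diff are installed):

import sys
import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
from transformers import AutoTokenizer

model_name = "facebook/opt-1.3b"  # placeholder model id

# Load 4-bit on the XPU, as in the LoadModel change above.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Optimize with IPEX; fall back gracefully if the XPU path is unavailable.
try:
    model = ipex.optimize_transformers(model, inplace=True, dtype=torch.float16, device="xpu")
except Exception as err:
    print("Not using XPU:", err, file=sys.stderr)

# Tokenize, move the inputs to the XPU, and generate.
inputs = tokenizer("What is an Intel XPU?", return_tensors="pt").input_ids.to("xpu")
outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))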