feat(intel): add diffusers/transformers support (#1746)

* feat(intel): add diffusers support * try to consume upstream container image * Debug * Manually install deps * Map transformers/hf cache dir to modelpath if not specified * fix(compel): update initialization, pass by all gRPC options * fix: add dependencies, implement transformers for xpu * base it on the oneapi image * Add pillow * set threads if specified when launching the API * Skip conda install if intel * defaults to non-intel * ci: add to pipelines * prepare compel only if enabled * Skip conda install if intel * fix cleanup * Disable compel by default * Install torch 2.1.0 with Intel * Skip conda on some setups * Detect python * Quiet output * Do not override system python with conda * Prefer python3 * Fixups * exllama2: do not install without conda (overrides pytorch version) * exllama/exllama2: do not install if not using cuda * Add missing dataset dependency * Small fixups, symlink to python, add requirements * Add neural_speed to the deps * correctly handle model offloading * fix: device_map == xpu * go back to calling python, fixed at dockerfile level * Exllama2 restricted to only nvidia gpus * Tokenizer to xpu
2025-05-20 10:35:01 +00:00 · 2024-03-07 14:37:45 +01:00 · 2024-03-07 14:37:45 +01:00 · 5d1018495f
commit 5d1018495f
parent ad6fd7a991
23 changed files with 250 additions and 81 deletions
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@ -3,10 +3,16 @@
 ##
 ## A bash script wrapper that runs the transformers server with conda

-export PATH=$PATH:/opt/conda/bin

-# Activate conda environment
-source activate transformers
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+else
+    export PATH=$PATH:/opt/conda/bin
+    # Activate conda environment
+    source activate transformers
+fi

 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
--- a/backend/python/transformers/transformers_server.py
+++ b/backend/python/transformers/transformers_server.py
@ -16,7 +16,15 @@ import backend_pb2_grpc
 import grpc
 import torch
 import torch.cuda
-from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+
+XPU=os.environ.get("XPU", "0") == "1"
+if XPU:
+    import intel_extension_for_pytorch as ipex
+    from intel_extension_for_transformers.transformers.modeling import AutoModelForCausalLM
+    from transformers import AutoTokenizer, AutoModel, set_seed
+else:
+    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed
+

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24

@ -69,12 +77,25 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        model_name = request.Model
        try:
            if request.Type == "AutoModelForCausalLM":
-                self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
+                if XPU:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode,
+                                              device_map="xpu", load_in_4bit=True)
+                else:
+                    self.model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)
            else:
                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=request.TrustRemoteCode)

            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.CUDA = False
+            self.XPU = False
+
+            if XPU:
+                self.XPU = True
+                try:
+                    print("Optimizing model", model_name, "to XPU.", file=sys.stderr)
+                    self.model = ipex.optimize_transformers(self.model, inplace=True, dtype=torch.float16, device="xpu")
+                except Exception as err:
+                    print("Not using XPU:", err, file=sys.stderr)

            if request.CUDA or torch.cuda.is_available():
                try:
@ -139,6 +160,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        inputs = self.tokenizer(request.Prompt, return_tensors="pt").input_ids
        if self.CUDA:
            inputs = inputs.to("cuda")
+        if XPU:
+            inputs = inputs.to("xpu")

        outputs = self.model.generate(inputs,max_new_tokens=max_tokens, temperature=request.Temperature, top_p=request.TopP)