feat(intel): add diffusers/transformers support (#1746)

* feat(intel): add diffusers support

* try to consume upstream container image

* Debug

* Manually install deps

* Map transformers/hf cache dir to modelpath if not specified

* fix(compel): update initialization, pass by all gRPC options

* fix: add dependencies, implement transformers for xpu

* base it from the oneapi image

* Add pillow

* set threads if specified when launching the API

* Skip conda install if intel

* defaults to non-intel

* ci: add to pipelines

* prepare compel only if enabled

* Skip conda install if intel

* fix cleanup

* Disable compel by default

* Install torch 2.1.0 with Intel

* Skip conda on some setups

* Detect python

* Quiet output

* Do not override system python with conda

* Prefer python3

* Fixups

* exllama2: do not install without conda (overrides pytorch version)

* exllama/exllama2: do not install if not using cuda

* Add missing dataset dependency

* Small fixups, symlink to python, add requirements

* Add neural_speed to the deps

* correctly handle model offloading

* fix: device_map == xpu

* go back at calling python, fixed at dockerfile level

* Exllama2 restricted to only nvidia gpus

* Tokenizer to xpu
This commit is contained in:
Ettore Di Giacinto 2024-03-07 14:37:45 +01:00 committed by GitHub
parent ad6fd7a991
commit 5d1018495f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 250 additions and 81 deletions

View file

@@ -8,27 +8,18 @@ import (
)
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
threads := backendConfig.Threads
if threads == 0 && appConfig.Threads != 0 {
threads = appConfig.Threads
}
gRPCOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(backendConfig.Backend),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithThreads(uint32(backendConfig.Threads)),
model.WithThreads(uint32(threads)),
model.WithContext(appConfig.Context),
model.WithModel(backendConfig.Model),
model.WithLoadGRPCLoadModelOpts(&proto.ModelOptions{
CUDA: backendConfig.CUDA || backendConfig.Diffusers.CUDA,
SchedulerType: backendConfig.Diffusers.SchedulerType,
PipelineType: backendConfig.Diffusers.PipelineType,
CFGScale: backendConfig.Diffusers.CFGScale,
LoraAdapter: backendConfig.LoraAdapter,
LoraScale: backendConfig.LoraScale,
LoraBase: backendConfig.LoraBase,
IMG2IMG: backendConfig.Diffusers.IMG2IMG,
CLIPModel: backendConfig.Diffusers.ClipModel,
CLIPSubfolder: backendConfig.Diffusers.ClipSubFolder,
CLIPSkip: int32(backendConfig.Diffusers.ClipSkip),
ControlNet: backendConfig.Diffusers.ControlNet,
}),
model.WithLoadGRPCLoadModelOpts(gRPCOpts),
})
inferenceModel, err := loader.BackendLoader(

View file

@@ -28,7 +28,10 @@ type TokenUsage struct {
func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if threads == 0 && o.Threads != 0 {
threads = o.Threads
}
grpcOpts := gRPCModelOpts(c)
var inferenceModel grpc.Backend
@@ -36,7 +39,7 @@ func ModelInference(ctx context.Context, s string, images []string, loader *mode
opts := modelOpts(c, o, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
model.WithThreads(uint32(threads)), // some models uses this to allocate threads during startup
model.WithAssetDir(o.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),

View file

@ -40,11 +40,23 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
}
return &pb.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
F16Memory: c.F16,
LoraBase: c.LoraBase,
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
ContextSize: int32(c.ContextSize),
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
CUDA: c.CUDA, // diffusers, transformers
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
Quantization: c.Quantization,
@@ -58,12 +70,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
YarnAttnFactor: c.YarnAttnFactor,
YarnBetaFast: c.YarnBetaFast,
YarnBetaSlow: c.YarnBetaSlow,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
LoraScale: c.LoraScale,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
F16Memory: c.F16,
MLock: c.MMlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,