feat: auto select llama-cpp cuda runtime (#2306)

* auto select cpu variant

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* remove cuda target for now

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* fix metal

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* fix path

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* cuda

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* auto select cuda

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* update test

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* select CUDA backend only if present

Signed-off-by: mudler <mudler@localai.io>

* ci: keep cuda bin in path

Signed-off-by: mudler <mudler@localai.io>

* Makefile: make dist now also builds cuda

Signed-off-by: mudler <mudler@localai.io>

* Keep pushing fallback in case auto-flagset/nvidia fails

There could be other reasons why the default binary fails to start. For example, we might have detected an Nvidia GPU, but the user might not have the drivers or CUDA libraries installed on the system, so the CUDA variant would fail to load.

We therefore keep the llama.cpp fallback variant at the end of the llama.cpp backends, so that loading can still fall back to it in case things go wrong:
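A rough sketch of the resulting autoload priority (not the verbatim code; the names are the constants from pkg/model/initializers.go shown in the diff below):

    // The fallback build stays listed after llama-cpp itself, so it is still
    // attempted if the auto-selected variant (for example the CUDA build on a
    // machine without working drivers) fails to start.
    priorityList := []string{
        // First llama.cpp and llama-ggml
        LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
    }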

Signed-off-by: mudler <mudler@localai.io>

* Do not build cuda on MacOS

Signed-off-by: mudler <mudler@localai.io>

* cleanup

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: mudler <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: mudler <mudler@localai.io>
Commit a670318a9f (parent 84e2407afa)
Sertaç Özercan, 2024-05-14 10:40:18 -07:00, committed by GitHub
5 changed files with 62 additions and 58 deletions
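For reference, a condensed sketch of the new variant selection (simplified from the diff to pkg/model/initializers.go below; the helper name selectLlamaCPPVariant is hypothetical, it assumes the package's backendPath helper and backend-name constants, and it omits logging):

    // Prefer the CUDA build only when an NVIDIA GPU is detected AND the
    // llama-cpp-cuda binary actually exists in the asset directory;
    // otherwise pick the best CPU variant for the host's capabilities.
    func selectLlamaCPPVariant(assetDir string) string {
        if gpus, err := xsysinfo.GPUs(); err == nil {
            for _, gpu := range gpus {
                if strings.Contains(gpu.String(), "nvidia") {
                    cudaBin := backendPath(assetDir, LLamaCPPCUDA)
                    if _, err := os.Stat(cudaBin); err == nil {
                        return cudaBin
                    }
                }
            }
        }
        switch {
        case cpu.X86.HasAVX2:
            return backendPath(assetDir, LLamaCPPAVX2)
        case cpu.X86.HasAVX:
            return backendPath(assetDir, LLamaCPPAVX)
        default:
            return backendPath(assetDir, LLamaCPPFallback)
        }
    }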

@@ -11,6 +11,7 @@ import (
"time"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/xsysinfo"
"github.com/phayes/freeport"
"github.com/rs/zerolog/log"
"golang.org/x/sys/cpu"
@@ -29,10 +30,12 @@ const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp"
LLamaCPPCUDA12 = "llama-cpp-cuda12"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
Gpt4AllLlamaBackend = "gpt4all-llama"
Gpt4AllMptBackend = "gpt4all-mpt"
@@ -72,8 +75,7 @@ ENTRY:
}
}
if !e.IsDir() {
//backends = append(backends, e.Name())
if !strings.Contains(e.Name(), LLamaCPP) {
if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
backends[e.Name()] = []string{}
}
}
@@ -104,7 +106,7 @@ ENTRY:
// First has more priority
priorityList := []string{
// First llama.cpp and llama-ggml
LLamaCPP, LlamaGGML, Gpt4All,
LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
}
toTheEnd := []string{
@@ -190,17 +192,33 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
} else {
grpcProcess := backendPath(o.assetDir, backend)
foundCUDA := false
// for llama-cpp, check CPU capabilities and load the appropriate variant
if backend == LLamaCPP {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
if strings.Contains(gpu.String(), "nvidia") {
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
if _, err := os.Stat(grpcProcess); err == nil {
foundCUDA = true
}
}
}
}
if !foundCUDA {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
}
}
}