feat: auto select llama-cpp cuda runtime (#2306)

* auto select cpu variant

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* remove cuda target for now

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* fix metal

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* fix path

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* cuda

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* auto select cuda

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* update test

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* select CUDA backend only if present

Signed-off-by: mudler <mudler@localai.io>

* ci: keep cuda bin in path

Signed-off-by: mudler <mudler@localai.io>

* Makefile: make dist now also builds cuda

Signed-off-by: mudler <mudler@localai.io>

* Keep pushing fallback in case auto-flagset/nvidia fails

There could be other reasons why the default binary fails to start. For example, we might have detected an Nvidia GPU, but the user might not have the drivers or CUDA libraries installed on the system, so the CUDA variant would fail to load.

We therefore keep the llama.cpp fallback variant at the end of the llama.cpp backends, so that loading can still fall back to it in case things go wrong:
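A rough sketch of the resulting autoload priority (not the verbatim code; the names are the constants from pkg/model/initializers.go shown in the diff below):

    // The fallback build stays listed after llama-cpp itself, so it is still
    // attempted if the auto-selected variant (for example the CUDA build on a
    // machine without working drivers) fails to start.
    priorityList := []string{
        // First llama.cpp and llama-ggml
        LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
    }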

Signed-off-by: mudler <mudler@localai.io>

* Do not build cuda on MacOS

Signed-off-by: mudler <mudler@localai.io>

* cleanup

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>

* Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Sertac Ozercan <sozercan@gmail.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Signed-off-by: mudler <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: mudler <mudler@localai.io>
Commit a670318a9f (parent 84e2407afa)
Sertaç Özercan, 2024-05-14 10:40:18 -07:00, committed by GitHub
5 changed files with 62 additions and 58 deletions
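For reference, a condensed sketch of the new variant selection (simplified from the diff to pkg/model/initializers.go below; the helper name selectLlamaCPPVariant is hypothetical, it assumes the package's backendPath helper and backend-name constants, and it omits logging):

    // Prefer the CUDA build only when an NVIDIA GPU is detected AND the
    // llama-cpp-cuda binary actually exists in the asset directory;
    // otherwise pick the best CPU variant for the host's capabilities.
    func selectLlamaCPPVariant(assetDir string) string {
        if gpus, err := xsysinfo.GPUs(); err == nil {
            for _, gpu := range gpus {
                if strings.Contains(gpu.String(), "nvidia") {
                    cudaBin := backendPath(assetDir, LLamaCPPCUDA)
                    if _, err := os.Stat(cudaBin); err == nil {
                        return cudaBin
                    }
                }
            }
        }
        switch {
        case cpu.X86.HasAVX2:
            return backendPath(assetDir, LLamaCPPAVX2)
        case cpu.X86.HasAVX:
            return backendPath(assetDir, LLamaCPPAVX)
        default:
            return backendPath(assetDir, LLamaCPPFallback)
        }
    }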

@@ -11,6 +11,7 @@ import (
"time"
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/xsysinfo"
"github.com/phayes/freeport"
"github.com/rs/zerolog/log"
"golang.org/x/sys/cpu"
@@ -29,10 +30,12 @@ const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp"
LLamaCPPCUDA12 = "llama-cpp-cuda12"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
Gpt4AllLlamaBackend = "gpt4all-llama"
Gpt4AllMptBackend = "gpt4all-mpt"
@@ -72,8 +75,7 @@ ENTRY:
}
}
if !e.IsDir() {
//backends = append(backends, e.Name())
if !strings.Contains(e.Name(), LLamaCPP) {
if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
backends[e.Name()] = []string{}
}
}
@@ -104,7 +106,7 @@ ENTRY:
// First has more priority
priorityList := []string{
// First llama.cpp and llama-ggml
LLamaCPP, LlamaGGML, Gpt4All,
LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
}
toTheEnd := []string{
@@ -190,17 +192,33 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
} else {
grpcProcess := backendPath(o.assetDir, backend)
foundCUDA := false
// for llama-cpp, check CPU capabilities and load the appropriate variant
if backend == LLamaCPP {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
if strings.Contains(gpu.String(), "nvidia") {
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
if _, err := os.Stat(grpcProcess); err == nil {
foundCUDA = true
}
}
}
}
if !foundCUDA {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
}
}
}