feat(llama.cpp): add distributed llama.cpp inferencing (#2324)

* feat(llama.cpp): support distributed llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: allow tweaking how chat messages are merged together

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactor

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Makefile: register to ALL_GRPC_BACKENDS

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring, allow disabling auto-detection of backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* minor fixups

Signed-off-by: mudler <mudler@localai.io>

* feat: add cmd to start rpc-server from llama.cpp

Signed-off-by: mudler <mudler@localai.io>

* ci: add ccache

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Signed-off-by: mudler <mudler@localai.io>
Authored by Ettore Di Giacinto on 2024-05-15 01:17:02 +02:00, committed by GitHub
parent 29909666c3
commit c89271b2e4
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
11 changed files with 222 additions and 82 deletions
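The heart of the distributed mode in this change is the new llama-cpp-grpc variant: when the LLAMACPP_GRPC_SERVERS environment variable is set, the loader prefers that variant over the locally detected CUDA/AVX ones, and (as the comment in the diff notes) the variable is read by LocalAI's llama.cpp grpc-server so that work can be spread across llama.cpp rpc-server workers. Below is a minimal, standalone Go sketch of that selection step, not the actual implementation; the worker addresses and the comma-separated host:port format are illustrative assumptions.

package main

import (
	"fmt"
	"os"
	"strings"
)

// pickLlamaCPPVariant mirrors, in simplified form, the selection added in this
// commit: the distributed (GRPC) variant wins when LLAMACPP_GRPC_SERVERS is set,
// otherwise a locally detected variant is used.
func pickLlamaCPPVariant(hasCUDA, hasAVX2, hasAVX bool) string {
	if servers := os.Getenv("LLAMACPP_GRPC_SERVERS"); servers != "" {
		// Assumed format: comma-separated host:port pairs of llama.cpp rpc-server workers.
		fmt.Println("distributing inference across:", strings.Split(servers, ","))
		return "llama-cpp-grpc"
	}
	switch {
	case hasCUDA:
		return "llama-cpp-cuda"
	case hasAVX2:
		return "llama-cpp-avx2"
	case hasAVX:
		return "llama-cpp-avx"
	default:
		return "llama-cpp-fallback"
	}
}

func main() {
	// Hypothetical worker addresses; a real deployment points at running rpc-servers.
	os.Setenv("LLAMACPP_GRPC_SERVERS", "192.168.1.10:50052,192.168.1.11:50052")
	fmt.Println(pickLlamaCPPVariant(false, true, true)) // llama-cpp-grpc
}

The real selection lives in selectGRPCProcess in the diff below, which additionally stats the CUDA binary before committing to it.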

@@ -12,9 +12,9 @@ import (
grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/xsysinfo"
"github.com/klauspost/cpuid/v2"
"github.com/phayes/freeport"
"github.com/rs/zerolog/log"
"golang.org/x/sys/cpu"
"github.com/elliotchance/orderedmap/v2"
)
@@ -26,16 +26,18 @@ var Aliases map[string]string = map[string]string{
"langchain-huggingface": LCHuggingFaceBackend,
}
var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp"
LLamaCPP = "llama-cpp"
LLamaCPPCUDA12 = "llama-cpp-cuda12"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
LLamaCPPGRPC = "llama-cpp-grpc"
Gpt4AllLlamaBackend = "gpt4all-llama"
Gpt4AllMptBackend = "gpt4all-mpt"
@@ -59,7 +61,7 @@ func backendPath(assetDir, backend string) string {
// backendsInAssetDir returns the list of backends in the asset directory
// that should be loaded
func backendsInAssetDir(assetDir string) (*orderedmap.OrderedMap[string, any], error) {
func backendsInAssetDir(assetDir string) ([]string, error) {
// Exclude backends from automatic loading
excludeBackends := []string{LocalStoreBackend}
entry, err := os.ReadDir(backendPath(assetDir, ""))
@@ -74,27 +76,46 @@ ENTRY:
continue ENTRY
}
}
if !e.IsDir() {
if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
backends[e.Name()] = []string{}
}
if e.IsDir() {
continue
}
// Skip the llama.cpp variants if we are autoDetecting
// But we always load the fallback variant if it exists
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
continue
}
backends[e.Name()] = []string{}
}
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback := false, false, false
if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2)
foundLCPPAVX2 = true
}
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
foundLCPPAVX = true
}
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
foundLCPPFallback = true
// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
if autoDetect {
// if we find the llama.cpp variants, show them as a single backend (llama-cpp); the
// concrete variant is picked up later when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda := false, false, false, false, false
if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX2)
foundLCPPAVX2 = true
}
if strings.Contains(e.Name(), LLamaCPPAVX) && !foundLCPPAVX {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPAVX)
foundLCPPAVX = true
}
if strings.Contains(e.Name(), LLamaCPPFallback) && !foundLCPPFallback {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPFallback)
foundLCPPFallback = true
}
if strings.Contains(e.Name(), LLamaCPPGRPC) && !foundLCPPGRPC {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPGRPC)
foundLCPPGRPC = true
}
if strings.Contains(e.Name(), LLamaCPPCUDA) && !foundLCPPCuda {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPCUDA)
foundLCPPCuda = true
}
}
}
}
@@ -102,10 +123,13 @@ ENTRY:
// order backends from the asset directory.
// as we scan for backends, we want to keep some order in which backends are tried.
// for example, llama.cpp should be tried first, and the huggingface backend should be kept last.
// sets a priority list
// First has more priority
// sets a priority list - first has more priority
priorityList := []string{
// First llama.cpp and llama-ggml
// First llama.cpp (variants), then llama-ggml.
// We keep the fallback so that, if the llama.cpp variants that depend on
// shared libs break, there is still a safety net.
LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
}
@@ -139,7 +163,57 @@ ENTRY:
}
}
return orderedBackends, nil
return orderedBackends.Keys(), nil
}
// selectGRPCProcess selects the GRPC process to start based on system capabilities
func selectGRPCProcess(backend, assetDir string) string {
foundCUDA := false
var grpcProcess string
// Backend selection is currently implemented only for llama.cpp
if backend != LLamaCPP {
return ""
}
// Note: This environment variable is read by LocalAI's llama.cpp grpc-server
if os.Getenv("LLAMACPP_GRPC_SERVERS") != "" {
log.Info().Msgf("[%s] attempting to load with GRPC variant", LLamaCPPGRPC)
return backendPath(assetDir, LLamaCPPGRPC)
}
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
if strings.Contains(gpu.String(), "nvidia") {
p := backendPath(assetDir, LLamaCPPCUDA)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
grpcProcess = p
foundCUDA = true
} else {
log.Info().Msgf("GPU device found but no CUDA backend present")
}
}
}
}
if foundCUDA {
return grpcProcess
}
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPAVX2)
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(assetDir, LLamaCPPFallback)
}
return grpcProcess
}
// starts the grpcModelProcess for the backend, and returns a grpc client
@@ -192,33 +266,10 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
} else {
grpcProcess := backendPath(o.assetDir, backend)
foundCUDA := false
// for llama-cpp, check CPU capabilities and load the appropriate variant
if backend == LLamaCPP {
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
if strings.Contains(gpu.String(), "nvidia") {
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
if _, err := os.Stat(grpcProcess); err == nil {
foundCUDA = true
}
}
}
}
if !foundCUDA {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
}
if autoDetect {
// autoDetect GRPC process to start based on system capabilities
if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
grpcProcess = selectedProcess
}
}
@@ -363,28 +414,24 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
var err error
// autoload also external backends
allBackendsToAutoLoad := orderedmap.NewOrderedMap[string, any]()
// get backends embedded in the binary
autoLoadBackends, err := backendsInAssetDir(o.assetDir)
if err != nil {
return nil, err
}
// append externalBackends supplied by the user via the CLI
for _, b := range o.externalBackends {
autoLoadBackends = append(autoLoadBackends, b)
}
log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
for _, k := range autoLoadBackends.Keys() {
v, _ := autoLoadBackends.Get(k)
allBackendsToAutoLoad.Set(k, v)
}
for _, b := range o.externalBackends {
allBackendsToAutoLoad.Set(b, []string{})
}
if o.model != "" {
log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, allBackendsToAutoLoad.Keys())
log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends)
}
for _, key := range allBackendsToAutoLoad.Keys() {
for _, key := range autoLoadBackends {
log.Info().Msgf("[%s] Attempting to load", key)
options := []Option{
WithBackendString(key),
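For reference, here is a simplified, standalone sketch of the grouping behaviour that the new backendsInAssetDir implements: with auto-detection enabled, the individual llama.cpp variants are collapsed into a single llama-cpp entry (the concrete variant is chosen later by selectGRPCProcess), the fallback is kept as a safety net, and DISABLE_AUTODETECT=true switches the grouping off. This is not the actual implementation; the asset names in main are illustrative, and the real code scans the asset directory and tracks the found variants in an ordered map.

package main

import (
	"fmt"
	"os"
	"strings"
)

// listBackends is a simplified model of the new behaviour: with auto-detection
// on, the llama.cpp variants are hidden behind a single "llama-cpp" entry,
// except for the fallback variant, which stays listed as a safety net.
func listBackends(assets []string) []string {
	autoDetect := os.Getenv("DISABLE_AUTODETECT") != "true"
	var backends []string
	grouped := false
	for _, name := range assets {
		if autoDetect && strings.HasPrefix(name, "llama-cpp-") && name != "llama-cpp-fallback" {
			if !grouped {
				backends = append(backends, "llama-cpp")
				grouped = true
			}
			continue
		}
		backends = append(backends, name)
	}
	return backends
}

func main() {
	// Illustrative asset names, roughly matching the constants in the diff.
	assets := []string{"llama-cpp-avx2", "llama-cpp-avx", "llama-cpp-grpc", "llama-cpp-fallback", "llama-ggml", "gpt4all"}
	fmt.Println(listBackends(assets)) // [llama-cpp llama-cpp-fallback llama-ggml gpt4all]

	os.Setenv("DISABLE_AUTODETECT", "true")
	fmt.Println(listBackends(assets)) // every variant listed individually
}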