feat: add falcon ggllm via grpc client

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2023-07-15 01:19:43 +02:00
parent a84dee1be1
commit b816009db0
20 changed files with 2239 additions and 271 deletions

@@ -1,6 +1,7 @@
package api
import (
"context"
"fmt"
"os"
"path/filepath"
@@ -9,6 +10,8 @@ import (
"sync"
"github.com/donomii/go-rwkv.cpp"
"github.com/go-skynet/LocalAI/pkg/grpc"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/langchain"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/stablediffusion"
@@ -16,6 +19,7 @@ import (
bert "github.com/go-skynet/go-bert.cpp"
transformers "github.com/go-skynet/go-ggml-transformers.cpp"
llama "github.com/go-skynet/go-llama.cpp"
gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
)
@@ -23,6 +27,160 @@ import (
var mutexMap sync.Mutex
var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
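// These maps serialize inference per backend/model: a mutex is created lazily
// per key and callers lock it around each call (a reading of the intent, based
// on the locking comments below; llama.cpp-style backends are not safe for
// concurrent calls, see https://github.com/ggerganov/llama.cpp/discussions/784).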
func gRPCModelOpts(c Config) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
}
return &pb.ModelOptions{
ContextSize: int32(c.ContextSize),
Seed: int32(c.Seed),
NBatch: int32(b),
NGPULayers: int32(c.NGPULayers),
MMap: c.MMap,
MainGPU: c.MainGPU,
TensorSplit: c.TensorSplit,
}
}
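// Usage sketch (illustrative, not part of this commit): when Config.Batch is
// left at its zero value, the options fall back to a batch size of 512.
//
//	c := Config{ContextSize: 2048, NGPULayers: 35}
//	mo := gRPCModelOpts(c)
//	// mo.NBatch == 512, mo.ContextSize == 2048, mo.NGPULayers == 35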
// func defaultGGLLMOpts(c Config) []ggllm.ModelOption {
// ggllmOpts := []ggllm.ModelOption{}
// if c.ContextSize != 0 {
// ggllmOpts = append(ggllmOpts, ggllm.SetContext(c.ContextSize))
// }
// // F16 doesn't seem to produce good output at all!
// //if c.F16 {
// // llamaOpts = append(llamaOpts, llama.EnableF16Memory)
// //}
// if c.NGPULayers != 0 {
// ggllmOpts = append(ggllmOpts, ggllm.SetGPULayers(c.NGPULayers))
// }
// ggllmOpts = append(ggllmOpts, ggllm.SetMMap(c.MMap))
// ggllmOpts = append(ggllmOpts, ggllm.SetMainGPU(c.MainGPU))
// ggllmOpts = append(ggllmOpts, ggllm.SetTensorSplit(c.TensorSplit))
// if c.Batch != 0 {
// ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(c.Batch))
// } else {
// ggllmOpts = append(ggllmOpts, ggllm.SetNBatch(512))
// }
// return ggllmOpts
// }
func gRPCPredictOpts(c Config, modelPath string) *pb.PredictOptions {
promptCachePath := ""
if c.PromptCachePath != "" {
p := filepath.Join(modelPath, c.PromptCachePath)
os.MkdirAll(filepath.Dir(p), 0755)
promptCachePath = p
}
return &pb.PredictOptions{
Temperature: float32(c.Temperature),
TopP: float32(c.TopP),
TopK: int32(c.TopK),
Tokens: int32(c.Maxtokens),
Threads: int32(c.Threads),
PromptCacheAll: c.PromptCacheAll,
PromptCacheRO: c.PromptCacheRO,
PromptCachePath: promptCachePath,
Mirostat: int32(c.Mirostat),
MirostatETA: float32(c.MirostatETA),
MirostatTAU: float32(c.MirostatTAU),
Debug: c.Debug,
StopPrompts: c.StopWords,
Repeat: int32(c.RepeatPenalty),
NKeep: int32(c.Keep),
Batch: int32(c.Batch),
IgnoreEOS: c.IgnoreEOS,
Seed: int32(c.Seed),
FrequencyPenalty: float32(c.FrequencyPenalty),
MLock: c.MMlock,
MMap: c.MMap,
MainGPU: c.MainGPU,
TensorSplit: c.TensorSplit,
TailFreeSamplingZ: float32(c.TFZ),
TypicalP: float32(c.TypicalP),
}
}
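// Usage sketch (illustrative, not part of this commit): a relative
// PromptCachePath is anchored under the model path and its parent directory is
// created up front, so the backend can write the cache file directly.
//
//	c := Config{PromptCachePath: "cache/prompt.bin"}
//	po := gRPCPredictOpts(c, "/models")
//	// po.PromptCachePath == "/models/cache/prompt.bin"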
// func buildGGLLMPredictOptions(c Config, modelPath string) []ggllm.PredictOption {
// // Generate the prediction using the language model
// predictOptions := []ggllm.PredictOption{
// ggllm.SetTemperature(c.Temperature),
// ggllm.SetTopP(c.TopP),
// ggllm.SetTopK(c.TopK),
// ggllm.SetTokens(c.Maxtokens),
// ggllm.SetThreads(c.Threads),
// }
// if c.PromptCacheAll {
// predictOptions = append(predictOptions, ggllm.EnablePromptCacheAll)
// }
// if c.PromptCacheRO {
// predictOptions = append(predictOptions, ggllm.EnablePromptCacheRO)
// }
// if c.PromptCachePath != "" {
// // Create parent directory
// p := filepath.Join(modelPath, c.PromptCachePath)
// os.MkdirAll(filepath.Dir(p), 0755)
// predictOptions = append(predictOptions, ggllm.SetPathPromptCache(p))
// }
// if c.Mirostat != 0 {
// predictOptions = append(predictOptions, ggllm.SetMirostat(c.Mirostat))
// }
// if c.MirostatETA != 0 {
// predictOptions = append(predictOptions, ggllm.SetMirostatETA(c.MirostatETA))
// }
// if c.MirostatTAU != 0 {
// predictOptions = append(predictOptions, ggllm.SetMirostatTAU(c.MirostatTAU))
// }
// if c.Debug {
// predictOptions = append(predictOptions, ggllm.Debug)
// }
// predictOptions = append(predictOptions, ggllm.SetStopWords(c.StopWords...))
// if c.RepeatPenalty != 0 {
// predictOptions = append(predictOptions, ggllm.SetPenalty(c.RepeatPenalty))
// }
// if c.Keep != 0 {
// predictOptions = append(predictOptions, ggllm.SetNKeep(c.Keep))
// }
// if c.Batch != 0 {
// predictOptions = append(predictOptions, ggllm.SetBatch(c.Batch))
// }
// if c.IgnoreEOS {
// predictOptions = append(predictOptions, ggllm.IgnoreEOS)
// }
// if c.Seed != 0 {
// predictOptions = append(predictOptions, ggllm.SetSeed(c.Seed))
// }
// //predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
// predictOptions = append(predictOptions, ggllm.SetFrequencyPenalty(c.FrequencyPenalty))
// predictOptions = append(predictOptions, ggllm.SetMlock(c.MMlock))
// predictOptions = append(predictOptions, ggllm.SetMemoryMap(c.MMap))
// predictOptions = append(predictOptions, ggllm.SetPredictionMainGPU(c.MainGPU))
// predictOptions = append(predictOptions, ggllm.SetPredictionTensorSplit(c.TensorSplit))
// predictOptions = append(predictOptions, ggllm.SetTailFreeSamplingZ(c.TFZ))
// predictOptions = append(predictOptions, ggllm.SetTypicalP(c.TypicalP))
// return predictOptions
// }
func defaultLLamaOpts(c Config) []llama.ModelOption {
llamaOpts := []llama.ModelOption{}
if c.ContextSize != 0 {
@@ -59,118 +217,6 @@ func defaultLLamaOpts(c Config) []llama.ModelOption {
return llamaOpts
}
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c Config, o *Option) (func() error, error) {
if c.Backend != model.StableDiffusionBackend {
return nil, fmt.Errorf("endpoint only working with stablediffusion models")
}
inferenceModel, err := loader.BackendLoader(c.Backend, c.ImageGenerationAssets, []llama.ModelOption{}, uint32(c.Threads), o.assetsDestination)
if err != nil {
return nil, err
}
var fn func() error
switch model := inferenceModel.(type) {
case *stablediffusion.StableDiffusion:
fn = func() error {
return model.GenerateImage(height, width, mode, step, seed, positive_prompt, negative_prompt, dst)
}
default:
fn = func() error {
return fmt.Errorf("creation of images not supported by the backend")
}
}
return func() error {
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mutexMap.Lock()
l, ok := mutexes[c.Backend]
if !ok {
m := &sync.Mutex{}
mutexes[c.Backend] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
return fn()
}, nil
}
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config, o *Option) (func() ([]float32, error), error) {
if !c.Embeddings {
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
}
modelFile := c.Model
llamaOpts := defaultLLamaOpts(c)
var inferenceModel interface{}
var err error
if c.Backend == "" {
inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
} else {
inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
}
if err != nil {
return nil, err
}
var fn func() ([]float32, error)
switch model := inferenceModel.(type) {
case *llama.LLama:
fn = func() ([]float32, error) {
predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
if len(tokens) > 0 {
return model.TokenEmbeddings(tokens, predictOptions...)
}
return model.Embeddings(s, predictOptions...)
}
// bert embeddings
case *bert.Bert:
fn = func() ([]float32, error) {
if len(tokens) > 0 {
return model.TokenEmbeddings(tokens, bert.SetThreads(c.Threads))
}
return model.Embeddings(s, bert.SetThreads(c.Threads))
}
default:
fn = func() ([]float32, error) {
return nil, fmt.Errorf("embeddings not supported by the backend")
}
}
return func() ([]float32, error) {
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mutexMap.Lock()
l, ok := mutexes[modelFile]
if !ok {
m := &sync.Mutex{}
mutexes[modelFile] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
embeds, err := fn()
if err != nil {
return embeds, err
}
// Remove trailing 0s
for i := len(embeds) - 1; i >= 0; i-- {
if embeds[i] == 0.0 {
embeds = embeds[:i]
} else {
break
}
}
return embeds, nil
}, nil
}
func buildLLamaPredictOptions(c Config, modelPath string) []llama.PredictOption {
// Generate the prediction using the language model
predictOptions := []llama.PredictOption{
@@ -253,18 +299,158 @@ func buildLLamaPredictOptions(c Config, modelPath string) []llama.PredictOption
return predictOptions
}
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c Config, o *Option) (func() error, error) {
if c.Backend != model.StableDiffusionBackend {
return nil, fmt.Errorf("endpoint only working with stablediffusion models")
}
	inferenceModel, err := loader.BackendLoader(
		model.WithBackendString(c.Backend),
		model.WithAssetDir(o.assetsDestination),
		model.WithThreads(uint32(c.Threads)),
		model.WithModelFile(c.ImageGenerationAssets),
	)
if err != nil {
return nil, err
}
var fn func() error
switch model := inferenceModel.(type) {
case *stablediffusion.StableDiffusion:
fn = func() error {
return model.GenerateImage(height, width, mode, step, seed, positive_prompt, negative_prompt, dst)
}
default:
fn = func() error {
return fmt.Errorf("creation of images not supported by the backend")
}
}
return func() error {
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mutexMap.Lock()
l, ok := mutexes[c.Backend]
if !ok {
m := &sync.Mutex{}
mutexes[c.Backend] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
return fn()
}, nil
}
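// Design note with a hedged sketch (assumed shape, inferred from the call
// sites in this diff): BackendLoader now takes functional options instead of
// positional arguments, so callers compose only the settings they need.
//
//	m, err := loader.BackendLoader(
//		model.WithBackendString(model.StableDiffusionBackend),
//		model.WithModelFile("model.bin"),
//		model.WithThreads(4),
//	)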
func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config, o *Option) (func() ([]float32, error), error) {
if !c.Embeddings {
return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
}
modelFile := c.Model
llamaOpts := defaultLLamaOpts(c)
grpcOpts := gRPCModelOpts(c)
var inferenceModel interface{}
var err error
opts := []model.Option{
model.WithLlamaOpts(llamaOpts...),
model.WithLoadGRPCOpts(grpcOpts),
model.WithThreads(uint32(c.Threads)),
model.WithAssetDir(o.assetsDestination),
model.WithModelFile(modelFile),
}
if c.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
opts = append(opts, model.WithBackendString(c.Backend))
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
return nil, err
}
var fn func() ([]float32, error)
switch model := inferenceModel.(type) {
case *llama.LLama:
fn = func() ([]float32, error) {
predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
if len(tokens) > 0 {
return model.TokenEmbeddings(tokens, predictOptions...)
}
return model.Embeddings(s, predictOptions...)
}
// bert embeddings
case *bert.Bert:
fn = func() ([]float32, error) {
if len(tokens) > 0 {
return model.TokenEmbeddings(tokens, bert.SetThreads(c.Threads))
}
return model.Embeddings(s, bert.SetThreads(c.Threads))
}
default:
fn = func() ([]float32, error) {
return nil, fmt.Errorf("embeddings not supported by the backend")
}
}
return func() ([]float32, error) {
// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
mutexMap.Lock()
l, ok := mutexes[modelFile]
if !ok {
m := &sync.Mutex{}
mutexes[modelFile] = m
l = m
}
mutexMap.Unlock()
l.Lock()
defer l.Unlock()
embeds, err := fn()
if err != nil {
return embeds, err
}
// Remove trailing 0s
for i := len(embeds) - 1; i >= 0; i-- {
if embeds[i] == 0.0 {
embeds = embeds[:i]
} else {
break
}
}
return embeds, nil
}, nil
}
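// Sketch of the trailing-zero trim above (illustrative values):
//
//	embeds := []float32{0.12, -0.5, 0, 0}
//	// after the loop: []float32{0.12, -0.5}
//	// zeros before a non-zero value are kept: {0.1, 0, 0.2} is unchanged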
func ModelInference(s string, loader *model.ModelLoader, c Config, o *Option, tokenCallback func(string) bool) (func() (string, error), error) {
supportStreams := false
modelFile := c.Model
llamaOpts := defaultLLamaOpts(c)
grpcOpts := gRPCModelOpts(c)
var inferenceModel interface{}
var err error
opts := []model.Option{
model.WithLlamaOpts(llamaOpts...),
model.WithLoadGRPCOpts(grpcOpts),
model.WithThreads(uint32(c.Threads)),
model.WithAssetDir(o.assetsDestination),
model.WithModelFile(modelFile),
}
if c.Backend == "" {
-	inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
+	inferenceModel, err = loader.GreedyLoader(opts...)
} else {
-	inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
+	opts = append(opts, model.WithBackendString(c.Backend))
+	inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
return nil, err
@@ -552,6 +738,25 @@ func ModelInference(s string, loader *model.ModelLoader, c Config, o *Option, to
model.SetTokenCallback(nil)
return str, er
}
case *grpc.Client:
// over gRPC, a backend that does not support streaming is expected to answer with one single message
supportStreams = true
fn = func() (string, error) {
opts := gRPCPredictOpts(c, loader.ModelPath)
opts.Prompt = s
if tokenCallback != nil {
ss := ""
err := model.PredictStream(context.TODO(), opts, func(s string) {
tokenCallback(s)
ss += s
})
return ss, err
} else {
reply, err := model.Predict(context.TODO(), opts)
return reply.Message, err
}
}
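	// Illustrative flow of the case above (descriptive note, not in the
	// original diff): with a tokenCallback the reply is streamed token by
	// token and accumulated in ss; without one, a single blocking Predict
	// round-trip returns the whole message.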
case *langchain.HuggingFace:
fn = func() (string, error) {