feat: move other backends to grpc

This finally makes everything more consistent Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-28 14:35:00 +00:00 · 2023-07-15 01:19:43 +02:00 · 2023-07-15 01:19:43 +02:00 · 1d0ed95a54
commit 1d0ed95a54
parent 5dcfdbe51d
54 changed files with 3171 additions and 1712 deletions
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@ -0,0 +1,42 @@
+package base
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
+)
+
+type Base struct {
+}
+
+func (llm *Base) Load(opts *pb.ModelOptions) error {
+	return fmt.Errorf("unimplemented")
+
+}
+
+func (llm *Base) Predict(opts *pb.PredictOptions) (string, error) {
+	return "", fmt.Errorf("unimplemented")
+}
+
+func (llm *Base) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	return fmt.Errorf("unimplemented")
+}
+
+func (llm *Base) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	return []float32{}, fmt.Errorf("unimplemented")
+}
+
+func (llm *Base) GenerateImage(*pb.GenerateImageRequest) error {
+	return fmt.Errorf("unimplemented")
+}
+
+func (llm *Base) AudioTranscription(*pb.TranscriptRequest) (api.Result, error) {
+	return api.Result{}, fmt.Errorf("unimplemented")
+}
+
+func (llm *Base) TTS(*pb.TTSRequest) error {
+	return fmt.Errorf("unimplemented")
+}
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@ -7,6 +7,7 @@ import (
 	"time"

 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
 )
@ -28,7 +29,7 @@ func (c *Client) HealthCheck(ctx context.Context) bool {
 		return false
 	}
 	defer conn.Close()
-	client := pb.NewLLMClient(conn)
+	client := pb.NewBackendClient(conn)

 	// The healthcheck call shouldn't take long time
 	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
@ -53,7 +54,7 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...
 		return nil, err
 	}
 	defer conn.Close()
-	client := pb.NewLLMClient(conn)
+	client := pb.NewBackendClient(conn)

 	return client.Embedding(ctx, in, opts...)
 }
@ -64,7 +65,7 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp
 		return nil, err
 	}
 	defer conn.Close()
-	client := pb.NewLLMClient(conn)
+	client := pb.NewBackendClient(conn)

 	return client.Predict(ctx, in, opts...)
 }
@ -75,7 +76,7 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
 		return nil, err
 	}
 	defer conn.Close()
-	client := pb.NewLLMClient(conn)
+	client := pb.NewBackendClient(conn)
 	return client.LoadModel(ctx, in, opts...)
 }

@ -85,7 +86,7 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
 		return err
 	}
 	defer conn.Close()
-	client := pb.NewLLMClient(conn)
+	client := pb.NewBackendClient(conn)

 	stream, err := client.PredictStream(ctx, in, opts...)
 	if err != nil {
@ -107,3 +108,53 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun

 	return nil
 }
+
+func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest, opts ...grpc.CallOption) (*pb.Result, error) {
+	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+	client := pb.NewBackendClient(conn)
+	return client.GenerateImage(ctx, in, opts...)
+}
+
+func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error) {
+	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+	client := pb.NewBackendClient(conn)
+	return client.TTS(ctx, in, opts...)
+}
+
+func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*api.Result, error) {
+	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	if err != nil {
+		return nil, err
+	}
+	defer conn.Close()
+	client := pb.NewBackendClient(conn)
+	res, err := client.AudioTranscription(ctx, in, opts...)
+	if err != nil {
+		return nil, err
+	}
+	tresult := &api.Result{}
+	for _, s := range res.Segments {
+		tks := []int{}
+		for _, t := range s.Tokens {
+			tks = append(tks, int(t))
+		}
+		tresult.Segments = append(tresult.Segments,
+			api.Segment{
+				Text:   s.Text,
+				Id:     int(s.Id),
+				Start:  time.Duration(s.Start),
+				End:    time.Duration(s.End),
+				Tokens: tks,
+			})
+	}
+	tresult.Text = res.Text
+	return tresult, err
+}
--- a/pkg/grpc/image/stablediffusion.go
+++ b/pkg/grpc/image/stablediffusion.go
@ -0,0 +1,33 @@
+package image
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
+)
+
+type StableDiffusion struct {
+	base.Base
+	stablediffusion *stablediffusion.StableDiffusion
+}
+
+func (sd *StableDiffusion) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	sd.stablediffusion, err = stablediffusion.New(opts.Model)
+	return err
+}
+
+func (sd *StableDiffusion) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return sd.stablediffusion.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Mode),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@ -2,11 +2,15 @@ package grpc

 import (
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 )

 type LLM interface {
 	Predict(*pb.PredictOptions) (string, error)
-	PredictStream(*pb.PredictOptions, chan string)
+	PredictStream(*pb.PredictOptions, chan string) error
 	Load(*pb.ModelOptions) error
 	Embeddings(*pb.PredictOptions) ([]float32, error)
+	GenerateImage(*pb.GenerateImageRequest) error
+	AudioTranscription(*pb.TranscriptRequest) (api.Result, error)
+	TTS(*pb.TTSRequest) error
 }
--- a/pkg/grpc/llm/bert/bert.go
+++ b/pkg/grpc/llm/bert/bert.go
@ -0,0 +1,33 @@
+package bert
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	bert "github.com/go-skynet/go-bert.cpp"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+)
+
+type Embeddings struct {
+	base.Base
+	bert *bert.Bert
+}
+
+func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
+	model, err := bert.New(opts.Model)
+	llm.bert = model
+	return err
+}
+
+func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
+	}
+
+	return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
+}
--- a/pkg/grpc/llm/bloomz/bloomz.go
+++ b/pkg/grpc/llm/bloomz/bloomz.go
@ -0,0 +1,59 @@
+package bloomz
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/go-skynet/bloomz.cpp"
+)
+
+type LLM struct {
+	base.Base
+
+	bloomz *bloomz.Bloomz
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	model, err := bloomz.New(opts.Model)
+	llm.bloomz = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []bloomz.PredictOption {
+	predictOptions := []bloomz.PredictOption{
+		bloomz.SetTemperature(float64(opts.Temperature)),
+		bloomz.SetTopP(float64(opts.TopP)),
+		bloomz.SetTopK(int(opts.TopK)),
+		bloomz.SetTokens(int(opts.Tokens)),
+		bloomz.SetThreads(int(opts.Threads)),
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, bloomz.SetSeed(int(opts.Seed)))
+	}
+
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+// fallback to Predict
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	go func() {
+		res, err := llm.bloomz.Predict(opts.Prompt, buildPredictOptions(opts)...)
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		results <- res
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/grpc/llm/falcon/falcon.go
+++ b/pkg/grpc/llm/falcon/falcon.go
@ -5,12 +5,15 @@ package falcon
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	ggllm "github.com/mudler/go-ggllm.cpp"
 )

 type LLM struct {
+	base.Base
+
 	falcon *ggllm.Falcon
 }

@ -42,10 +45,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func buildPredictOptions(opts *pb.PredictOptions) []ggllm.PredictOption {
 	predictOptions := []ggllm.PredictOption{
 		ggllm.SetTemperature(float64(opts.Temperature)),
@ -122,7 +121,7 @@ func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)

 	predictOptions = append(predictOptions, ggllm.SetTokenCallback(func(token string) bool {
@ -140,4 +139,6 @@ func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
 		}
 		close(results)
 	}()
+
+	return nil
 }
--- a/pkg/grpc/llm/gpt4all/gpt4all.go
+++ b/pkg/grpc/llm/gpt4all/gpt4all.go
@ -5,11 +5,14 @@ package gpt4all
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
 )

 type LLM struct {
+	base.Base
+
 	gpt4all *gpt4all.Model
 }

@ -39,7 +42,7 @@ func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.gpt4all.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)

 	go func() {
@ -54,8 +57,6 @@ func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
 		llm.gpt4all.SetTokenCallback(nil)
 		close(results)
 	}()
-}

-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return []float32{}, fmt.Errorf("not implemented")
+	return nil
 }
--- a/pkg/grpc/llm/langchain/langchain.go
+++ b/pkg/grpc/llm/langchain/langchain.go
@ -0,0 +1,58 @@
+package langchain
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/langchain"
+)
+
+type LLM struct {
+	base.Base
+
+	langchain *langchain.HuggingFace
+	model     string
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	llm.langchain, _ = langchain.NewHuggingFace(opts.Model)
+	llm.model = opts.Model
+	return nil
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	o := []langchain.PredictOption{
+		langchain.SetModel(llm.model),
+		langchain.SetMaxTokens(int(opts.Tokens)),
+		langchain.SetTemperature(float64(opts.Temperature)),
+		langchain.SetStopWords(opts.StopPrompts),
+	}
+	pred, err := llm.langchain.PredictHuggingFace(opts.Prompt, o...)
+	if err != nil {
+		return "", err
+	}
+	return pred.Completion, nil
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	o := []langchain.PredictOption{
+		langchain.SetModel(llm.model),
+		langchain.SetMaxTokens(int(opts.Tokens)),
+		langchain.SetTemperature(float64(opts.Temperature)),
+		langchain.SetStopWords(opts.StopPrompts),
+	}
+	go func() {
+		res, err := llm.langchain.PredictHuggingFace(opts.Prompt, o...)
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		results <- res.Completion
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/grpc/llm/llama/llama.go
+++ b/pkg/grpc/llm/llama/llama.go
@ -5,11 +5,14 @@ package llama
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	"github.com/go-skynet/go-llama.cpp"
 )

 type LLM struct {
+	base.Base
+
 	llama *llama.LLama
 }

@ -133,7 +136,7 @@ func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)

 	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
@ -148,6 +151,8 @@ func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) {
 		}
 		close(results)
 	}()
+
+	return nil
 }

 func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
--- a/pkg/grpc/llm/rwkv/rwkv.go
+++ b/pkg/grpc/llm/rwkv/rwkv.go
@ -0,0 +1,71 @@
+package rwkv
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+	"path/filepath"
+
+	"github.com/donomii/go-rwkv.cpp"
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+)
+
+const tokenizerSuffix = ".tokenizer.json"
+
+type LLM struct {
+	base.Base
+
+	rwkv *rwkv.RwkvState
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	modelPath := filepath.Dir(opts.Model)
+	modelFile := filepath.Base(opts.Model)
+	model := rwkv.LoadFiles(opts.Model, filepath.Join(modelPath, modelFile+tokenizerSuffix), uint32(opts.GetThreads()))
+
+	if model == nil {
+		return fmt.Errorf("could not load model")
+	}
+	llm.rwkv = model
+	return nil
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+
+	stopWord := "\n"
+	if len(opts.StopPrompts) > 0 {
+		stopWord = opts.StopPrompts[0]
+	}
+
+	if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
+		return "", err
+	}
+
+	response := llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), nil)
+
+	return response, nil
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	go func() {
+
+		stopWord := "\n"
+		if len(opts.StopPrompts) > 0 {
+			stopWord = opts.StopPrompts[0]
+		}
+
+		if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
+			fmt.Println("Error processing input: ", err)
+			return
+		}
+
+		llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
+			results <- s
+			return true
+		})
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/grpc/llm/transformers/dolly.go
+++ b/pkg/grpc/llm/transformers/dolly.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type Dolly struct {
+	base.Base
+
 	dolly *transformers.Dolly
 }

@ -20,16 +23,12 @@ func (llm *Dolly) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *Dolly) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *Dolly) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.dolly.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,6 @@ func (llm *Dolly) PredictStream(opts *pb.PredictOptions, results chan string) {
 		results <- res
 		close(results)
 	}()
+
+	return nil
 }
--- a/pkg/grpc/llm/transformers/falcon.go
+++ b/pkg/grpc/llm/transformers/falcon.go
@ -0,0 +1,43 @@
+package transformers
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
+)
+
+type Falcon struct {
+	base.Base
+
+	falcon *transformers.Falcon
+}
+
+func (llm *Falcon) Load(opts *pb.ModelOptions) error {
+	model, err := transformers.NewFalcon(opts.Model)
+	llm.falcon = model
+	return err
+}
+
+func (llm *Falcon) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+// fallback to Predict
+func (llm *Falcon) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	go func() {
+		res, err := llm.falcon.Predict(opts.Prompt, buildPredictOptions(opts)...)
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		results <- res
+		close(results)
+	}()
+
+	return nil
+}
--- a/pkg/grpc/llm/transformers/gpt2.go
+++ b/pkg/grpc/llm/transformers/gpt2.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type GPT2 struct {
+	base.Base
+
 	gpt2 *transformers.GPT2
 }

@ -20,16 +23,12 @@ func (llm *GPT2) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *GPT2) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *GPT2) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *GPT2) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *GPT2) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.gpt2.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,5 @@ func (llm *GPT2) PredictStream(opts *pb.PredictOptions, results chan string) {
 		results <- res
 		close(results)
 	}()
+	return nil
 }
--- a/pkg/grpc/llm/transformers/gptj.go
+++ b/pkg/grpc/llm/transformers/gptj.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type GPTJ struct {
+	base.Base
+
 	gptj *transformers.GPTJ
 }

@ -20,16 +23,12 @@ func (llm *GPTJ) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *GPTJ) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *GPTJ) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *GPTJ) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *GPTJ) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.gptj.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,5 @@ func (llm *GPTJ) PredictStream(opts *pb.PredictOptions, results chan string) {
 		results <- res
 		close(results)
 	}()
+	return nil
 }
--- a/pkg/grpc/llm/transformers/gptneox.go
+++ b/pkg/grpc/llm/transformers/gptneox.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type GPTNeoX struct {
+	base.Base
+
 	gptneox *transformers.GPTNeoX
 }

@ -20,16 +23,12 @@ func (llm *GPTNeoX) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *GPTNeoX) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *GPTNeoX) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *GPTNeoX) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *GPTNeoX) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.gptneox.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,5 @@ func (llm *GPTNeoX) PredictStream(opts *pb.PredictOptions, results chan string)
 		results <- res
 		close(results)
 	}()
+	return nil
 }
--- a/pkg/grpc/llm/transformers/mpt.go
+++ b/pkg/grpc/llm/transformers/mpt.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type MPT struct {
+	base.Base
+
 	mpt *transformers.MPT
 }

@ -20,16 +23,12 @@ func (llm *MPT) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *MPT) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *MPT) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *MPT) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *MPT) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.mpt.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,5 @@ func (llm *MPT) PredictStream(opts *pb.PredictOptions, results chan string) {
 		results <- res
 		close(results)
 	}()
+	return nil
 }
--- a/pkg/grpc/llm/transformers/replit.go
+++ b/pkg/grpc/llm/transformers/replit.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type Replit struct {
+	base.Base
+
 	replit *transformers.Replit
 }

@ -20,16 +23,12 @@ func (llm *Replit) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *Replit) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *Replit) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *Replit) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *Replit) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.replit.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,5 @@ func (llm *Replit) PredictStream(opts *pb.PredictOptions, results chan string) {
 		results <- res
 		close(results)
 	}()
+	return nil
 }
--- a/pkg/grpc/llm/transformers/starcoder.go
+++ b/pkg/grpc/llm/transformers/starcoder.go
@ -5,12 +5,15 @@ package transformers
 import (
 	"fmt"

+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"

 	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
 )

 type Starcoder struct {
+	base.Base
+
 	starcoder *transformers.Starcoder
 }

@ -20,16 +23,12 @@ func (llm *Starcoder) Load(opts *pb.ModelOptions) error {
 	return err
 }

-func (llm *Starcoder) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	return nil, fmt.Errorf("not implemented")
-}
-
 func (llm *Starcoder) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }

 // fallback to Predict
-func (llm *Starcoder) PredictStream(opts *pb.PredictOptions, results chan string) {
+func (llm *Starcoder) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		res, err := llm.starcoder.Predict(opts.Prompt, buildPredictOptions(opts)...)

@ -39,4 +38,6 @@ func (llm *Starcoder) PredictStream(opts *pb.PredictOptions, results chan string
 		results <- res
 		close(results)
 	}()
+
+	return nil
 }
--- a/pkg/grpc/proto/backend.pb.go
+++ b/pkg/grpc/proto/backend.pb.go
--- a/pkg/grpc/proto/llmserver.proto
+++ b/pkg/grpc/proto/llmserver.proto
@ -2,17 +2,20 @@ syntax = "proto3";

 option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
 option java_multiple_files = true;
-option java_package = "io.skynet.localai.llmserver";
-option java_outer_classname = "LLMServer";
+option java_package = "io.skynet.localai.backend";
+option java_outer_classname = "LocalAIBackend";

-package llm;
+package backend;

-service LLM {
+service Backend {
  rpc Health(HealthMessage) returns (Reply) {}
  rpc Predict(PredictOptions) returns (Reply) {}
  rpc LoadModel(ModelOptions) returns (Result) {}
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
+  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
+  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
+  rpc TTS(TTSRequest) returns (Result) {}
 }

 message HealthMessage {}
@ -87,4 +90,40 @@ message Result {

 message EmbeddingResult {
  repeated float embeddings = 1;
-}
+}
+
+message TranscriptRequest {
+  string dst = 2;
+  string language = 3;
+  uint32 threads = 4;
+}
+
+message TranscriptResult {
+  repeated TranscriptSegment segments = 1;
+  string text = 2;
+}
+
+message TranscriptSegment {
+  int32 id = 1;
+  int64 start = 2;
+  int64 end = 3;
+  string text = 4;
+  repeated int32 tokens = 5;
+}
+
+message GenerateImageRequest {
+  int32 height = 1;
+  int32 width = 2;
+  int32 mode = 3;
+  int32 step = 4;
+  int32 seed = 5;
+  string positive_prompt = 6;
+  string negative_prompt = 7;
+  string dst = 8;
+}
+
+message TTSRequest {
+  string text = 1;
+  string model = 2;
+  string dst = 3;
+}
--- a/pkg/grpc/proto/backend_grpc.pb.go
+++ b/pkg/grpc/proto/backend_grpc.pb.go
@ -0,0 +1,385 @@
+// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
+// versions:
+// - protoc-gen-go-grpc v1.2.0
+// - protoc             v3.15.8
+// source: pkg/grpc/proto/backend.proto
+
+package proto
+
+import (
+	context "context"
+	grpc "google.golang.org/grpc"
+	codes "google.golang.org/grpc/codes"
+	status "google.golang.org/grpc/status"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the grpc package it is being compiled against.
+// Requires gRPC-Go v1.32.0 or later.
+const _ = grpc.SupportPackageIsVersion7
+
+// BackendClient is the client API for Backend service.
+//
+// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
+type BackendClient interface {
+	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
+	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
+	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
+	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
+	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
+	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
+	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
+	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
+}
+
+type backendClient struct {
+	cc grpc.ClientConnInterface
+}
+
+func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
+	return &backendClient{cc}
+}
+
+func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
+	out := new(Reply)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
+	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
+	if err != nil {
+		return nil, err
+	}
+	x := &backendPredictStreamClient{stream}
+	if err := x.ClientStream.SendMsg(in); err != nil {
+		return nil, err
+	}
+	if err := x.ClientStream.CloseSend(); err != nil {
+		return nil, err
+	}
+	return x, nil
+}
+
+type Backend_PredictStreamClient interface {
+	Recv() (*Reply, error)
+	grpc.ClientStream
+}
+
+type backendPredictStreamClient struct {
+	grpc.ClientStream
+}
+
+func (x *backendPredictStreamClient) Recv() (*Reply, error) {
+	m := new(Reply)
+	if err := x.ClientStream.RecvMsg(m); err != nil {
+		return nil, err
+	}
+	return m, nil
+}
+
+func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
+	out := new(EmbeddingResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
+	out := new(TranscriptResult)
+	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
+	out := new(Result)
+	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// BackendServer is the server API for Backend service.
+// All implementations must embed UnimplementedBackendServer
+// for forward compatibility
+type BackendServer interface {
+	Health(context.Context, *HealthMessage) (*Reply, error)
+	Predict(context.Context, *PredictOptions) (*Reply, error)
+	LoadModel(context.Context, *ModelOptions) (*Result, error)
+	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
+	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
+	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
+	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
+	TTS(context.Context, *TTSRequest) (*Result, error)
+	mustEmbedUnimplementedBackendServer()
+}
+
+// UnimplementedBackendServer must be embedded to have forward compatible implementations.
+type UnimplementedBackendServer struct {
+}
+
+func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
+}
+func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
+}
+func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
+}
+func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
+	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
+}
+func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
+}
+func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
+}
+func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
+}
+func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
+	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
+}
+func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
+
+// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
+// Use of this interface is not recommended, as added methods to BackendServer will
+// result in compilation errors.
+type UnsafeBackendServer interface {
+	mustEmbedUnimplementedBackendServer()
+}
+
+func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
+	s.RegisterService(&Backend_ServiceDesc, srv)
+}
+
+func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(HealthMessage)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Health(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Health",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Predict(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Predict",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(ModelOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).LoadModel(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/LoadModel",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
+	m := new(PredictOptions)
+	if err := stream.RecvMsg(m); err != nil {
+		return err
+	}
+	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
+}
+
+type Backend_PredictStreamServer interface {
+	Send(*Reply) error
+	grpc.ServerStream
+}
+
+type backendPredictStreamServer struct {
+	grpc.ServerStream
+}
+
+func (x *backendPredictStreamServer) Send(m *Reply) error {
+	return x.ServerStream.SendMsg(m)
+}
+
+func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(PredictOptions)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).Embedding(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/Embedding",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(GenerateImageRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).GenerateImage(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/GenerateImage",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TranscriptRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).AudioTranscription(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/AudioTranscription",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
+	in := new(TTSRequest)
+	if err := dec(in); err != nil {
+		return nil, err
+	}
+	if interceptor == nil {
+		return srv.(BackendServer).TTS(ctx, in)
+	}
+	info := &grpc.UnaryServerInfo{
+		Server:     srv,
+		FullMethod: "/backend.Backend/TTS",
+	}
+	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
+		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
+	}
+	return interceptor(ctx, in, info, handler)
+}
+
+// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
+// It's only intended for direct use with grpc.RegisterService,
+// and not to be introspected or modified (even as a copy)
+var Backend_ServiceDesc = grpc.ServiceDesc{
+	ServiceName: "backend.Backend",
+	HandlerType: (*BackendServer)(nil),
+	Methods: []grpc.MethodDesc{
+		{
+			MethodName: "Health",
+			Handler:    _Backend_Health_Handler,
+		},
+		{
+			MethodName: "Predict",
+			Handler:    _Backend_Predict_Handler,
+		},
+		{
+			MethodName: "LoadModel",
+			Handler:    _Backend_LoadModel_Handler,
+		},
+		{
+			MethodName: "Embedding",
+			Handler:    _Backend_Embedding_Handler,
+		},
+		{
+			MethodName: "GenerateImage",
+			Handler:    _Backend_GenerateImage_Handler,
+		},
+		{
+			MethodName: "AudioTranscription",
+			Handler:    _Backend_AudioTranscription_Handler,
+		},
+		{
+			MethodName: "TTS",
+			Handler:    _Backend_TTS_Handler,
+		},
+	},
+	Streams: []grpc.StreamDesc{
+		{
+			StreamName:    "PredictStream",
+			Handler:       _Backend_PredictStream_Handler,
+			ServerStreams: true,
+		},
+	},
+	Metadata: "pkg/grpc/proto/backend.proto",
+}
--- a/pkg/grpc/proto/llmserver.pb.go
+++ b/pkg/grpc/proto/llmserver.pb.go
@ -1,969 +0,0 @@
-// Code generated by protoc-gen-go. DO NOT EDIT.
-// versions:
-// 	protoc-gen-go v1.26.0
-// 	protoc        v3.15.8
-// source: pkg/grpc/proto/llmserver.proto
-
-package proto
-
-import (
-	protoreflect "google.golang.org/protobuf/reflect/protoreflect"
-	protoimpl "google.golang.org/protobuf/runtime/protoimpl"
-	reflect "reflect"
-	sync "sync"
-)
-
-const (
-	// Verify that this generated code is sufficiently up-to-date.
-	_ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion)
-	// Verify that runtime/protoimpl is sufficiently up-to-date.
-	_ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20)
-)
-
-type HealthMessage struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-}
-
-func (x *HealthMessage) Reset() {
-	*x = HealthMessage{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[0]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *HealthMessage) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*HealthMessage) ProtoMessage() {}
-
-func (x *HealthMessage) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[0]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use HealthMessage.ProtoReflect.Descriptor instead.
-func (*HealthMessage) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{0}
-}
-
-// The request message containing the user's name.
-type PredictOptions struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-
-	Prompt            string   `protobuf:"bytes,1,opt,name=Prompt,proto3" json:"Prompt,omitempty"`
-	Seed              int32    `protobuf:"varint,2,opt,name=Seed,proto3" json:"Seed,omitempty"`
-	Threads           int32    `protobuf:"varint,3,opt,name=Threads,proto3" json:"Threads,omitempty"`
-	Tokens            int32    `protobuf:"varint,4,opt,name=Tokens,proto3" json:"Tokens,omitempty"`
-	TopK              int32    `protobuf:"varint,5,opt,name=TopK,proto3" json:"TopK,omitempty"`
-	Repeat            int32    `protobuf:"varint,6,opt,name=Repeat,proto3" json:"Repeat,omitempty"`
-	Batch             int32    `protobuf:"varint,7,opt,name=Batch,proto3" json:"Batch,omitempty"`
-	NKeep             int32    `protobuf:"varint,8,opt,name=NKeep,proto3" json:"NKeep,omitempty"`
-	Temperature       float32  `protobuf:"fixed32,9,opt,name=Temperature,proto3" json:"Temperature,omitempty"`
-	Penalty           float32  `protobuf:"fixed32,10,opt,name=Penalty,proto3" json:"Penalty,omitempty"`
-	F16KV             bool     `protobuf:"varint,11,opt,name=F16KV,proto3" json:"F16KV,omitempty"`
-	DebugMode         bool     `protobuf:"varint,12,opt,name=DebugMode,proto3" json:"DebugMode,omitempty"`
-	StopPrompts       []string `protobuf:"bytes,13,rep,name=StopPrompts,proto3" json:"StopPrompts,omitempty"`
-	IgnoreEOS         bool     `protobuf:"varint,14,opt,name=IgnoreEOS,proto3" json:"IgnoreEOS,omitempty"`
-	TailFreeSamplingZ float32  `protobuf:"fixed32,15,opt,name=TailFreeSamplingZ,proto3" json:"TailFreeSamplingZ,omitempty"`
-	TypicalP          float32  `protobuf:"fixed32,16,opt,name=TypicalP,proto3" json:"TypicalP,omitempty"`
-	FrequencyPenalty  float32  `protobuf:"fixed32,17,opt,name=FrequencyPenalty,proto3" json:"FrequencyPenalty,omitempty"`
-	PresencePenalty   float32  `protobuf:"fixed32,18,opt,name=PresencePenalty,proto3" json:"PresencePenalty,omitempty"`
-	Mirostat          int32    `protobuf:"varint,19,opt,name=Mirostat,proto3" json:"Mirostat,omitempty"`
-	MirostatETA       float32  `protobuf:"fixed32,20,opt,name=MirostatETA,proto3" json:"MirostatETA,omitempty"`
-	MirostatTAU       float32  `protobuf:"fixed32,21,opt,name=MirostatTAU,proto3" json:"MirostatTAU,omitempty"`
-	PenalizeNL        bool     `protobuf:"varint,22,opt,name=PenalizeNL,proto3" json:"PenalizeNL,omitempty"`
-	LogitBias         string   `protobuf:"bytes,23,opt,name=LogitBias,proto3" json:"LogitBias,omitempty"`
-	MLock             bool     `protobuf:"varint,25,opt,name=MLock,proto3" json:"MLock,omitempty"`
-	MMap              bool     `protobuf:"varint,26,opt,name=MMap,proto3" json:"MMap,omitempty"`
-	PromptCacheAll    bool     `protobuf:"varint,27,opt,name=PromptCacheAll,proto3" json:"PromptCacheAll,omitempty"`
-	PromptCacheRO     bool     `protobuf:"varint,28,opt,name=PromptCacheRO,proto3" json:"PromptCacheRO,omitempty"`
-	Grammar           string   `protobuf:"bytes,29,opt,name=Grammar,proto3" json:"Grammar,omitempty"`
-	MainGPU           string   `protobuf:"bytes,30,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
-	TensorSplit       string   `protobuf:"bytes,31,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
-	TopP              float32  `protobuf:"fixed32,32,opt,name=TopP,proto3" json:"TopP,omitempty"`
-	PromptCachePath   string   `protobuf:"bytes,33,opt,name=PromptCachePath,proto3" json:"PromptCachePath,omitempty"`
-	Debug             bool     `protobuf:"varint,34,opt,name=Debug,proto3" json:"Debug,omitempty"`
-	EmbeddingTokens   []int32  `protobuf:"varint,35,rep,packed,name=EmbeddingTokens,proto3" json:"EmbeddingTokens,omitempty"`
-	Embeddings        string   `protobuf:"bytes,36,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
-}
-
-func (x *PredictOptions) Reset() {
-	*x = PredictOptions{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[1]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *PredictOptions) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*PredictOptions) ProtoMessage() {}
-
-func (x *PredictOptions) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[1]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use PredictOptions.ProtoReflect.Descriptor instead.
-func (*PredictOptions) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{1}
-}
-
-func (x *PredictOptions) GetPrompt() string {
-	if x != nil {
-		return x.Prompt
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetSeed() int32 {
-	if x != nil {
-		return x.Seed
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetThreads() int32 {
-	if x != nil {
-		return x.Threads
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetTokens() int32 {
-	if x != nil {
-		return x.Tokens
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetTopK() int32 {
-	if x != nil {
-		return x.TopK
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetRepeat() int32 {
-	if x != nil {
-		return x.Repeat
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetBatch() int32 {
-	if x != nil {
-		return x.Batch
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetNKeep() int32 {
-	if x != nil {
-		return x.NKeep
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetTemperature() float32 {
-	if x != nil {
-		return x.Temperature
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetPenalty() float32 {
-	if x != nil {
-		return x.Penalty
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetF16KV() bool {
-	if x != nil {
-		return x.F16KV
-	}
-	return false
-}
-
-func (x *PredictOptions) GetDebugMode() bool {
-	if x != nil {
-		return x.DebugMode
-	}
-	return false
-}
-
-func (x *PredictOptions) GetStopPrompts() []string {
-	if x != nil {
-		return x.StopPrompts
-	}
-	return nil
-}
-
-func (x *PredictOptions) GetIgnoreEOS() bool {
-	if x != nil {
-		return x.IgnoreEOS
-	}
-	return false
-}
-
-func (x *PredictOptions) GetTailFreeSamplingZ() float32 {
-	if x != nil {
-		return x.TailFreeSamplingZ
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetTypicalP() float32 {
-	if x != nil {
-		return x.TypicalP
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetFrequencyPenalty() float32 {
-	if x != nil {
-		return x.FrequencyPenalty
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetPresencePenalty() float32 {
-	if x != nil {
-		return x.PresencePenalty
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetMirostat() int32 {
-	if x != nil {
-		return x.Mirostat
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetMirostatETA() float32 {
-	if x != nil {
-		return x.MirostatETA
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetMirostatTAU() float32 {
-	if x != nil {
-		return x.MirostatTAU
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetPenalizeNL() bool {
-	if x != nil {
-		return x.PenalizeNL
-	}
-	return false
-}
-
-func (x *PredictOptions) GetLogitBias() string {
-	if x != nil {
-		return x.LogitBias
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetMLock() bool {
-	if x != nil {
-		return x.MLock
-	}
-	return false
-}
-
-func (x *PredictOptions) GetMMap() bool {
-	if x != nil {
-		return x.MMap
-	}
-	return false
-}
-
-func (x *PredictOptions) GetPromptCacheAll() bool {
-	if x != nil {
-		return x.PromptCacheAll
-	}
-	return false
-}
-
-func (x *PredictOptions) GetPromptCacheRO() bool {
-	if x != nil {
-		return x.PromptCacheRO
-	}
-	return false
-}
-
-func (x *PredictOptions) GetGrammar() string {
-	if x != nil {
-		return x.Grammar
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetMainGPU() string {
-	if x != nil {
-		return x.MainGPU
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetTensorSplit() string {
-	if x != nil {
-		return x.TensorSplit
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetTopP() float32 {
-	if x != nil {
-		return x.TopP
-	}
-	return 0
-}
-
-func (x *PredictOptions) GetPromptCachePath() string {
-	if x != nil {
-		return x.PromptCachePath
-	}
-	return ""
-}
-
-func (x *PredictOptions) GetDebug() bool {
-	if x != nil {
-		return x.Debug
-	}
-	return false
-}
-
-func (x *PredictOptions) GetEmbeddingTokens() []int32 {
-	if x != nil {
-		return x.EmbeddingTokens
-	}
-	return nil
-}
-
-func (x *PredictOptions) GetEmbeddings() string {
-	if x != nil {
-		return x.Embeddings
-	}
-	return ""
-}
-
-// The response message containing the result
-type Reply struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-
-	Message string `protobuf:"bytes,1,opt,name=message,proto3" json:"message,omitempty"`
-}
-
-func (x *Reply) Reset() {
-	*x = Reply{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[2]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *Reply) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*Reply) ProtoMessage() {}
-
-func (x *Reply) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[2]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use Reply.ProtoReflect.Descriptor instead.
-func (*Reply) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{2}
-}
-
-func (x *Reply) GetMessage() string {
-	if x != nil {
-		return x.Message
-	}
-	return ""
-}
-
-type ModelOptions struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-
-	Model             string `protobuf:"bytes,1,opt,name=Model,proto3" json:"Model,omitempty"`
-	ContextSize       int32  `protobuf:"varint,2,opt,name=ContextSize,proto3" json:"ContextSize,omitempty"`
-	Seed              int32  `protobuf:"varint,3,opt,name=Seed,proto3" json:"Seed,omitempty"`
-	NBatch            int32  `protobuf:"varint,4,opt,name=NBatch,proto3" json:"NBatch,omitempty"`
-	F16Memory         bool   `protobuf:"varint,5,opt,name=F16Memory,proto3" json:"F16Memory,omitempty"`
-	MLock             bool   `protobuf:"varint,6,opt,name=MLock,proto3" json:"MLock,omitempty"`
-	MMap              bool   `protobuf:"varint,7,opt,name=MMap,proto3" json:"MMap,omitempty"`
-	VocabOnly         bool   `protobuf:"varint,8,opt,name=VocabOnly,proto3" json:"VocabOnly,omitempty"`
-	LowVRAM           bool   `protobuf:"varint,9,opt,name=LowVRAM,proto3" json:"LowVRAM,omitempty"`
-	Embeddings        bool   `protobuf:"varint,10,opt,name=Embeddings,proto3" json:"Embeddings,omitempty"`
-	NUMA              bool   `protobuf:"varint,11,opt,name=NUMA,proto3" json:"NUMA,omitempty"`
-	NGPULayers        int32  `protobuf:"varint,12,opt,name=NGPULayers,proto3" json:"NGPULayers,omitempty"`
-	MainGPU           string `protobuf:"bytes,13,opt,name=MainGPU,proto3" json:"MainGPU,omitempty"`
-	TensorSplit       string `protobuf:"bytes,14,opt,name=TensorSplit,proto3" json:"TensorSplit,omitempty"`
-	Threads           int32  `protobuf:"varint,15,opt,name=Threads,proto3" json:"Threads,omitempty"`
-	LibrarySearchPath string `protobuf:"bytes,16,opt,name=LibrarySearchPath,proto3" json:"LibrarySearchPath,omitempty"`
-}
-
-func (x *ModelOptions) Reset() {
-	*x = ModelOptions{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[3]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *ModelOptions) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*ModelOptions) ProtoMessage() {}
-
-func (x *ModelOptions) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[3]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use ModelOptions.ProtoReflect.Descriptor instead.
-func (*ModelOptions) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{3}
-}
-
-func (x *ModelOptions) GetModel() string {
-	if x != nil {
-		return x.Model
-	}
-	return ""
-}
-
-func (x *ModelOptions) GetContextSize() int32 {
-	if x != nil {
-		return x.ContextSize
-	}
-	return 0
-}
-
-func (x *ModelOptions) GetSeed() int32 {
-	if x != nil {
-		return x.Seed
-	}
-	return 0
-}
-
-func (x *ModelOptions) GetNBatch() int32 {
-	if x != nil {
-		return x.NBatch
-	}
-	return 0
-}
-
-func (x *ModelOptions) GetF16Memory() bool {
-	if x != nil {
-		return x.F16Memory
-	}
-	return false
-}
-
-func (x *ModelOptions) GetMLock() bool {
-	if x != nil {
-		return x.MLock
-	}
-	return false
-}
-
-func (x *ModelOptions) GetMMap() bool {
-	if x != nil {
-		return x.MMap
-	}
-	return false
-}
-
-func (x *ModelOptions) GetVocabOnly() bool {
-	if x != nil {
-		return x.VocabOnly
-	}
-	return false
-}
-
-func (x *ModelOptions) GetLowVRAM() bool {
-	if x != nil {
-		return x.LowVRAM
-	}
-	return false
-}
-
-func (x *ModelOptions) GetEmbeddings() bool {
-	if x != nil {
-		return x.Embeddings
-	}
-	return false
-}
-
-func (x *ModelOptions) GetNUMA() bool {
-	if x != nil {
-		return x.NUMA
-	}
-	return false
-}
-
-func (x *ModelOptions) GetNGPULayers() int32 {
-	if x != nil {
-		return x.NGPULayers
-	}
-	return 0
-}
-
-func (x *ModelOptions) GetMainGPU() string {
-	if x != nil {
-		return x.MainGPU
-	}
-	return ""
-}
-
-func (x *ModelOptions) GetTensorSplit() string {
-	if x != nil {
-		return x.TensorSplit
-	}
-	return ""
-}
-
-func (x *ModelOptions) GetThreads() int32 {
-	if x != nil {
-		return x.Threads
-	}
-	return 0
-}
-
-func (x *ModelOptions) GetLibrarySearchPath() string {
-	if x != nil {
-		return x.LibrarySearchPath
-	}
-	return ""
-}
-
-type Result struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-
-	Message string `protobuf:"bytes,1,opt,name=message,proto3" json:"message,omitempty"`
-	Success bool   `protobuf:"varint,2,opt,name=success,proto3" json:"success,omitempty"`
-}
-
-func (x *Result) Reset() {
-	*x = Result{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[4]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *Result) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*Result) ProtoMessage() {}
-
-func (x *Result) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[4]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use Result.ProtoReflect.Descriptor instead.
-func (*Result) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{4}
-}
-
-func (x *Result) GetMessage() string {
-	if x != nil {
-		return x.Message
-	}
-	return ""
-}
-
-func (x *Result) GetSuccess() bool {
-	if x != nil {
-		return x.Success
-	}
-	return false
-}
-
-type EmbeddingResult struct {
-	state         protoimpl.MessageState
-	sizeCache     protoimpl.SizeCache
-	unknownFields protoimpl.UnknownFields
-
-	Embeddings []float32 `protobuf:"fixed32,1,rep,packed,name=embeddings,proto3" json:"embeddings,omitempty"`
-}
-
-func (x *EmbeddingResult) Reset() {
-	*x = EmbeddingResult{}
-	if protoimpl.UnsafeEnabled {
-		mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[5]
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		ms.StoreMessageInfo(mi)
-	}
-}
-
-func (x *EmbeddingResult) String() string {
-	return protoimpl.X.MessageStringOf(x)
-}
-
-func (*EmbeddingResult) ProtoMessage() {}
-
-func (x *EmbeddingResult) ProtoReflect() protoreflect.Message {
-	mi := &file_pkg_grpc_proto_llmserver_proto_msgTypes[5]
-	if protoimpl.UnsafeEnabled && x != nil {
-		ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x))
-		if ms.LoadMessageInfo() == nil {
-			ms.StoreMessageInfo(mi)
-		}
-		return ms
-	}
-	return mi.MessageOf(x)
-}
-
-// Deprecated: Use EmbeddingResult.ProtoReflect.Descriptor instead.
-func (*EmbeddingResult) Descriptor() ([]byte, []int) {
-	return file_pkg_grpc_proto_llmserver_proto_rawDescGZIP(), []int{5}
-}
-
-func (x *EmbeddingResult) GetEmbeddings() []float32 {
-	if x != nil {
-		return x.Embeddings
-	}
-	return nil
-}
-
-var File_pkg_grpc_proto_llmserver_proto protoreflect.FileDescriptor
-
-var file_pkg_grpc_proto_llmserver_proto_rawDesc = []byte{
-	0x0a, 0x1e, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-	0x2f, 0x6c, 0x6c, 0x6d, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f,
-	0x12, 0x03, 0x6c, 0x6c, 0x6d, 0x22, 0x0f, 0x0a, 0x0d, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x4d,
-	0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0xa0, 0x08, 0x0a, 0x0e, 0x50, 0x72, 0x65, 0x64, 0x69,
-	0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x50, 0x72, 0x6f,
-	0x6d, 0x70, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x50, 0x72, 0x6f, 0x6d, 0x70,
-	0x74, 0x12, 0x12, 0x0a, 0x04, 0x53, 0x65, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52,
-	0x04, 0x53, 0x65, 0x65, 0x64, 0x12, 0x18, 0x0a, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73,
-	0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12,
-	0x16, 0x0a, 0x06, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52,
-	0x06, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x54, 0x6f, 0x70, 0x4b, 0x18,
-	0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x04, 0x54, 0x6f, 0x70, 0x4b, 0x12, 0x16, 0x0a, 0x06, 0x52,
-	0x65, 0x70, 0x65, 0x61, 0x74, 0x18, 0x06, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x52, 0x65, 0x70,
-	0x65, 0x61, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x42, 0x61, 0x74, 0x63, 0x68, 0x18, 0x07, 0x20, 0x01,
-	0x28, 0x05, 0x52, 0x05, 0x42, 0x61, 0x74, 0x63, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x4e, 0x4b, 0x65,
-	0x65, 0x70, 0x18, 0x08, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x4e, 0x4b, 0x65, 0x65, 0x70, 0x12,
-	0x20, 0x0a, 0x0b, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x18, 0x09,
-	0x20, 0x01, 0x28, 0x02, 0x52, 0x0b, 0x54, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72,
-	0x65, 0x12, 0x18, 0x0a, 0x07, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x18, 0x0a, 0x20, 0x01,
-	0x28, 0x02, 0x52, 0x07, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x46,
-	0x31, 0x36, 0x4b, 0x56, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x46, 0x31, 0x36, 0x4b,
-	0x56, 0x12, 0x1c, 0x0a, 0x09, 0x44, 0x65, 0x62, 0x75, 0x67, 0x4d, 0x6f, 0x64, 0x65, 0x18, 0x0c,
-	0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x44, 0x65, 0x62, 0x75, 0x67, 0x4d, 0x6f, 0x64, 0x65, 0x12,
-	0x20, 0x0a, 0x0b, 0x53, 0x74, 0x6f, 0x70, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x73, 0x18, 0x0d,
-	0x20, 0x03, 0x28, 0x09, 0x52, 0x0b, 0x53, 0x74, 0x6f, 0x70, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-	0x73, 0x12, 0x1c, 0x0a, 0x09, 0x49, 0x67, 0x6e, 0x6f, 0x72, 0x65, 0x45, 0x4f, 0x53, 0x18, 0x0e,
-	0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x49, 0x67, 0x6e, 0x6f, 0x72, 0x65, 0x45, 0x4f, 0x53, 0x12,
-	0x2c, 0x0a, 0x11, 0x54, 0x61, 0x69, 0x6c, 0x46, 0x72, 0x65, 0x65, 0x53, 0x61, 0x6d, 0x70, 0x6c,
-	0x69, 0x6e, 0x67, 0x5a, 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x52, 0x11, 0x54, 0x61, 0x69, 0x6c,
-	0x46, 0x72, 0x65, 0x65, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x5a, 0x12, 0x1a, 0x0a,
-	0x08, 0x54, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x50, 0x18, 0x10, 0x20, 0x01, 0x28, 0x02, 0x52,
-	0x08, 0x54, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x50, 0x12, 0x2a, 0x0a, 0x10, 0x46, 0x72, 0x65,
-	0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x18, 0x11, 0x20,
-	0x01, 0x28, 0x02, 0x52, 0x10, 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x50, 0x65,
-	0x6e, 0x61, 0x6c, 0x74, 0x79, 0x12, 0x28, 0x0a, 0x0f, 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63,
-	0x65, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x18, 0x12, 0x20, 0x01, 0x28, 0x02, 0x52, 0x0f,
-	0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x12,
-	0x1a, 0x0a, 0x08, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x18, 0x13, 0x20, 0x01, 0x28,
-	0x05, 0x52, 0x08, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x12, 0x20, 0x0a, 0x0b, 0x4d,
-	0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x45, 0x54, 0x41, 0x18, 0x14, 0x20, 0x01, 0x28, 0x02,
-	0x52, 0x0b, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x45, 0x54, 0x41, 0x12, 0x20, 0x0a,
-	0x0b, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x54, 0x41, 0x55, 0x18, 0x15, 0x20, 0x01,
-	0x28, 0x02, 0x52, 0x0b, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x54, 0x41, 0x55, 0x12,
-	0x1e, 0x0a, 0x0a, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x4e, 0x4c, 0x18, 0x16, 0x20,
-	0x01, 0x28, 0x08, 0x52, 0x0a, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x4e, 0x4c, 0x12,
-	0x1c, 0x0a, 0x09, 0x4c, 0x6f, 0x67, 0x69, 0x74, 0x42, 0x69, 0x61, 0x73, 0x18, 0x17, 0x20, 0x01,
-	0x28, 0x09, 0x52, 0x09, 0x4c, 0x6f, 0x67, 0x69, 0x74, 0x42, 0x69, 0x61, 0x73, 0x12, 0x14, 0x0a,
-	0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x18, 0x19, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x4d, 0x4c,
-	0x6f, 0x63, 0x6b, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x18, 0x1a, 0x20, 0x01, 0x28,
-	0x08, 0x52, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x12, 0x26, 0x0a, 0x0e, 0x50, 0x72, 0x6f, 0x6d, 0x70,
-	0x74, 0x43, 0x61, 0x63, 0x68, 0x65, 0x41, 0x6c, 0x6c, 0x18, 0x1b, 0x20, 0x01, 0x28, 0x08, 0x52,
-	0x0e, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x61, 0x63, 0x68, 0x65, 0x41, 0x6c, 0x6c, 0x12,
-	0x24, 0x0a, 0x0d, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x61, 0x63, 0x68, 0x65, 0x52, 0x4f,
-	0x18, 0x1c, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0d, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x61,
-	0x63, 0x68, 0x65, 0x52, 0x4f, 0x12, 0x18, 0x0a, 0x07, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72,
-	0x18, 0x1d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x12,
-	0x18, 0x0a, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47, 0x50, 0x55, 0x18, 0x1e, 0x20, 0x01, 0x28, 0x09,
-	0x52, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47, 0x50, 0x55, 0x12, 0x20, 0x0a, 0x0b, 0x54, 0x65, 0x6e,
-	0x73, 0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x18, 0x1f, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b,
-	0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x54,
-	0x6f, 0x70, 0x50, 0x18, 0x20, 0x20, 0x01, 0x28, 0x02, 0x52, 0x04, 0x54, 0x6f, 0x70, 0x50, 0x12,
-	0x28, 0x0a, 0x0f, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x61, 0x63, 0x68, 0x65, 0x50, 0x61,
-	0x74, 0x68, 0x18, 0x21, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-	0x43, 0x61, 0x63, 0x68, 0x65, 0x50, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x44, 0x65, 0x62,
-	0x75, 0x67, 0x18, 0x22, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x44, 0x65, 0x62, 0x75, 0x67, 0x12,
-	0x28, 0x0a, 0x0f, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x54, 0x6f, 0x6b, 0x65,
-	0x6e, 0x73, 0x18, 0x23, 0x20, 0x03, 0x28, 0x05, 0x52, 0x0f, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64,
-	0x69, 0x6e, 0x67, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x12, 0x1e, 0x0a, 0x0a, 0x45, 0x6d, 0x62,
-	0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x45,
-	0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x21, 0x0a, 0x05, 0x52, 0x65, 0x70,
-	0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01, 0x20,
-	0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0xca, 0x03, 0x0a,
-	0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x14, 0x0a,
-	0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x4d, 0x6f,
-	0x64, 0x65, 0x6c, 0x12, 0x20, 0x0a, 0x0b, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x53, 0x69,
-	0x7a, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78,
-	0x74, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x53, 0x65, 0x65, 0x64, 0x18, 0x03, 0x20,
-	0x01, 0x28, 0x05, 0x52, 0x04, 0x53, 0x65, 0x65, 0x64, 0x12, 0x16, 0x0a, 0x06, 0x4e, 0x42, 0x61,
-	0x74, 0x63, 0x68, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x06, 0x4e, 0x42, 0x61, 0x74, 0x63,
-	0x68, 0x12, 0x1c, 0x0a, 0x09, 0x46, 0x31, 0x36, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x05,
-	0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x46, 0x31, 0x36, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12,
-	0x14, 0x0a, 0x05, 0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05,
-	0x4d, 0x4c, 0x6f, 0x63, 0x6b, 0x12, 0x12, 0x0a, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x18, 0x07, 0x20,
-	0x01, 0x28, 0x08, 0x52, 0x04, 0x4d, 0x4d, 0x61, 0x70, 0x12, 0x1c, 0x0a, 0x09, 0x56, 0x6f, 0x63,
-	0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x18, 0x08, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x56, 0x6f,
-	0x63, 0x61, 0x62, 0x4f, 0x6e, 0x6c, 0x79, 0x12, 0x18, 0x0a, 0x07, 0x4c, 0x6f, 0x77, 0x56, 0x52,
-	0x41, 0x4d, 0x18, 0x09, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x4c, 0x6f, 0x77, 0x56, 0x52, 0x41,
-	0x4d, 0x12, 0x1e, 0x0a, 0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x18,
-	0x0a, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67,
-	0x73, 0x12, 0x12, 0x0a, 0x04, 0x4e, 0x55, 0x4d, 0x41, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x08, 0x52,
-	0x04, 0x4e, 0x55, 0x4d, 0x41, 0x12, 0x1e, 0x0a, 0x0a, 0x4e, 0x47, 0x50, 0x55, 0x4c, 0x61, 0x79,
-	0x65, 0x72, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x4e, 0x47, 0x50, 0x55, 0x4c,
-	0x61, 0x79, 0x65, 0x72, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47, 0x50, 0x55,
-	0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x4d, 0x61, 0x69, 0x6e, 0x47, 0x50, 0x55, 0x12,
-	0x20, 0x0a, 0x0b, 0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x18, 0x0e,
-	0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x54, 0x65, 0x6e, 0x73, 0x6f, 0x72, 0x53, 0x70, 0x6c, 0x69,
-	0x74, 0x12, 0x18, 0x0a, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x18, 0x0f, 0x20, 0x01,
-	0x28, 0x05, 0x52, 0x07, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12, 0x2c, 0x0a, 0x11, 0x4c,
-	0x69, 0x62, 0x72, 0x61, 0x72, 0x79, 0x53, 0x65, 0x61, 0x72, 0x63, 0x68, 0x50, 0x61, 0x74, 0x68,
-	0x18, 0x10, 0x20, 0x01, 0x28, 0x09, 0x52, 0x11, 0x4c, 0x69, 0x62, 0x72, 0x61, 0x72, 0x79, 0x53,
-	0x65, 0x61, 0x72, 0x63, 0x68, 0x50, 0x61, 0x74, 0x68, 0x22, 0x3c, 0x0a, 0x06, 0x52, 0x65, 0x73,
-	0x75, 0x6c, 0x74, 0x12, 0x18, 0x0a, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x18, 0x01,
-	0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x12, 0x18, 0x0a,
-	0x07, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07,
-	0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x22, 0x31, 0x0a, 0x0f, 0x45, 0x6d, 0x62, 0x65, 0x64,
-	0x64, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x12, 0x1e, 0x0a, 0x0a, 0x65, 0x6d,
-	0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x02, 0x52, 0x0a,
-	0x65, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x73, 0x32, 0xfe, 0x01, 0x0a, 0x03, 0x4c,
-	0x4c, 0x4d, 0x12, 0x2a, 0x0a, 0x06, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x12, 0x12, 0x2e, 0x6c,
-	0x6c, 0x6d, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-	0x1a, 0x0a, 0x2e, 0x6c, 0x6c, 0x6d, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x2c,
-	0x0a, 0x07, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x12, 0x13, 0x2e, 0x6c, 0x6c, 0x6d, 0x2e,
-	0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0a,
-	0x2e, 0x6c, 0x6c, 0x6d, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x2d, 0x0a, 0x09,
-	0x4c, 0x6f, 0x61, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x11, 0x2e, 0x6c, 0x6c, 0x6d, 0x2e,
-	0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x1a, 0x0b, 0x2e, 0x6c,
-	0x6c, 0x6d, 0x2e, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x12, 0x34, 0x0a, 0x0d, 0x50,
-	0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x53, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x12, 0x13, 0x2e, 0x6c,
-	0x6c, 0x6d, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e,
-	0x73, 0x1a, 0x0a, 0x2e, 0x6c, 0x6c, 0x6d, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x30,
-	0x01, 0x12, 0x38, 0x0a, 0x09, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x12, 0x13,
-	0x2e, 0x6c, 0x6c, 0x6d, 0x2e, 0x50, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x4f, 0x70, 0x74, 0x69,
-	0x6f, 0x6e, 0x73, 0x1a, 0x14, 0x2e, 0x6c, 0x6c, 0x6d, 0x2e, 0x45, 0x6d, 0x62, 0x65, 0x64, 0x64,
-	0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x22, 0x00, 0x42, 0x57, 0x0a, 0x1b, 0x69,
-	0x6f, 0x2e, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x61, 0x69,
-	0x2e, 0x6c, 0x6c, 0x6d, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x42, 0x09, 0x4c, 0x4c, 0x4d, 0x53,
-	0x65, 0x72, 0x76, 0x65, 0x72, 0x50, 0x01, 0x5a, 0x2b, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e,
-	0x63, 0x6f, 0x6d, 0x2f, 0x67, 0x6f, 0x2d, 0x73, 0x6b, 0x79, 0x6e, 0x65, 0x74, 0x2f, 0x4c, 0x6f,
-	0x63, 0x61, 0x6c, 0x41, 0x49, 0x2f, 0x70, 0x6b, 0x67, 0x2f, 0x67, 0x72, 0x70, 0x63, 0x2f, 0x70,
-	0x72, 0x6f, 0x74, 0x6f, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33,
-}
-
-var (
-	file_pkg_grpc_proto_llmserver_proto_rawDescOnce sync.Once
-	file_pkg_grpc_proto_llmserver_proto_rawDescData = file_pkg_grpc_proto_llmserver_proto_rawDesc
-)
-
-func file_pkg_grpc_proto_llmserver_proto_rawDescGZIP() []byte {
-	file_pkg_grpc_proto_llmserver_proto_rawDescOnce.Do(func() {
-		file_pkg_grpc_proto_llmserver_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_grpc_proto_llmserver_proto_rawDescData)
-	})
-	return file_pkg_grpc_proto_llmserver_proto_rawDescData
-}
-
-var file_pkg_grpc_proto_llmserver_proto_msgTypes = make([]protoimpl.MessageInfo, 6)
-var file_pkg_grpc_proto_llmserver_proto_goTypes = []interface{}{
-	(*HealthMessage)(nil),   // 0: llm.HealthMessage
-	(*PredictOptions)(nil),  // 1: llm.PredictOptions
-	(*Reply)(nil),           // 2: llm.Reply
-	(*ModelOptions)(nil),    // 3: llm.ModelOptions
-	(*Result)(nil),          // 4: llm.Result
-	(*EmbeddingResult)(nil), // 5: llm.EmbeddingResult
-}
-var file_pkg_grpc_proto_llmserver_proto_depIdxs = []int32{
-	0, // 0: llm.LLM.Health:input_type -> llm.HealthMessage
-	1, // 1: llm.LLM.Predict:input_type -> llm.PredictOptions
-	3, // 2: llm.LLM.LoadModel:input_type -> llm.ModelOptions
-	1, // 3: llm.LLM.PredictStream:input_type -> llm.PredictOptions
-	1, // 4: llm.LLM.Embedding:input_type -> llm.PredictOptions
-	2, // 5: llm.LLM.Health:output_type -> llm.Reply
-	2, // 6: llm.LLM.Predict:output_type -> llm.Reply
-	4, // 7: llm.LLM.LoadModel:output_type -> llm.Result
-	2, // 8: llm.LLM.PredictStream:output_type -> llm.Reply
-	5, // 9: llm.LLM.Embedding:output_type -> llm.EmbeddingResult
-	5, // [5:10] is the sub-list for method output_type
-	0, // [0:5] is the sub-list for method input_type
-	0, // [0:0] is the sub-list for extension type_name
-	0, // [0:0] is the sub-list for extension extendee
-	0, // [0:0] is the sub-list for field type_name
-}
-
-func init() { file_pkg_grpc_proto_llmserver_proto_init() }
-func file_pkg_grpc_proto_llmserver_proto_init() {
-	if File_pkg_grpc_proto_llmserver_proto != nil {
-		return
-	}
-	if !protoimpl.UnsafeEnabled {
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*HealthMessage); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*PredictOptions); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*Reply); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*ModelOptions); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*Result); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-		file_pkg_grpc_proto_llmserver_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} {
-			switch v := v.(*EmbeddingResult); i {
-			case 0:
-				return &v.state
-			case 1:
-				return &v.sizeCache
-			case 2:
-				return &v.unknownFields
-			default:
-				return nil
-			}
-		}
-	}
-	type x struct{}
-	out := protoimpl.TypeBuilder{
-		File: protoimpl.DescBuilder{
-			GoPackagePath: reflect.TypeOf(x{}).PkgPath(),
-			RawDescriptor: file_pkg_grpc_proto_llmserver_proto_rawDesc,
-			NumEnums:      0,
-			NumMessages:   6,
-			NumExtensions: 0,
-			NumServices:   1,
-		},
-		GoTypes:           file_pkg_grpc_proto_llmserver_proto_goTypes,
-		DependencyIndexes: file_pkg_grpc_proto_llmserver_proto_depIdxs,
-		MessageInfos:      file_pkg_grpc_proto_llmserver_proto_msgTypes,
-	}.Build()
-	File_pkg_grpc_proto_llmserver_proto = out.File
-	file_pkg_grpc_proto_llmserver_proto_rawDesc = nil
-	file_pkg_grpc_proto_llmserver_proto_goTypes = nil
-	file_pkg_grpc_proto_llmserver_proto_depIdxs = nil
-}
--- a/pkg/grpc/proto/llmserver_grpc.pb.go
+++ b/pkg/grpc/proto/llmserver_grpc.pb.go
@ -1,277 +0,0 @@
-// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
-// versions:
-// - protoc-gen-go-grpc v1.2.0
-// - protoc             v3.15.8
-// source: pkg/grpc/proto/llmserver.proto
-
-package proto
-
-import (
-	context "context"
-	grpc "google.golang.org/grpc"
-	codes "google.golang.org/grpc/codes"
-	status "google.golang.org/grpc/status"
-)
-
-// This is a compile-time assertion to ensure that this generated file
-// is compatible with the grpc package it is being compiled against.
-// Requires gRPC-Go v1.32.0 or later.
-const _ = grpc.SupportPackageIsVersion7
-
-// LLMClient is the client API for LLM service.
-//
-// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
-type LLMClient interface {
-	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
-	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
-	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
-	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (LLM_PredictStreamClient, error)
-	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
-}
-
-type lLMClient struct {
-	cc grpc.ClientConnInterface
-}
-
-func NewLLMClient(cc grpc.ClientConnInterface) LLMClient {
-	return &lLMClient{cc}
-}
-
-func (c *lLMClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/llm.LLM/Health", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *lLMClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/llm.LLM/Predict", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *lLMClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/llm.LLM/LoadModel", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *lLMClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (LLM_PredictStreamClient, error) {
-	stream, err := c.cc.NewStream(ctx, &LLM_ServiceDesc.Streams[0], "/llm.LLM/PredictStream", opts...)
-	if err != nil {
-		return nil, err
-	}
-	x := &lLMPredictStreamClient{stream}
-	if err := x.ClientStream.SendMsg(in); err != nil {
-		return nil, err
-	}
-	if err := x.ClientStream.CloseSend(); err != nil {
-		return nil, err
-	}
-	return x, nil
-}
-
-type LLM_PredictStreamClient interface {
-	Recv() (*Reply, error)
-	grpc.ClientStream
-}
-
-type lLMPredictStreamClient struct {
-	grpc.ClientStream
-}
-
-func (x *lLMPredictStreamClient) Recv() (*Reply, error) {
-	m := new(Reply)
-	if err := x.ClientStream.RecvMsg(m); err != nil {
-		return nil, err
-	}
-	return m, nil
-}
-
-func (c *lLMClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
-	out := new(EmbeddingResult)
-	err := c.cc.Invoke(ctx, "/llm.LLM/Embedding", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-// LLMServer is the server API for LLM service.
-// All implementations must embed UnimplementedLLMServer
-// for forward compatibility
-type LLMServer interface {
-	Health(context.Context, *HealthMessage) (*Reply, error)
-	Predict(context.Context, *PredictOptions) (*Reply, error)
-	LoadModel(context.Context, *ModelOptions) (*Result, error)
-	PredictStream(*PredictOptions, LLM_PredictStreamServer) error
-	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
-	mustEmbedUnimplementedLLMServer()
-}
-
-// UnimplementedLLMServer must be embedded to have forward compatible implementations.
-type UnimplementedLLMServer struct {
-}
-
-func (UnimplementedLLMServer) Health(context.Context, *HealthMessage) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
-}
-func (UnimplementedLLMServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
-}
-func (UnimplementedLLMServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
-}
-func (UnimplementedLLMServer) PredictStream(*PredictOptions, LLM_PredictStreamServer) error {
-	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
-}
-func (UnimplementedLLMServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
-}
-func (UnimplementedLLMServer) mustEmbedUnimplementedLLMServer() {}
-
-// UnsafeLLMServer may be embedded to opt out of forward compatibility for this service.
-// Use of this interface is not recommended, as added methods to LLMServer will
-// result in compilation errors.
-type UnsafeLLMServer interface {
-	mustEmbedUnimplementedLLMServer()
-}
-
-func RegisterLLMServer(s grpc.ServiceRegistrar, srv LLMServer) {
-	s.RegisterService(&LLM_ServiceDesc, srv)
-}
-
-func _LLM_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(LLMServer).Health(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/llm.LLM/Health",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(LLMServer).Health(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _LLM_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(LLMServer).Predict(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/llm.LLM/Predict",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(LLMServer).Predict(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _LLM_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(ModelOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(LLMServer).LoadModel(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/llm.LLM/LoadModel",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(LLMServer).LoadModel(ctx, req.(*ModelOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _LLM_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
-	m := new(PredictOptions)
-	if err := stream.RecvMsg(m); err != nil {
-		return err
-	}
-	return srv.(LLMServer).PredictStream(m, &lLMPredictStreamServer{stream})
-}
-
-type LLM_PredictStreamServer interface {
-	Send(*Reply) error
-	grpc.ServerStream
-}
-
-type lLMPredictStreamServer struct {
-	grpc.ServerStream
-}
-
-func (x *lLMPredictStreamServer) Send(m *Reply) error {
-	return x.ServerStream.SendMsg(m)
-}
-
-func _LLM_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(LLMServer).Embedding(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/llm.LLM/Embedding",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(LLMServer).Embedding(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-// LLM_ServiceDesc is the grpc.ServiceDesc for LLM service.
-// It's only intended for direct use with grpc.RegisterService,
-// and not to be introspected or modified (even as a copy)
-var LLM_ServiceDesc = grpc.ServiceDesc{
-	ServiceName: "llm.LLM",
-	HandlerType: (*LLMServer)(nil),
-	Methods: []grpc.MethodDesc{
-		{
-			MethodName: "Health",
-			Handler:    _LLM_Health_Handler,
-		},
-		{
-			MethodName: "Predict",
-			Handler:    _LLM_Predict_Handler,
-		},
-		{
-			MethodName: "LoadModel",
-			Handler:    _LLM_LoadModel_Handler,
-		},
-		{
-			MethodName: "Embedding",
-			Handler:    _LLM_Embedding_Handler,
-		},
-	},
-	Streams: []grpc.StreamDesc{
-		{
-			StreamName:    "PredictStream",
-			Handler:       _LLM_PredictStream_Handler,
-			ServerStreams: true,
-		},
-	},
-	Metadata: "pkg/grpc/proto/llmserver.proto",
-}
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@ -21,7 +21,7 @@ import (

 // server is used to implement helloworld.GreeterServer.
 type server struct {
-	pb.UnimplementedLLMServer
+	pb.UnimplementedBackendServer
 	llm LLM
 }

@ -51,7 +51,48 @@ func (s *server) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.Reply,
 	return &pb.Reply{Message: result}, err
 }

-func (s *server) PredictStream(in *pb.PredictOptions, stream pb.LLM_PredictStreamServer) error {
+func (s *server) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest) (*pb.Result, error) {
+	err := s.llm.GenerateImage(in)
+	if err != nil {
+		return &pb.Result{Message: fmt.Sprintf("Error generating image: %s", err.Error()), Success: false}, err
+	}
+	return &pb.Result{Message: "Image generated", Success: true}, nil
+}
+
+func (s *server) TTS(ctx context.Context, in *pb.TTSRequest) (*pb.Result, error) {
+	err := s.llm.TTS(in)
+	if err != nil {
+		return &pb.Result{Message: fmt.Sprintf("Error generating audio: %s", err.Error()), Success: false}, err
+	}
+	return &pb.Result{Message: "Audio generated", Success: true}, nil
+}
+
+func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest) (*pb.TranscriptResult, error) {
+	result, err := s.llm.AudioTranscription(in)
+	if err != nil {
+		return nil, err
+	}
+	tresult := &pb.TranscriptResult{}
+	for _, s := range result.Segments {
+		tks := []int32{}
+		for _, t := range s.Tokens {
+			tks = append(tks, int32(t))
+		}
+		tresult.Segments = append(tresult.Segments,
+			&pb.TranscriptSegment{
+				Text:   s.Text,
+				Id:     int32(s.Id),
+				Start:  int64(s.Start),
+				End:    int64(s.End),
+				Tokens: tks,
+			})
+	}
+
+	tresult.Text = result.Text
+	return tresult, nil
+}
+
+func (s *server) PredictStream(in *pb.PredictOptions, stream pb.Backend_PredictStreamServer) error {

 	resultChan := make(chan string)

@ -75,7 +116,7 @@ func StartServer(address string, model LLM) error {
 		return err
 	}
 	s := grpc.NewServer()
-	pb.RegisterLLMServer(s, &server{llm: model})
+	pb.RegisterBackendServer(s, &server{llm: model})
 	log.Printf("gRPC Server listening at %v", lis.Addr())
 	if err := s.Serve(lis); err != nil {
 		return err
--- a/pkg/grpc/transcribe/whisper.go
+++ b/pkg/grpc/transcribe/whisper.go
@ -0,0 +1,27 @@
+package transcribe
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	whisperutil "github.com/go-skynet/LocalAI/pkg/grpc/whisper"
+	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
+)
+
+type Whisper struct {
+	base.Base
+	whisper whisper.Model
+}
+
+func (sd *Whisper) Load(opts *pb.ModelOptions) error {
+	// Note: the Model here is a path to a directory containing the model files
+	w, err := whisper.New(opts.Model)
+	sd.whisper = w
+	return err
+}
+
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (api.Result, error) {
+	return whisperutil.Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
+}
--- a/pkg/grpc/tts/piper.go
+++ b/pkg/grpc/tts/piper.go
@ -0,0 +1,44 @@
+package tts
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"os"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	piper "github.com/mudler/go-piper"
+)
+
+type Piper struct {
+	base.Base
+	piper *PiperB
+}
+
+func (sd *Piper) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	sd.piper, err = New(opts.LibrarySearchPath)
+	return err
+}
+
+func (sd *Piper) TTS(opts *pb.TTSRequest) error {
+	return sd.piper.TTS(opts.Text, opts.Model, opts.Dst)
+}
+
+type PiperB struct {
+	assetDir string
+}
+
+func New(assetDir string) (*PiperB, error) {
+	if _, err := os.Stat(assetDir); err != nil {
+		return nil, err
+	}
+	return &PiperB{
+		assetDir: assetDir,
+	}, nil
+}
+
+func (s *PiperB) TTS(text, model, dst string) error {
+	return piper.TextToWav(text, model, s.assetDir, "", dst)
+}
--- a/pkg/grpc/whisper/api/api.go
+++ b/pkg/grpc/whisper/api/api.go
@ -0,0 +1,16 @@
+package api
+
+import "time"
+
+type Segment struct {
+	Id     int           `json:"id"`
+	Start  time.Duration `json:"start"`
+	End    time.Duration `json:"end"`
+	Text   string        `json:"text"`
+	Tokens []int         `json:"tokens"`
+}
+
+type Result struct {
+	Segments []Segment `json:"segments"`
+	Text     string    `json:"text"`
+}
--- a/pkg/grpc/whisper/whisper.go
+++ b/pkg/grpc/whisper/whisper.go
@ -5,25 +5,12 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
-	"time"

 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	wav "github.com/go-audio/wav"
+	"github.com/go-skynet/LocalAI/pkg/grpc/whisper/api"
 )

-type Segment struct {
-	Id int               `json:"id"`
-	Start time.Duration  `json:"start"`
-	End time.Duration    `json:"end"`
-	Text string          `json:"text"`
-	Tokens []int         `json:"tokens"`
-}
-
-type Result struct {
-	Segments []Segment  `json:"segments"`
-	Text string         `json:"text"`
-}
-
 func sh(c string) (string, error) {
 	cmd := exec.Command("/bin/sh", "-c", c)
 	cmd.Env = os.Environ()
@ -42,8 +29,8 @@ func audioToWav(src, dst string) error {
 	return nil
 }

-func Transcript(model whisper.Model, audiopath, language string, threads uint) (Result, error) {
-	res := Result{}
+func Transcript(model whisper.Model, audiopath, language string, threads uint) (api.Result, error) {
+	res := api.Result{}

 	dir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
@ -99,11 +86,11 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
 		}

 		var tokens []int
-		for _, t := range(s.Tokens) {
+		for _, t := range s.Tokens {
 			tokens = append(tokens, t.Id)
 		}

-		segment := Segment{Id: s.Num, Text: s.Text, Start:s.Start, End: s.End, Tokens: tokens}
+		segment := api.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
 		res.Segments = append(res.Segments, segment)

 		res.Text += s.Text
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@ -4,18 +4,13 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"os/signal"
 	"path/filepath"
 	"strings"
+	"syscall"
 	"time"

-	rwkv "github.com/donomii/go-rwkv.cpp"
-	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
-	"github.com/go-skynet/LocalAI/pkg/langchain"
-	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
-	"github.com/go-skynet/LocalAI/pkg/tts"
-	bloomz "github.com/go-skynet/bloomz.cpp"
-	bert "github.com/go-skynet/go-bert.cpp"
 	"github.com/hashicorp/go-multierror"
 	"github.com/hpcloud/tail"
 	"github.com/phayes/freeport"
@ -27,20 +22,22 @@ import (
 const tokenizerSuffix = ".tokenizer.json"

 const (
-	LlamaBackend           = "llama"
-	BloomzBackend          = "bloomz"
-	StarcoderBackend       = "starcoder"
-	GPTJBackend            = "gptj"
-	DollyBackend           = "dolly"
-	MPTBackend             = "mpt"
-	GPTNeoXBackend         = "gptneox"
-	ReplitBackend          = "replit"
-	Gpt2Backend            = "gpt2"
-	Gpt4AllLlamaBackend    = "gpt4all-llama"
-	Gpt4AllMptBackend      = "gpt4all-mpt"
-	Gpt4AllJBackend        = "gpt4all-j"
-	Gpt4All                = "gpt4all"
-	FalconBackend          = "falcon"
+	LlamaBackend        = "llama"
+	BloomzBackend       = "bloomz"
+	StarcoderBackend    = "starcoder"
+	GPTJBackend         = "gptj"
+	DollyBackend        = "dolly"
+	MPTBackend          = "mpt"
+	GPTNeoXBackend      = "gptneox"
+	ReplitBackend       = "replit"
+	Gpt2Backend         = "gpt2"
+	Gpt4AllLlamaBackend = "gpt4all-llama"
+	Gpt4AllMptBackend   = "gpt4all-mpt"
+	Gpt4AllJBackend     = "gpt4all-j"
+	Gpt4All             = "gpt4all"
+	FalconBackend       = "falcon"
+	FalconGGMLBackend   = "falcon-ggml"
+
 	BertEmbeddingsBackend  = "bert-embeddings"
 	RwkvBackend            = "rwkv"
 	WhisperBackend         = "whisper"
@ -54,77 +51,39 @@ var autoLoadBackends []string = []string{
 	LlamaBackend,
 	Gpt4All,
 	RwkvBackend,
+	FalconBackend,
 	WhisperBackend,
-	BertEmbeddingsBackend,
 	GPTNeoXBackend,
+	BertEmbeddingsBackend,
+	FalconGGMLBackend,
 	GPTJBackend,
 	Gpt2Backend,
 	DollyBackend,
 	MPTBackend,
 	ReplitBackend,
 	StarcoderBackend,
-	FalconBackend,
 	BloomzBackend,
 }

-var bertEmbeddings = func(modelFile string) (interface{}, error) {
-	return bert.New(modelFile)
-}
-
-var bloomzLM = func(modelFile string) (interface{}, error) {
-	return bloomz.New(modelFile)
-}
-
-var stableDiffusion = func(assetDir string) (interface{}, error) {
-	return stablediffusion.New(assetDir)
-}
-
-func piperTTS(assetDir string) func(s string) (interface{}, error) {
-	return func(s string) (interface{}, error) {
-		return tts.New(assetDir)
-	}
-}
-
-var whisperModel = func(modelFile string) (interface{}, error) {
-	return whisper.New(modelFile)
-}
-
-var lcHuggingFace = func(repoId string) (interface{}, error) {
-	return langchain.NewHuggingFace(repoId)
-}
-
-// func llamaLM(opts ...llama.ModelOption) func(string) (interface{}, error) {
-// 	return func(s string) (interface{}, error) {
-// 		return llama.New(s, opts...)
-// 	}
-// }
-
-// func gpt4allLM(opts ...gpt4all.ModelOption) func(string) (interface{}, error) {
-// 	return func(s string) (interface{}, error) {
-// 		return gpt4all.New(s, opts...)
-// 	}
-// }
-
-func rwkvLM(tokenFile string, threads uint32) func(string) (interface{}, error) {
-	return func(s string) (interface{}, error) {
-		log.Debug().Msgf("Loading RWKV", s, tokenFile)
-
-		model := rwkv.LoadFiles(s, tokenFile, threads)
-		if model == nil {
-			return nil, fmt.Errorf("could not load model")
-		}
-		return model, nil
+func (ml *ModelLoader) StopGRPC() {
+	for _, p := range ml.grpcProcesses {
+		p.Stop()
 	}
 }

 // starts the grpcModelProcess for the backend, and returns a grpc client
 // It also loads the model
-func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (interface{}, error) {
-	return func(s string) (interface{}, error) {
+func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (*grpc.Client, error) {
+	return func(s string) (*grpc.Client, error) {
 		log.Debug().Msgf("Loading GRPC Model", backend, *o)

 		grpcProcess := filepath.Join(o.assetDir, "backend-assets", "grpc", backend)

+		// Check if the file exists
+		if _, err := os.Stat(grpcProcess); os.IsNotExist(err) {
+			return nil, fmt.Errorf("grpc process not found: %s. some backends(stablediffusion, tts) require LocalAI compiled with GO_TAGS", grpcProcess)
+		}
+
 		// Make sure the process is executable
 		if err := os.Chmod(grpcProcess, 0755); err != nil {
 			return nil, err
@ -151,6 +110,14 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (inter
 			return nil, err
 		}

+		// clean up process
+		go func() {
+			c := make(chan os.Signal, 1)
+			signal.Notify(c, os.Interrupt, syscall.SIGTERM)
+			<-c
+			grpcControlProcess.Stop()
+		}()
+
 		go func() {
 			t, err := tail.TailFile(grpcControlProcess.StderrPath(), tail.Config{Follow: true})
 			if err != nil {
@ -200,7 +167,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (inter

 		log.Debug().Msgf("GRPC: Loading model with options: %+v", options)

-		res, err := client.LoadModel(context.TODO(), &options)
+		res, err := client.LoadModel(o.context, &options)
 		if err != nil {
 			return nil, err
 		}
@ -212,63 +179,37 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string) (inter
 	}
 }

-func (ml *ModelLoader) BackendLoader(opts ...Option) (model interface{}, err error) {
-
-	//backendString string, modelFile string, llamaOpts []llama.ModelOption, threads uint32, assetDir string) (model interface{}, err error) {
-
+func (ml *ModelLoader) BackendLoader(opts ...Option) (model *grpc.Client, err error) {
 	o := NewOptions(opts...)

 	log.Debug().Msgf("Loading model %s from %s", o.backendString, o.modelFile)
-	switch strings.ToLower(o.backendString) {
-	case LlamaBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(LlamaBackend, o))
-	case BloomzBackend:
-		return ml.LoadModel(o.modelFile, bloomzLM)
-	case GPTJBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(GPTJBackend, o))
-	case DollyBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(DollyBackend, o))
-	case MPTBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(MPTBackend, o))
-	case Gpt2Backend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(Gpt2Backend, o))
-	case FalconBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(FalconBackend, o))
-	case GPTNeoXBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(GPTNeoXBackend, o))
-	case ReplitBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(ReplitBackend, o))
-	case StableDiffusionBackend:
-		return ml.LoadModel(o.modelFile, stableDiffusion)
-	case PiperBackend:
-		return ml.LoadModel(o.modelFile, piperTTS(filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")))
-	case StarcoderBackend:
-		return ml.LoadModel(o.modelFile, ml.grpcModel(StarcoderBackend, o))
+
+	backend := strings.ToLower(o.backendString)
+	switch backend {
+	case LlamaBackend, GPTJBackend, DollyBackend,
+		MPTBackend, Gpt2Backend, FalconBackend,
+		GPTNeoXBackend, ReplitBackend, StarcoderBackend, BloomzBackend,
+		RwkvBackend, LCHuggingFaceBackend, BertEmbeddingsBackend, FalconGGMLBackend, StableDiffusionBackend, WhisperBackend:
+		return ml.LoadModel(o.modelFile, ml.grpcModel(backend, o))
 	case Gpt4AllLlamaBackend, Gpt4AllMptBackend, Gpt4AllJBackend, Gpt4All:
 		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "gpt4all")
 		return ml.LoadModel(o.modelFile, ml.grpcModel(Gpt4All, o))
-	//	return ml.LoadModel(o.modelFile, gpt4allLM(gpt4all.SetThreads(int(o.threads)), gpt4all.SetLibrarySearchPath(filepath.Join(o.assetDir, "backend-assets", "gpt4all"))))
-	case BertEmbeddingsBackend:
-		return ml.LoadModel(o.modelFile, bertEmbeddings)
-	case RwkvBackend:
-		return ml.LoadModel(o.modelFile, rwkvLM(filepath.Join(ml.ModelPath, o.modelFile+tokenizerSuffix), o.threads))
-	case WhisperBackend:
-		return ml.LoadModel(o.modelFile, whisperModel)
-	case LCHuggingFaceBackend:
-		return ml.LoadModel(o.modelFile, lcHuggingFace)
+	case PiperBackend:
+		o.gRPCOptions.LibrarySearchPath = filepath.Join(o.assetDir, "backend-assets", "espeak-ng-data")
+		return ml.LoadModel(o.modelFile, ml.grpcModel(PiperBackend, o))
 	default:
 		return nil, fmt.Errorf("backend unsupported: %s", o.backendString)
 	}
 }

-func (ml *ModelLoader) GreedyLoader(opts ...Option) (interface{}, error) {
+func (ml *ModelLoader) GreedyLoader(opts ...Option) (*grpc.Client, error) {
 	o := NewOptions(opts...)

 	log.Debug().Msgf("Loading model '%s' greedly", o.modelFile)

+	// Is this really needed? BackendLoader already does this
 	ml.mu.Lock()
-	m, exists := ml.models[o.modelFile]
-	if exists {
+	if m := ml.checkIsLoaded(o.modelFile); m != nil {
 		log.Debug().Msgf("Model '%s' already loaded", o.modelFile)
 		ml.mu.Unlock()
 		return m, nil
@ -285,7 +226,7 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (interface{}, error) {
 		model, modelerr := ml.BackendLoader(
 			WithBackendString(b),
 			WithModelFile(o.modelFile),
-			WithLoadGRPCOpts(o.gRPCOptions),
+			WithLoadGRPCLLMModelOpts(o.gRPCOptions),
 			WithThreads(o.threads),
 			WithAssetDir(o.assetDir),
 		)
--- a/pkg/model/loader.go
+++ b/pkg/model/loader.go
@ -2,6 +2,7 @@ package model

 import (
 	"bytes"
+	"context"
 	"fmt"
 	"io/ioutil"
 	"os"
@ -10,6 +11,7 @@ import (
 	"sync"
 	"text/template"

+	"github.com/go-skynet/LocalAI/pkg/grpc"
 	process "github.com/mudler/go-processmanager"
 	"github.com/rs/zerolog/log"
 )
@ -18,7 +20,7 @@ type ModelLoader struct {
 	ModelPath string
 	mu        sync.Mutex
 	// TODO: this needs generics
-	models           map[string]interface{}
+	models           map[string]*grpc.Client
 	grpcProcesses    map[string]*process.Process
 	promptsTemplates map[string]*template.Template
 }
@ -26,7 +28,7 @@ type ModelLoader struct {
 func NewModelLoader(modelPath string) *ModelLoader {
 	return &ModelLoader{
 		ModelPath:        modelPath,
-		models:           make(map[string]interface{}),
+		models:           make(map[string]*grpc.Client),
 		promptsTemplates: make(map[string]*template.Template),
 		grpcProcesses:    make(map[string]*process.Process),
 	}
@ -113,14 +115,14 @@ func (ml *ModelLoader) loadTemplateIfExists(modelName, modelFile string) error {
 	return nil
 }

-func (ml *ModelLoader) LoadModel(modelName string, loader func(string) (interface{}, error)) (interface{}, error) {
+func (ml *ModelLoader) LoadModel(modelName string, loader func(string) (*grpc.Client, error)) (*grpc.Client, error) {
 	ml.mu.Lock()
 	defer ml.mu.Unlock()

 	// Check if we already have a loaded model
-	if m, ok := ml.models[modelName]; ok {
+	if model := ml.checkIsLoaded(modelName); model != nil {
 		log.Debug().Msgf("Model already loaded in memory: %s", modelName)
-		return m, nil
+		return model, nil
 	}

 	// Load the model and keep it in memory for later use
@ -140,3 +142,25 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string) (interfac
 	ml.models[modelName] = model
 	return model, nil
 }
+
+func (ml *ModelLoader) checkIsLoaded(s string) *grpc.Client {
+	if m, ok := ml.models[s]; ok {
+		log.Debug().Msgf("Model already loaded in memory: %s", s)
+
+		if !m.HealthCheck(context.Background()) {
+			log.Debug().Msgf("GRPC Model not responding", s)
+			if !ml.grpcProcesses[s].IsAlive() {
+				log.Debug().Msgf("GRPC Process is not responding", s)
+				// stop and delete the process, this forces to re-load the model and re-create again the service
+				ml.grpcProcesses[s].Stop()
+				delete(ml.grpcProcesses, s)
+				delete(ml.models, s)
+				return nil
+			}
+		}
+
+		return m
+	}
+
+	return nil
+}
--- a/pkg/model/options.go
+++ b/pkg/model/options.go
@ -1,6 +1,8 @@
 package model

 import (
+	"context"
+
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 )

@ -9,6 +11,7 @@ type Options struct {
 	modelFile     string
 	threads       uint32
 	assetDir      string
+	context       context.Context

 	gRPCOptions *pb.ModelOptions
 }
@ -27,7 +30,7 @@ func WithModelFile(modelFile string) Option {
 	}
 }

-func WithLoadGRPCOpts(opts *pb.ModelOptions) Option {
+func WithLoadGRPCLLMModelOpts(opts *pb.ModelOptions) Option {
 	return func(o *Options) {
 		o.gRPCOptions = opts
 	}
@ -45,8 +48,17 @@ func WithAssetDir(assetDir string) Option {
 	}
 }

+func WithContext(ctx context.Context) Option {
+	return func(o *Options) {
+		o.context = ctx
+	}
+}
+
 func NewOptions(opts ...Option) *Options {
-	o := &Options{}
+	o := &Options{
+		gRPCOptions: &pb.ModelOptions{},
+		context:     context.Background(),
+	}
 	for _, opt := range opts {
 		opt(o)
 	}
--- a/pkg/tts/generate.go
+++ b/pkg/tts/generate.go
@ -1,12 +0,0 @@
-//go:build tts
-// +build tts
-
-package tts
-
-import (
-	piper "github.com/mudler/go-piper"
-)
-
-func tts(text, model, assetDir, arLib, dst string) error {
-	return piper.TextToWav(text, model, assetDir, arLib, dst)
-}
--- a/pkg/tts/generate_unsupported.go
+++ b/pkg/tts/generate_unsupported.go
@ -1,10 +0,0 @@
-//go:build !tts
-// +build !tts
-
-package tts
-
-import "fmt"
-
-func tts(text, model, assetDir, arLib, dst string) error {
-	return fmt.Errorf("this version of LocalAI was built without the tts tag")
-}
--- a/pkg/tts/piper.go
+++ b/pkg/tts/piper.go
@ -1,20 +0,0 @@
-package tts
-
-import "os"
-
-type Piper struct {
-	assetDir string
-}
-
-func New(assetDir string) (*Piper, error) {
-	if _, err := os.Stat(assetDir); err != nil {
-		return nil, err
-	}
-	return &Piper{
-		assetDir: assetDir,
-	}, nil
-}
-
-func (s *Piper) TTS(text, model, dst string) error {
-	return tts(text, model, s.assetDir, "", dst)
-}