feat(speculative-sampling): allow to specify a draft model in the model config (#1052)

**Description**

This PR fixes #1013.

It adds `draft_model` and `n_draft` to the model YAML config in order to
load models with speculative sampling. This should also be compatible
with grammars.

example:

```yaml
backend: llama
context_size: 1024
name: my-model-name
parameters:
  model: foo-bar
n_draft: 16
draft_model: model-name
```

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2023-09-14 17:44:16 +02:00 committed by GitHub
parent 247d85b523
commit 8ccf5b2044
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 485 additions and 427 deletions

View file

@ -42,6 +42,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
@ -79,6 +80,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
return &pb.PredictOptions{
Temperature: float32(c.Temperature),
TopP: float32(c.TopP),
NDraft: c.NDraft,
TopK: int32(c.TopK),
Tokens: int32(c.Maxtokens),
Threads: int32(c.Threads),