feat(speculative-sampling): allow to specify a draft model in the model config (#1052)

**Description**

This PR fixes #1013.

It adds `draft_model` and `n_draft` to the model YAML config in order to
load models with speculative sampling. This should also be compatible
with grammars.

example:

```yaml
backend: llama
context_size: 1024
name: my-model-name
parameters:
  model: foo-bar
n_draft: 16
draft_model: model-name
```

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2023-09-14 17:44:16 +02:00 committed by GitHub
parent 247d85b523
commit 8ccf5b2044
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 485 additions and 427 deletions

View file

@ -42,6 +42,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
Seed: int32(c.Seed),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,
DraftModel: c.DraftModel,
AudioPath: c.VallE.AudioPath,
LoraAdapter: c.LoraAdapter,
LoraBase: c.LoraBase,
@ -79,6 +80,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
return &pb.PredictOptions{
Temperature: float32(c.Temperature),
TopP: float32(c.TopP),
NDraft: c.NDraft,
TopK: int32(c.TopK),
Tokens: int32(c.Maxtokens),
Threads: int32(c.Threads),