feat: Add UseFastTokenizer

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-05-25 13:04:59 +00:00 · 2023-08-08 01:10:05 +02:00 · 2023-08-08 01:10:05 +02:00 · 3c8fc37c56
commit 3c8fc37c56
parent 39805b09e5
10 changed files with 198 additions and 169 deletions
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -15,26 +15,27 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		b = c.Batch
 	}
 	return &pb.ModelOptions{
-		ContextSize:   int32(c.ContextSize),
-		Seed:          int32(c.Seed),
-		NBatch:        int32(b),
-		NGQA:          c.NGQA,
-		ModelBaseName: c.ModelBaseName,
-		Device:        c.Device,
-		UseTriton:     c.Triton,
-		RMSNormEps:    c.RMSNormEps,
-		F16Memory:     c.F16,
-		MLock:         c.MMlock,
-		RopeFreqBase:  c.RopeFreqBase,
-		RopeFreqScale: c.RopeFreqScale,
-		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
-		LowVRAM:       c.LowVRAM,
-		NGPULayers:    int32(c.NGPULayers),
-		MMap:          c.MMap,
-		MainGPU:       c.MainGPU,
-		Threads:       int32(c.Threads),
-		TensorSplit:   c.TensorSplit,
+		ContextSize:      int32(c.ContextSize),
+		Seed:             int32(c.Seed),
+		NBatch:           int32(b),
+		NGQA:             c.NGQA,
+		ModelBaseName:    c.ModelBaseName,
+		UseFastTokenizer: c.UseFastTokenizer,
+		Device:           c.Device,
+		UseTriton:        c.Triton,
+		RMSNormEps:       c.RMSNormEps,
+		F16Memory:        c.F16,
+		MLock:            c.MMlock,
+		RopeFreqBase:     c.RopeFreqBase,
+		RopeFreqScale:    c.RopeFreqScale,
+		NUMA:             c.NUMA,
+		Embeddings:       c.Embeddings,
+		LowVRAM:          c.LowVRAM,
+		NGPULayers:       int32(c.NGPULayers),
+		MMap:             c.MMap,
+		MainGPU:          c.MainGPU,
+		Threads:          int32(c.Threads),
+		TensorSplit:      c.TensorSplit,
 	}
 }

--- a/api/config/config.go
+++ b/api/config/config.go
@@ -56,9 +56,10 @@ type Config struct {
 	NGQA       int32   `yaml:"ngqa"`

 	// AutoGPTQ
-	ModelBaseName string `yaml:"model_base_name"`
-	Device        string `yaml:"device"`
-	Triton        bool   `yaml:"triton"`
+	ModelBaseName    string `yaml:"model_base_name"`
+	Device           string `yaml:"device"`
+	Triton           bool   `yaml:"triton"`
+	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }

 type Functions struct {
--- a/api/config/prediction.go
+++ b/api/config/prediction.go
@@ -39,4 +39,6 @@ type PredictionOptions struct {
 	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
 	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
 	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
+	// AutoGPTQ
+	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
 }
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -83,6 +83,10 @@ func updateConfig(config *config.Config, input *OpenAIRequest) {
 		config.NegativePromptScale = input.NegativePromptScale
 	}

+	if input.UseFastTokenizer {
+		config.UseFastTokenizer = input.UseFastTokenizer
+	}
+
 	if input.NegativePrompt != "" {
 		config.NegativePrompt = input.NegativePrompt
 	}