feat(llama.cpp): add flash_attention and no_kv_offloading (#2310)

feat(llama.cpp): add flash_attn and no_kv_offload

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto 2024-05-13 19:07:51 +02:00 committed by GitHub
parent 7123d07456
commit e49ea0123b
4 changed files with 11 additions and 0 deletions

@@ -132,6 +132,9 @@ type LLMConfig struct {
 	TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
 	MMProj string `yaml:"mmproj"`
+	FlashAttention bool `yaml:"flash_attention"`
+	NoKVOffloading bool `yaml:"no_kv_offloading"`
 	RopeScaling string `yaml:"rope_scaling"`
 	ModelType string `yaml:"type"`