feat(llama.cpp): expose cache_type_k and cache_type_v for quant of kv cache (#4329)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-12-06 10:23:59 +01:00 committed by GitHub
parent 88737e1d76
commit d4c1746c7d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 15 additions and 2 deletions

View file

@ -155,8 +155,10 @@ type LLMConfig struct {
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
MMProj string `yaml:"mmproj"`
FlashAttention bool `yaml:"flash_attention"`
NoKVOffloading bool `yaml:"no_kv_offloading"`
FlashAttention bool `yaml:"flash_attention"`
NoKVOffloading bool `yaml:"no_kv_offloading"`
CacheTypeK string `yaml:"cache_type_k"`
CacheTypeV string `yaml:"cache_type_v"`
RopeScaling string `yaml:"rope_scaling"`
ModelType string `yaml:"type"`