feat(llama.cpp): expose cache_type_k and cache_type_v for KV cache quantization (#4329)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-12-06 10:23:59 +01:00 committed by GitHub
parent 88737e1d76
commit d4c1746c7d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 15 additions and 2 deletions

View file

@@ -151,6 +151,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
TensorParallelSize: int32(c.TensorParallelSize),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
NoKVOffload: c.NoKVOffloading,
YarnExtFactor: c.YarnExtFactor,
YarnAttnFactor: c.YarnAttnFactor,