feat(llama.cpp): add flash_attention and no_kv_offloading (#2310)

feat(llama.cpp): add flash_attn and no_kv_offload

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto 2024-05-13 19:07:51 +02:00 committed by GitHub
parent 7123d07456
commit e49ea0123b
4 changed files with 11 additions and 0 deletions

@@ -77,6 +77,8 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		MaxModelLen:        int32(c.MaxModelLen),
 		TensorParallelSize: int32(c.TensorParallelSize),
 		MMProj:             c.MMProj,
+		FlashAttention:     c.FlashAttention,
+		NoKVOffload:        c.NoKVOffloading,
 		YarnExtFactor:      c.YarnExtFactor,
 		YarnAttnFactor:     c.YarnAttnFactor,
 		YarnBetaFast:       c.YarnBetaFast,
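
For context, a minimal sketch of how these new options might be switched on in a model definition, assuming the YAML keys flash_attention and no_kv_offloading map onto the FlashAttention and NoKVOffloading fields of config.BackendConfig read in the hunk above (the model name and file are placeholders):

    # example model config (key names assumed from the config fields in this diff)
    name: my-llama-model
    backend: llama-cpp
    parameters:
      model: model.gguf
    flash_attention: true      # forwarded to the llama.cpp backend as FlashAttention
    no_kv_offloading: true     # forwarded as NoKVOffload, keeping the KV cache off the GPU

Both flags default to false, so existing configurations keep the previous behavior unless they opt in.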