Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-30 23:44:59 +00:00
models(gallery): add hermes-3-llama-3.1(8B,70B,405B) with vLLM (#3360)
models(gallery): add hermes-3-llama-3.1 with vLLM. This adds the 8b, 70b and 405b variants to the gallery.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
parent fbaae8528d
commit a913fd310d

3 changed files with 152 additions and 0 deletions
gallery/vllm.yaml (new file, +29)
@@ -0,0 +1,29 @@
---
name: "vllm"

config_file: |
  backend: vllm
  function:
    disable_no_action: true
    grammar:
      disable: true
    parallel_calls: true
    expect_strings_after_json: true
  template:
    use_tokenizer_template: true
  # Uncomment to specify a quantization method (optional)
  # quantization: "awq"
  # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
  # gpu_memory_utilization: 0.5
  # Uncomment to trust remote code from huggingface
  # trust_remote_code: true
  # Uncomment to enable eager execution
  # enforce_eager: true
  # Uncomment to specify the size of the CPU swap space per GPU (in GiB)
  # swap_space: 2
  # Uncomment to specify the maximum length of a sequence (including prompt and output)
  # max_model_len: 32768
  # Uncomment and specify the number of Tensor divisions.
  # Allows you to partition and run large models. Performance gains are limited.
  # https://github.com/vllm-project/vllm/issues/1435
  # tensor_parallel_size: 2
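For context on how this base file is consumed: gallery entries point at a shared template like this one by URL and then override per-model values. A minimal sketch of such an entry follows; the anchor, entry name, license and override values are illustrative assumptions, not the literal index changes in this commit.

- &hermes-vllm                                            # illustrative anchor name
  url: "github:mudler/LocalAI/gallery/vllm.yaml@master"   # reuse the shared vLLM template above
  name: "hermes-3-llama-3.1-8b-vllm"                      # assumed entry name
  license: llama3.1
  overrides:
    parameters:
      model: "NousResearch/Hermes-3-Llama-3.1-8B"         # HF repo for the vLLM backend to load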
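Once an entry like the sketch above is in the gallery index, it can be installed at runtime through LocalAI's model-apply endpoint. A usage sketch, assuming a local instance on port 8080 and that the gallery id matches the assumed entry name:

curl http://localhost:8080/models/apply -H "Content-Type: application/json" \
  -d '{ "id": "localai@hermes-3-llama-3.1-8b-vllm" }'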