mirror of
https://github.com/mudler/LocalAI.git
synced 2025-06-01 08:25:00 +00:00
models(gallery): add hermes-3-llama-3.1(8B,70B,405B) with vLLM (#3360)
models(gallery): add hermes-3-llama-3.1 with vLLM it adds 8b, 70b and 405b to the gallery Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
fbaae8528d
commit
a913fd310d
3 changed files with 152 additions and 0 deletions
91
gallery/hermes-vllm.yaml
Normal file
91
gallery/hermes-vllm.yaml
Normal file
|
@ -0,0 +1,91 @@
|
|||
---
|
||||
name: "hermes-vllm"
|
||||
|
||||
config_file: |
|
||||
backend: vllm
|
||||
context_size: 8192
|
||||
stopwords:
|
||||
- "<|im_end|>"
|
||||
- "<dummy32000>"
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
function:
|
||||
disable_no_action: true
|
||||
grammar:
|
||||
# Uncomment the line below to enable grammar matching for JSON results if the model is breaking
|
||||
# the output. This will make the model more accurate and won't break the JSON output.
|
||||
# This however, will make parallel_calls not functional (it is a known bug)
|
||||
# mixed_mode: true
|
||||
disable: true
|
||||
parallel_calls: true
|
||||
expect_strings_after_json: true
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*)"
|
||||
capture_llm_results:
|
||||
- (?s)<scratchpad>(.*?)</scratchpad>
|
||||
replace_llm_results:
|
||||
- key: (?s)<scratchpad>(.*?)</scratchpad>
|
||||
value: ""
|
||||
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat_message: |
|
||||
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
|
||||
{{- if .FunctionCall }}
|
||||
<tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
<tool_response>
|
||||
{{- end }}
|
||||
{{- if .Content}}
|
||||
{{.Content }}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{- end }}
|
||||
{{- if .FunctionCall }}
|
||||
</tool_call>
|
||||
{{- else if eq .RoleName "tool" }}
|
||||
</tool_response>
|
||||
{{- end }}<|im_end|>
|
||||
completion: |
|
||||
{{.Input}}
|
||||
function: |
|
||||
<|im_start|>system
|
||||
You are a function calling AI model.
|
||||
Here are the available tools:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
You should call the tools provided to you sequentially
|
||||
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
|
||||
<scratchpad>
|
||||
{step-by-step reasoning and plan in bullet points}
|
||||
</scratchpad>
|
||||
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
|
||||
<tool_call>
|
||||
{"arguments": <args-dict>, "name": <function-name>}
|
||||
</tool_call><|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
# Uncomment to specify a quantization method (optional)
|
||||
# quantization: "awq"
|
||||
# Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
|
||||
# gpu_memory_utilization: 0.5
|
||||
# Uncomment to trust remote code from huggingface
|
||||
# trust_remote_code: true
|
||||
# Uncomment to enable eager execution
|
||||
# enforce_eager: true
|
||||
# Uncomment to specify the size of the CPU swap space per GPU (in GiB)
|
||||
# swap_space: 2
|
||||
# Uncomment to specify the maximum length of a sequence (including prompt and output)
|
||||
# max_model_len: 32768
|
||||
# Uncomment and specify the number of Tensor divisions.
|
||||
# Allows you to partition and run large models. Performance gains are limited.
|
||||
# https://github.com/vllm-project/vllm/issues/1435
|
||||
# tensor_parallel_size: 2
|
Loading…
Add table
Add a link
Reference in a new issue