transformers: correctly load automodels (#1643)

* backends(transformers): use AutoModel with LLM types

* examples: animagine-xl

* Add codellama examples
Ettore Di Giacinto, 2024-01-26 00:13:21 +01:00 (committed by GitHub)
parent 3733250b3c
commit cb7512734d
27 changed files with 1144 additions and 569 deletions


@@ -0,0 +1,17 @@
name: animagine-xl
parameters:
  model: Linaqruf/animagine-xl
backend: diffusers
f16: true
diffusers:
  scheduler_type: euler_a
usage: |
  curl http://localhost:8080/v1/images/generations \
    -H "Content-Type: application/json" \
    -d '{
      "prompt": "<positive prompt>|<negative prompt>",
      "model": "animagine-xl",
      "step": 51,
      "size": "1024x1024"
    }'
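For reference, the same request from Python: a minimal sketch assuming a LocalAI instance serving this config on localhost:8080. The positive|negative prompt pair below is only illustrative.

import requests  # third-party: pip install requests

# Same payload as the curl in the usage block; the prompt pair is an example only.
response = requests.post(
    "http://localhost:8080/v1/images/generations",
    json={
        "prompt": "1girl, green hair, outdoors|lowres, bad anatomy",
        "model": "animagine-xl",
        "step": 51,
        "size": "1024x1024",
    },
    timeout=600,  # diffusion at 1024x1024 can take a while
)
response.raise_for_status()
print(response.json())  # contains the generated image URL/data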


@@ -0,0 +1,16 @@
name: codellama-7b-gguf
backend: transformers
parameters:
  model: huggingface://TheBloke/CodeLlama-7B-GGUF/codellama-7b.Q4_K_M.gguf
  temperature: 0.2
  top_k: 40
  seed: -1
  top_p: 0.95
context_size: 4096
f16: true
gpu_layers: 90
usage: |
  curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
    "model": "codellama-7b-gguf",
    "prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
  }'
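The same completion request from Python, a minimal sketch assuming a LocalAI instance serving this model on localhost:8080:

import requests  # third-party: pip install requests

response = requests.post(
    "http://localhost:8080/v1/completions",
    json={
        "model": "codellama-7b-gguf",
        "prompt": "import socket\n\ndef ping_exponential_backoff(host: str):",
    },
    timeout=600,
)
print(response.json()["choices"][0]["text"])  # the generated continuation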


@@ -0,0 +1,14 @@
name: codellama-7b
backend: transformers
parameters:
  model: codellama/CodeLlama-7b-hf
  temperature: 0.2
  top_k: 40
  seed: -1
  top_p: 0.95
usage: |
  curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
    "model": "codellama-7b",
    "prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
  }'
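Since the endpoint is OpenAI-compatible, the official openai Python client (v1+) also works. A sketch assuming LocalAI on localhost:8080; LocalAI ignores the API key, but the client requires a value:

from openai import OpenAI  # third-party: pip install openai>=1.0

# api_key is required by the client but not checked by LocalAI by default
client = OpenAI(base_url="http://localhost:8080/v1", api_key="unused")
completion = client.completions.create(
    model="codellama-7b",
    prompt="import socket\n\ndef ping_exponential_backoff(host: str):",
)
print(completion.choices[0].text)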


@@ -1,7 +1,7 @@
name: dolphin-mixtral-8x7b
mmap: true
parameters:
-  model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/blob/main/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
+  model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
  temperature: 0.2
  top_k: 40
  top_p: 0.95
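(The dropped /blob/main/ segment is part of the Hugging Face web-page URL, not of the file path inside the repository; a huggingface:// URI takes owner/repo/file, so the old value referenced the HTML page rather than the raw GGUF file.)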


@@ -0,0 +1,32 @@
name: tinyllama-chat
backend: transformers
type: AutoModelForCausalLM
parameters:
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
  temperature: 0.2
  top_k: 40
  seed: -1
  top_p: 0.95
  max_tokens: 4096
template:
  chat_message: |
    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
    {{if .Content}}{{.Content}}{{end}}<|im_end|>
  chat: |
    {{.Input}}
    <|im_start|>assistant
  completion: |
    {{.Input}}
stopwords:
- <|im_end|>
usage: |
  curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
    "model": "tinyllama-chat",
    "messages": [{"role": "user", "content": "Say this is a test!"}],
    "temperature": 0.7
  }'
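And the chat request from the usage block above in Python, a minimal sketch assuming LocalAI on localhost:8080:

import requests  # third-party: pip install requests

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "tinyllama-chat",
        "messages": [{"role": "user", "content": "Say this is a test!"}],
        "temperature": 0.7,
    },
    timeout=600,
)
print(response.json()["choices"][0]["message"]["content"])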