From ec21b58008381accb28b20b03c0bb9f6526e1800 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 20 May 2025 11:15:09 +0200
Subject: [PATCH] chore(model gallery): add smolvlm-256m-instruct (#5412)

Signed-off-by: Ettore Di Giacinto
---
 gallery/index.yaml | 30 ++++++++++++++++++++++++++++++
 gallery/smolvlm.yaml | 19 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 gallery/smolvlm.yaml

diff --git a/gallery/index.yaml b/gallery/index.yaml
index 725b86bd..5553d790 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,34 @@
 ---
+- &smolvlm
+  url: "github:mudler/LocalAI/gallery/smolvlm.yaml@master"
+  name: "smolvlm-256m-instruct"
+  icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM_256_banner.png
+  urls:
+    - https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct
+    - https://huggingface.co/ggml-org/SmolVLM-256M-Instruct-GGUF
+  license: apache-2.0
+  description: |
+    SmolVLM-256M is the smallest multimodal model in the world. It accepts arbitrary sequences of image and text inputs to produce text outputs. It's designed for efficiency. SmolVLM can answer questions about images, describe visual content, or transcribe text. Its lightweight architecture makes it suitable for on-device applications while maintaining strong performance on multimodal tasks. It can run inference on one image with under 1GB of GPU RAM.
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - cpu
+    - vision
+    - multimodal
+    - smolvlm
+    - image-to-text
+  overrides:
+    parameters:
+      model: SmolVLM-256M-Instruct-Q8_0.gguf
+      mmproj: mmproj-SmolVLM-256M-Instruct-Q8_0.gguf
+  files:
+    - filename: mmproj-SmolVLM-256M-Instruct-Q8_0.gguf
+      sha256: 7e943f7c53f0382a6fc41b6ee0c2def63ba4fded9ab8ed039cc9e2ab905e0edd
+      uri: huggingface://ggml-org/SmolVLM-256M-Instruct-GGUF/mmproj-SmolVLM-256M-Instruct-Q8_0.gguf
+    - filename: SmolVLM-256M-Instruct-Q8_0.gguf
+      sha256: 2a31195d3769c0b0fd0a4906201666108834848db768af11de1d2cef7cd35e65
+      uri: huggingface://ggml-org/SmolVLM-256M-Instruct-GGUF/SmolVLM-256M-Instruct-Q8_0.gguf
 - &qwen3
   url: "github:mudler/LocalAI/gallery/qwen3.yaml@master"
   name: "qwen3-30b-a3b"
diff --git a/gallery/smolvlm.yaml b/gallery/smolvlm.yaml
new file mode 100644
index 00000000..2c4ef47e
--- /dev/null
+++ b/gallery/smolvlm.yaml
@@ -0,0 +1,19 @@
+---
+name: smolvlm
+# yamllint disable-line rule:trailing-spaces
+config_file: |
+  mmap: true
+  template:
+    chat_message: |
+      {{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }}
+    chat: "<|im_start|>\n{{.Input -}}\nAssistant: "
+    completion: |
+      {{-.Input}}
+  f16: true
+  stopwords:
+    - '<|im_end|>'
+    - ''
+    - ''
+    - '<|'
+    - ''
+    - '<|endoftext|>'