From f8fbfd4fa36d27b0267fb2414076f78e9ba12f84 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Mon, 19 May 2025 17:31:38 +0200 Subject: [PATCH 01/12] chore(model gallery): add a-m-team_am-thinking-v1 (#5395) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index d02288dc..2a8bb2c3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -7282,6 +7282,30 @@ - filename: mmproj-Qwen_Qwen2.5-VL-72B-Instruct-f16.gguf sha256: 6099885b9c4056e24806b616401ff2730a7354335e6f2f0eaf2a45e89c8a457c uri: https://huggingface.co/bartowski/Qwen_Qwen2.5-VL-72B-Instruct-GGUF/resolve/main/mmproj-Qwen_Qwen2.5-VL-72B-Instruct-f16.gguf +- !!merge <<: *qwen25 + name: "a-m-team_am-thinking-v1" + icon: https://cdn-avatars.huggingface.co/v1/production/uploads/62da53284398e21bf7f0d539/y6wX4K-P9O8B9frsxxQ6W.jpeg + urls: + - https://huggingface.co/a-m-team/AM-Thinking-v1 + - https://huggingface.co/bartowski/a-m-team_AM-Thinking-v1-GGUF + description: | + AM-Thinking‑v1, a 32B dense language model focused on enhancing reasoning capabilities. Built on Qwen 2.5‑32B‑Base, AM-Thinking‑v1 shows strong performance on reasoning benchmarks, comparable to much larger MoE models like DeepSeek‑R1, Qwen3‑235B‑A22B, Seed1.5-Thinking, and larger dense model like Nemotron-Ultra-253B-v1. + benchmark + 🧩 Why Another 32B Reasoning Model Matters? + + Large Mixture‑of‑Experts (MoE) models such as DeepSeek‑R1 or Qwen3‑235B‑A22B dominate leaderboards—but they also demand clusters of high‑end GPUs. Many teams just need the best dense model that fits on a single card. AM‑Thinking‑v1 fills that gap while remaining fully based on open-source components: + + Outperforms DeepSeek‑R1 on AIME’24/’25 & LiveCodeBench and approaches Qwen3‑235B‑A22B despite being 1/7‑th the parameter count. + Built on the publicly available Qwen 2.5‑32B‑Base, as well as the RL training queries. + Shows that with a well‑designed post‑training pipeline ( SFT + dual‑stage RL ) you can squeeze flagship‑level reasoning out of a 32 B dense model. + Deploys on one A100‑80 GB with deterministic latency—no MoE routing overhead. 
+ overrides: + parameters: + model: a-m-team_AM-Thinking-v1-Q4_K_M.gguf + files: + - filename: a-m-team_AM-Thinking-v1-Q4_K_M.gguf + sha256: a6da6e8d330d76167c04a54eeb550668b59b613ea53af22e3b4a0c6da271e38d + uri: huggingface://bartowski/a-m-team_AM-Thinking-v1-GGUF/a-m-team_AM-Thinking-v1-Q4_K_M.gguf - &llama31 url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1 icon: https://avatars.githubusercontent.com/u/153379578 From f2942cc0e196a50cf76c8bffecb0fdd75d18128c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 10:28:27 +0200 Subject: [PATCH 02/12] chore(model gallery): add thedrummer_valkyrie-49b-v1 (#5410) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 2a8bb2c3..8443a8b0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2568,6 +2568,39 @@ - filename: L3.3-Genetic-Lemonade-Sunset-70B.Q4_K_M.gguf sha256: 743c11180c0c9168c0fe31a97f9d2efe0dd749c2797d749821fcb1d6932c19f7 uri: huggingface://mradermacher/L3.3-Genetic-Lemonade-Sunset-70B-GGUF/L3.3-Genetic-Lemonade-Sunset-70B.Q4_K_M.gguf +- !!merge <<: *llama33 + name: "thedrummer_valkyrie-49b-v1" + icon: https://cdn-uploads.huggingface.co/production/uploads/65f2fd1c25b848bd061b5c2e/8I-AvB0bFSoEcxlLU7dtY.png + urls: + - https://huggingface.co/TheDrummer/Valkyrie-49B-v1 + - https://huggingface.co/bartowski/TheDrummer_Valkyrie-49B-v1-GGUF + description: | + it swears unprompted 10/10 model + + ... characters work well, groups work well, scenarios also work really well so great model overall + + This is pretty exciting though. GLM-4 already had me on the verge of deleting all of my other 32b and lower models. I got to test this more but I think this model at Q3m is the death blow lol + + Smart Nemotron 49b learned how to roleplay + + Even without thinking it rock solid at 4qm. + + Without thinking is like 40-70b level. With thinking is 100+b level + + This model would have been AGI if it were named properly with a name like "Bob". Alas, it was not. + + I think this model is nice. It follows prompts very well. I didn't really note any major issues or repetition + + Yeah this is good. I think its clearly smart enough, close to the other L3.3 70b models. It follows directions and formatting very well. I asked it to create the intro message, my first response was formatted differently, and it immediately followed my format on the second message. I also have max tokens at 2k cause I like the model to finish it's thought. But I started trimming the models responses when I felt the last bit was unnecessary and it started replying closer to that length. It's pretty much uncensored. + + Nemotron is my favorite model, and I think you fixed it!! 
+ overrides: + parameters: + model: TheDrummer_Valkyrie-49B-v1-Q4_K_M.gguf + files: + - filename: TheDrummer_Valkyrie-49B-v1-Q4_K_M.gguf + sha256: f50be1eef41e0da2cb59e4b238f4f178ee1000833270b337f97f91572c31b752 + uri: huggingface://bartowski/TheDrummer_Valkyrie-49B-v1-GGUF/TheDrummer_Valkyrie-49B-v1-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" From 996259b5296acbd134c7d4406bfc1e28f233c11b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 10:31:09 +0200 Subject: [PATCH 03/12] chore(model gallery): add facebook_kernelllm (#5411) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 8443a8b0..725b86bd 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -9564,6 +9564,25 @@ - filename: nvidia_Llama-3.1-8B-UltraLong-4M-Instruct-Q4_K_M.gguf sha256: c503c77c6d8cc4be53ce7cddb756cb571862f0422594c17e58a75d7be9f00907 uri: huggingface://bartowski/nvidia_Llama-3.1-8B-UltraLong-4M-Instruct-GGUF/nvidia_Llama-3.1-8B-UltraLong-4M-Instruct-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "facebook_kernelllm" + icon: https://cdn-avatars.huggingface.co/v1/production/uploads/1592839207516-noauth.png + urls: + - https://huggingface.co/facebook/KernelLLM + - https://huggingface.co/bartowski/facebook_KernelLLM-GGUF + description: | + We introduce KernelLLM, a large language model based on Llama 3.1 Instruct, which has been trained specifically for the task of authoring GPU kernels using Triton. KernelLLM translates PyTorch modules into Triton kernels and was evaluated on KernelBench-Triton (see here). KernelLLM aims to democratize GPU programming by making kernel development more accessible and efficient. + KernelLLM's vision is to meet the growing demand for high-performance GPU kernels by automating the generation of efficient Triton implementations. As workloads grow larger and more diverse accelerator architectures emerge, the need for tailored kernel solutions has increased significantly. Although a number of works exist, most of them are limited to test-time optimization, while others tune on solutions traced of KernelBench problems itself, thereby limiting the informativeness of the results towards out-of-distribution generalization. To the best of our knowledge KernelLLM is the first LLM finetuned on external (torch, triton) pairs, and we hope that making our model available can accelerate progress towards intelligent kernel authoring systems. + KernelLLM Workflow for Triton Kernel Generation: Our approach uses KernelLLM to translate PyTorch code (green) into Triton kernel candidates. Input and output components are marked in bold. The generations are validated against unit tests, which run kernels with random inputs of known shapes. This workflow allows us to evaluate multiple generations (pass@k) by increasing the number of kernel candidate generations. The best kernel implementation is selected and returned (green output). + The model was trained on approximately 25,000 paired examples of PyTorch modules and their equivalent Triton kernel implementations, and additional synthetically generated samples. Our approach combines filtered code from TheStack [Kocetkov et al. 2022] and synthetic examples generated through torch.compile() and additional prompting techniques. The filtered and compiled dataset is [KernelBook]](https://huggingface.co/datasets/GPUMODE/KernelBook). 
+ We finetuned Llama3.1-8B-Instruct on the created dataset using supervised instruction tuning and measured its ability to generate correct Triton kernels and corresponding calling code on KernelBench-Triton, our newly created variant of KernelBench [Ouyang et al. 2025] targeting Triton kernel generation. The torch code was used with a prompt template containing a format example as instruction during both training and evaluation. The model was trained for 10 epochs with a batch size of 32 and a standard SFT recipe with hyperparameters selected by perplexity on a held-out subset of the training data. Training took circa 12 hours wall clock time on 16 GPUs (192 GPU hours), and we report the best checkpoint's validation results. + overrides: + parameters: + model: facebook_KernelLLM-Q4_K_M.gguf + files: + - filename: facebook_KernelLLM-Q4_K_M.gguf + sha256: 947e1f4d48d23bf9a71984b98de65204858ec4e58990c17ef6195dc64838e6d7 + uri: huggingface://bartowski/facebook_KernelLLM-GGUF/facebook_KernelLLM-Q4_K_M.gguf - !!merge <<: *llama33 name: "llama-3.3-magicalgirl-2.5-i1" icon: https://cdn-uploads.huggingface.co/production/uploads/633e85093a17ab61de8d9073/FGK0qBGmELj6DEUxbbrdR.png From ec21b58008381accb28b20b03c0bb9f6526e1800 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 11:15:09 +0200 Subject: [PATCH 04/12] chore(model gallery): add smolvlm-256m-instruct (#5412) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 30 ++++++++++++++++++++++++++++++ gallery/smolvlm.yaml | 19 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 gallery/smolvlm.yaml diff --git a/gallery/index.yaml b/gallery/index.yaml index 725b86bd..5553d790 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,4 +1,34 @@ --- +- &smolvlm: + url: "github:mudler/LocalAI/gallery/smolvlm.yaml@master" + name: "smolvlm-256m-instruct" + icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM_256_banner.png + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct + - https://huggingface.co/ggml-org/SmolVLM-256M-Instruct-GGUF + license: apache-2.0 + description: | + SmolVLM-256M is the smallest multimodal model in the world. It accepts arbitrary sequences of image and text inputs to produce text outputs. It's designed for efficiency. SmolVLM can answer questions about images, describe visual content, or transcribe text. Its lightweight architecture makes it suitable for on-device applications while maintaining strong performance on multimodal tasks. It can run inference on one image with under 1GB of GPU RAM. 
+ tags: + - llm + - gguf + - gpu + - cpu + - vision + - multimodal + - smollvlm + - image-to-text + overrides: + parameters: + model: SmolVLM-256M-Instruct-Q8_0.gguf + mmproj: mmproj-SmolVLM-256M-Instruct-Q8_0.gguf + files: + - filename: mmproj-SmolVLM-256M-Instruct-Q8_0.gguf + sha256: 7e943f7c53f0382a6fc41b6ee0c2def63ba4fded9ab8ed039cc9e2ab905e0edd + uri: huggingface://ggml-org/SmolVLM-256M-Instruct-GGUF/mmproj-SmolVLM-256M-Instruct-Q8_0.gguf + - filename: SmolVLM-256M-Instruct-Q8_0.gguf + sha256: 2a31195d3769c0b0fd0a4906201666108834848db768af11de1d2cef7cd35e65 + uri: huggingface://ggml-org/SmolVLM-256M-Instruct-GGUF/SmolVLM-256M-Instruct-Q8_0.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" diff --git a/gallery/smolvlm.yaml b/gallery/smolvlm.yaml new file mode 100644 index 00000000..2c4ef47e --- /dev/null +++ b/gallery/smolvlm.yaml @@ -0,0 +1,19 @@ +--- +name: smolvlm +# yamllint disable-line rule:trailing-spaces +config_file: | + mmap: true + template: + chat_message: | + {{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }} + chat: "<|im_start|>\n{{.Input -}}\nAssistant: " + completion: | + {{-.Input}} + f16: true + stopwords: + - '<|im_end|>' + - '' + - '' + - '<|' + - '' + - '<|endoftext|>' From 1db51044bbeb336aa6d7dc2c159c99352e9e8744 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 11:25:32 +0200 Subject: [PATCH 05/12] chore(model gallery): add smolvlm-500m-instruct (#5413) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 5553d790..7b4299a1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -29,6 +29,24 @@ - filename: SmolVLM-256M-Instruct-Q8_0.gguf sha256: 2a31195d3769c0b0fd0a4906201666108834848db768af11de1d2cef7cd35e65 uri: huggingface://ggml-org/SmolVLM-256M-Instruct-GGUF/SmolVLM-256M-Instruct-Q8_0.gguf +- !!merge <<: *smolvlm + name: "smolvlm-500m-instruct" + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM-500M-Instruct + - https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF + description: | + SmolVLM-500M is a tiny multimodal model, member of the SmolVLM family. It accepts arbitrary sequences of image and text inputs to produce text outputs. It's designed for efficiency. SmolVLM can answer questions about images, describe visual content, or transcribe text. Its lightweight architecture makes it suitable for on-device applications while maintaining strong performance on multimodal tasks. It can run inference on one image with 1.23GB of GPU RAM. 
+ overrides: + parameters: + model: SmolVLM-500M-Instruct-Q8_0.gguf + mmproj: mmproj-SmolVLM-500M-Instruct-Q8_0.gguf + files: + - filename: mmproj-SmolVLM-500M-Instruct-Q8_0.gguf + sha256: d1eb8b6b23979205fdf63703ed10f788131a3f812c7b1f72e0119d5d81295150 + uri: huggingface://ggml-org/SmolVLM-500M-Instruct-GGUF/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf + - filename: SmolVLM-500M-Instruct-Q8_0.gguf + sha256: 9d4612de6a42214499e301494a3ecc2be0abdd9de44e663bda63f1152fad1bf4 + uri: huggingface://ggml-org/SmolVLM-500M-Instruct-GGUF/SmolVLM-500M-Instruct-Q8_0.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" From 8caaf49f5dea331626b24e88a076774ce99198a9 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 11:35:01 +0200 Subject: [PATCH 06/12] chore(model gallery): add smolvlm-instruct (#5414) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 7b4299a1..601de0f3 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -47,6 +47,25 @@ - filename: SmolVLM-500M-Instruct-Q8_0.gguf sha256: 9d4612de6a42214499e301494a3ecc2be0abdd9de44e663bda63f1152fad1bf4 uri: huggingface://ggml-org/SmolVLM-500M-Instruct-GGUF/SmolVLM-500M-Instruct-Q8_0.gguf +- !!merge <<: *smolvlm + name: "smolvlm-instruct" + icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM.png + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct + - https://huggingface.co/ggml-org/SmolVLM-Instruct-GGUF + description: | + SmolVLM is a compact open multimodal model that accepts arbitrary sequences of image and text inputs to produce text outputs. Designed for efficiency, SmolVLM can answer questions about images, describe visual content, create stories grounded on multiple images, or function as a pure language model without visual inputs. Its lightweight architecture makes it suitable for on-device applications while maintaining strong performance on multimodal tasks. 
+ overrides: + parameters: + model: SmolVLM-Instruct-Q4_K_M.gguf + mmproj: mmproj-SmolVLM-Instruct-Q8_0.gguf + files: + - filename: SmolVLM-Instruct-Q4_K_M.gguf + sha256: dc80966bd84789de64115f07888939c03abb1714d431c477dfb405517a554af5 + uri: https://huggingface.co/ggml-org/SmolVLM-Instruct-GGUF/resolve/main/SmolVLM-Instruct-Q4_K_M.gguf + - filename: mmproj-SmolVLM-Instruct-Q8_0.gguf + sha256: 86b84aa7babf1ab51a6366d973b9d380354e92c105afaa4f172cc76d044da739 + uri: https://huggingface.co/ggml-org/SmolVLM-Instruct-GGUF/resolve/main/mmproj-SmolVLM-Instruct-Q8_0.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" From 6bc2ae5467c2397ca5bc62246e2aed780163817b Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 11:36:22 +0200 Subject: [PATCH 07/12] chore(model gallery): add smolvlm2-2.2b-instruct (#5415) chore(model gallery): add smolvlm-instruct Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 601de0f3..6a470e9b 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -66,6 +66,25 @@ - filename: mmproj-SmolVLM-Instruct-Q8_0.gguf sha256: 86b84aa7babf1ab51a6366d973b9d380354e92c105afaa4f172cc76d044da739 uri: https://huggingface.co/ggml-org/SmolVLM-Instruct-GGUF/resolve/main/mmproj-SmolVLM-Instruct-Q8_0.gguf +- !!merge <<: *smolvlm + name: "smolvlm2-2.2b-instruct" + icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM2_banner.png + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct + - https://huggingface.co/ggml-org/SmolVLM2-2.2B-Instruct-GGUF + description: | + SmolVLM2-2.2B is a lightweight multimodal model designed to analyze video content. The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 5.2GB of GPU RAM for video inference, it delivers robust performance on complex multimodal tasks. This efficiency makes it particularly well-suited for on-device applications where computational resources may be limited. 
+ overrides: + parameters: + model: SmolVLM2-2.2B-Instruct-Q4_K_M.gguf + mmproj: mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf + files: + - filename: SmolVLM2-2.2B-Instruct-Q4_K_M.gguf + sha256: 0cf76814555b8665149075b74ab6b5c1d428ea1d3d01c1918c12012e8d7c9f58 + uri: huggingface://ggml-org/SmolVLM2-2.2B-Instruct-GGUF/SmolVLM2-2.2B-Instruct-Q4_K_M.gguf + - filename: mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf + sha256: ae07ea1facd07dd3230c4483b63e8cda96c6944ad2481f33d531f79e892dd024 + uri: huggingface://ggml-org/SmolVLM2-2.2B-Instruct-GGUF/mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" From e0a54de4f52d661311f57a853787ebfd5d250565 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 11:42:30 +0200 Subject: [PATCH 08/12] chore(model gallery): add smolvlm2-500m-video-instruct (#5416) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 6a470e9b..077c1267 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -85,6 +85,27 @@ - filename: mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf sha256: ae07ea1facd07dd3230c4483b63e8cda96c6944ad2481f33d531f79e892dd024 uri: huggingface://ggml-org/SmolVLM2-2.2B-Instruct-GGUF/mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf +- !!merge <<: *smolvlm + name: "smolvlm2-500m-video-instruct" + icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM2_banner.png + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct + - https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF + description: | + SmolVLM2-500M-Video is a lightweight multimodal model designed to analyze video content. + The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 1.8GB of GPU RAM for video inference, it delivers robust performance on complex multimodal tasks. + This efficiency makes it particularly well-suited for on-device applications where computational resources may be limited. 
+ overrides: + parameters: + model: SmolVLM2-500M-Video-Instruct-f16.gguf + mmproj: mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf + files: + - filename: SmolVLM2-500M-Video-Instruct-f16.gguf + sha256: 80f7e3f04bc2d3324ac1a9f52f5776fe13a69912adf74f8e7edacf773d140d77 + uri: huggingface://ggml-org/SmolVLM2-500M-Video-Instruct-GGUF/SmolVLM2-500M-Video-Instruct-f16.gguf + - filename: mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf + sha256: b5dc8ebe7cbeab66a5369693960a52515d7824f13d4063ceca78431f2a6b59b0 + uri: huggingface://ggml-org/SmolVLM2-500M-Video-Instruct-GGUF/mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" From 0d590a4044b4b4fec6046c54213eaeb8216147cf Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 12:03:02 +0200 Subject: [PATCH 09/12] chore(model gallery): add smolvlm2-256m-video-instruct (#5417) Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gallery/index.yaml b/gallery/index.yaml index 077c1267..75046349 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -106,6 +106,25 @@ - filename: mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf sha256: b5dc8ebe7cbeab66a5369693960a52515d7824f13d4063ceca78431f2a6b59b0 uri: huggingface://ggml-org/SmolVLM2-500M-Video-Instruct-GGUF/mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf +- !!merge <<: *smolvlm + name: "smolvlm2-256m-video-instruct" + icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM2_banner.png + urls: + - https://huggingface.co/HuggingFaceTB/SmolVLM2-256M-Video-Instruct + - https://huggingface.co/ggml-org/SmolVLM2-256M-Video-Instruct-GGUF + description: | + SmolVLM2-256M-Video is a lightweight multimodal model designed to analyze video content. The model processes videos, images, and text inputs to generate text outputs - whether answering questions about media files, comparing visual content, or transcribing text from images. Despite its compact size, requiring only 1.38GB of GPU RAM for video inference. This efficiency makes it particularly well-suited for on-device applications that require specific domain fine-tuning and computational resources may be limited. 
+ overrides: + parameters: + model: SmolVLM2-256M-Video-Instruct-Q8_0.gguf + mmproj: mmproj-SmolVLM2-256M-Video-Instruct-Q8_0.gguf + files: + - filename: SmolVLM2-256M-Video-Instruct-Q8_0.gguf + sha256: af7ce9951a2f46c4f6e5def253e5b896ca5e417010e7a9949fdc9e5175c27767 + uri: huggingface://ggml-org/SmolVLM2-256M-Video-Instruct-GGUF/SmolVLM2-256M-Video-Instruct-Q8_0.gguf + - filename: mmproj-SmolVLM2-256M-Video-Instruct-Q8_0.gguf + sha256: d34913a588464ff7215f086193e0426a4f045eaba74456ee5e2667d8ed6798b1 + uri: huggingface://ggml-org/SmolVLM2-256M-Video-Instruct-GGUF/mmproj-SmolVLM2-256M-Video-Instruct-Q8_0.gguf - &qwen3 url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" name: "qwen3-30b-a3b" From 9af09b3f8ca17ea35adcb4d259519eef6e8d986c Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 12:17:21 +0200 Subject: [PATCH 10/12] chore(model gallery): fixup Signed-off-by: Ettore Di Giacinto --- gallery/index.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery/index.yaml b/gallery/index.yaml index 75046349..7793af81 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -1,5 +1,5 @@ --- -- &smolvlm: +- &smolvlm url: "github:mudler/LocalAI/gallery/smolvlm.yaml@master" name: "smolvlm-256m-instruct" icon: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/SmolVLM_256_banner.png From 04a3d8e5acdeccf9818549cd21dc665534424e61 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 May 2025 12:17:27 +0200 Subject: [PATCH 11/12] feat(ui): add error page to display errors (#5418) Signed-off-by: Ettore Di Giacinto --- core/http/endpoints/localai/gallery.go | 1 + core/http/routes/ui.go | 12 +++++- core/http/views/error.html | 56 ++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 core/http/views/error.html diff --git a/core/http/endpoints/localai/gallery.go b/core/http/endpoints/localai/gallery.go index 9dc99f5d..c2710991 100644 --- a/core/http/endpoints/localai/gallery.go +++ b/core/http/endpoints/localai/gallery.go @@ -120,6 +120,7 @@ func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *f models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath) if err != nil { + log.Error().Err(err).Msg("could not list models from galleries") return err } diff --git a/core/http/routes/ui.go b/core/http/routes/ui.go index 373a983b..7cfb1aa0 100644 --- a/core/http/routes/ui.go +++ b/core/http/routes/ui.go @@ -131,7 +131,17 @@ func RegisterUIRoutes(app *fiber.App, page := c.Query("page") items := c.Query("items") - models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath) + models, err := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath) + if err != nil { + log.Error().Err(err).Msg("could not list models from galleries") + return c.Status(fiber.StatusInternalServerError).Render("views/error", fiber.Map{ + "Title": "LocalAI - Models", + "BaseURL": utils.BaseURL(c), + "Version": internal.PrintableVersion(), + "ErrorCode": "500", + "ErrorMessage": err.Error(), + }) + } // Get all available tags allTags := map[string]struct{}{} diff --git a/core/http/views/error.html b/core/http/views/error.html new file mode 100644 index 00000000..83ce0438 --- /dev/null +++ b/core/http/views/error.html @@ -0,0 +1,56 @@ + + +{{template "views/partials/head" .}} + + +
+{{template "views/partials/navbar" .}}
+{{if .ErrorCode}}{{.ErrorCode}}{{else}}Error{{end}}
+{{if .ErrorMessage}}{{.ErrorMessage}}{{else}}An unexpected error occurred{{end}}
+Need help?
+Visit our 🖼️ Gallery or check the Getting started documentation
+{{template "views/partials/footer" .}}
\ No newline at end of file

From 82811a963016a7e39782efdd1d22058376407a15 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Tue, 20 May 2025 20:28:31 +0200
Subject: [PATCH 12/12] fix(transformers): pin protobuf (#5421)

Signed-off-by: Ettore Di Giacinto
---
 backend/python/transformers/requirements-cpu.txt      | 3 ++-
 backend/python/transformers/requirements-cublas11.txt | 1 +
 backend/python/transformers/requirements-cublas12.txt | 1 +
 backend/python/transformers/requirements-hipblas.txt  | 1 +
 backend/python/transformers/requirements-intel.txt    | 1 +
 backend/python/transformers/requirements.txt          | 2 +-
 6 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/backend/python/transformers/requirements-cpu.txt b/backend/python/transformers/requirements-cpu.txt
index 79863c2b..065c5c2d 100644
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -5,4 +5,5 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.1
\ No newline at end of file
+sentence-transformers==3.4.1
+protobuf==6.31.0
\ No newline at end of file
diff --git a/backend/python/transformers/requirements-cublas11.txt b/backend/python/transformers/requirements-cublas11.txt
index fa9f8953..176eb26c 100644
--- a/backend/python/transformers/requirements-cublas11.txt
+++ b/backend/python/transformers/requirements-cublas11.txt
@@ -7,3 +7,4 @@ transformers
 bitsandbytes
 outetts
 sentence-transformers==3.4.1
+protobuf==6.31.0
\ No newline at end of file
diff --git a/backend/python/transformers/requirements-cublas12.txt b/backend/python/transformers/requirements-cublas12.txt
index 127bfb21..373e18a1 100644
--- a/backend/python/transformers/requirements-cublas12.txt
+++ b/backend/python/transformers/requirements-cublas12.txt
@@ -6,3 +6,4 @@ transformers
 bitsandbytes
 outetts
 sentence-transformers==3.4.1
+protobuf==6.31.0
\ No newline at end of file
diff --git a/backend/python/transformers/requirements-hipblas.txt b/backend/python/transformers/requirements-hipblas.txt
index c0ca93ee..ba0d4ea1 100644
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -8,3 +8,4 @@ bitsandbytes
 outetts
 bitsandbytes
 sentence-transformers==3.4.1
+protobuf==6.31.0
\ No newline at end of file
diff --git a/backend/python/transformers/requirements-intel.txt b/backend/python/transformers/requirements-intel.txt
index 1418a3c3..6e6116c3 100644
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -9,3 +9,4 @@ intel-extension-for-transformers
 bitsandbytes
 outetts
 sentence-transformers==3.4.1
+protobuf==6.31.0
\ No newline at end of file
diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt
index ce8bfd6c..062c4139 100644
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,5 +1,5 @@
 grpcio==1.72.0
-protobuf
+protobuf==6.31.0
 certifi
 setuptools
 scipy==1.15.1
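
For context on how the error view added in PATCH 11/12 is consumed: the template only reads the Title, BaseURL, Version, ErrorCode and ErrorMessage keys passed to Render in core/http/routes/ui.go. A minimal sketch of how another UI handler could reuse that same pattern is below; the renderError helper and its baseURL/version parameters are illustrative and not part of this patch series, and the values are assumed to be computed by the caller (for example via utils.BaseURL(c) and internal.PrintableVersion()).

// Hypothetical helper (not part of the patch): mirrors the Render call
// added in core/http/routes/ui.go so other UI handlers can surface errors
// through the same views/error template.
package routes

import (
	"fmt"

	"github.com/gofiber/fiber/v2"
)

// renderError sets the HTTP status and renders views/error with the same
// fiber.Map keys the template reads. baseURL and version are assumed to be
// supplied by the caller.
func renderError(c *fiber.Ctx, baseURL, version string, status int, err error) error {
	return c.Status(status).Render("views/error", fiber.Map{
		"Title":        "LocalAI - Error",
		"BaseURL":      baseURL,
		"Version":      version,
		"ErrorCode":    fmt.Sprintf("%d", status),
		"ErrorMessage": err.Error(),
	})
}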