Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-20 10:35:01 +00:00
Compare commits
473 commits
Commit table (Author | SHA1 | Date): 473 rows, from 04a3d8e5ac to 20119fc580. Only the abbreviated commit SHAs were preserved in this view; the avatar, author, and date columns did not survive the export.
235 changed files with 11873 additions and 31408 deletions
.env (5 changes)

@@ -29,6 +29,9 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
 
+# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
+# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -73,7 +76,7 @@
 
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 
 ### Enable to run parallel requests
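The two settings added above pair with the existing single-backend switch, and LLAMACPP_GRPC_SERVERS (unchanged here apart from the updated README link) takes a comma-separated list of host:port pairs pointing at llama.cpp RPC workers. A minimal sketch of a resulting .env, with illustrative addresses and assuming the worker processes are started separately with the rpc-server tool documented in the linked tools/rpc directory:

```bash
# Sketch only: addresses and ports are illustrative, not part of this diff.
# Keep a single backend loaded and let LocalAI stop a busy backend when switching models.
LOCALAI_SINGLE_ACTIVE_BACKEND=true
LOCALAI_FORCE_BACKEND_SHUTDOWN=true

# Distribute llama-cpp work across two RPC workers (host:port, comma-separated);
# the workers come from the llama.cpp rpc-server described in tools/rpc/README.md.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
```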
.github/dependabot.yml (4 changes, vendored)

@@ -29,10 +29,6 @@ updates:
 schedule:
 # Check for updates to GitHub Actions every weekday
 interval: "weekly"
-- package-ecosystem: "pip"
-directory: "/backend/python/autogptq"
-schedule:
-interval: "weekly"
 - package-ecosystem: "pip"
 directory: "/backend/python/bark"
 schedule:
.github/workflows/bump_deps.yaml (4 changes, vendored)

@@ -9,10 +9,10 @@ jobs:
 fail-fast: false
 matrix:
 include:
-- repository: "ggerganov/llama.cpp"
+- repository: "ggml-org/llama.cpp"
 variable: "CPPLLAMA_VERSION"
 branch: "master"
-- repository: "ggerganov/whisper.cpp"
+- repository: "ggml-org/whisper.cpp"
 variable: "WHISPER_CPP_VERSION"
 branch: "master"
 - repository: "PABannier/bark.cpp"
.github/workflows/dependabot_auto.yml (2 changes, vendored)

@@ -14,7 +14,7 @@ jobs:
 steps:
 - name: Dependabot metadata
 id: metadata
-uses: dependabot/fetch-metadata@v2.3.0
+uses: dependabot/fetch-metadata@v2.4.0
 with:
 github-token: "${{ secrets.GITHUB_TOKEN }}"
 skip-commit-verification: true
.github/workflows/deploy-explorer.yaml (6 changes, vendored)

@@ -33,7 +33,7 @@ jobs:
 run: |
 CGO_ENABLED=0 make build-api
 - name: rm
-uses: appleboy/ssh-action@v1.2.0
+uses: appleboy/ssh-action@v1.2.2
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
 script: |
 sudo rm -rf local-ai/ || true
 - name: copy file via ssh
-uses: appleboy/scp-action@v0.1.7
+uses: appleboy/scp-action@v1.0.0
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
 rm: true
 target: ./local-ai
 - name: restarting
-uses: appleboy/ssh-action@v1.2.0
+uses: appleboy/ssh-action@v1.2.2
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
.github/workflows/generate_intel_image.yaml (2 changes, vendored)

@@ -15,7 +15,7 @@ jobs:
 strategy:
 matrix:
 include:
-- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
 runs-on: 'ubuntu-latest'
 platforms: 'linux/amd64'
 runs-on: ${{matrix.runs-on}}
.github/workflows/image-pr.yml (50 changes, vendored)
@ -33,6 +33,7 @@ jobs:
|
||||||
# Pushing with all jobs in parallel
|
# Pushing with all jobs in parallel
|
||||||
# eats the bandwidth of all the nodes
|
# eats the bandwidth of all the nodes
|
||||||
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
|
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
|
||||||
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
# This is basically covered by the AIO test
|
# This is basically covered by the AIO test
|
||||||
|
@ -56,26 +57,35 @@ jobs:
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# - build-type: 'hipblas'
|
- build-type: 'hipblas'
|
||||||
# platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
# tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
# tag-suffix: '-hipblas'
|
tag-suffix: '-hipblas'
|
||||||
# ffmpeg: 'false'
|
ffmpeg: 'false'
|
||||||
# image-type: 'extras'
|
image-type: 'extras'
|
||||||
# base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
# grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
# runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
# makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# - build-type: 'sycl_f16'
|
- build-type: 'sycl_f16'
|
||||||
# platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
# tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
# grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
# tag-suffix: 'sycl-f16-ffmpeg'
|
tag-suffix: 'sycl-f16-ffmpeg'
|
||||||
# ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
# image-type: 'extras'
|
image-type: 'extras'
|
||||||
# runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
# makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'false'
|
||||||
|
tag-suffix: '-vulkan-ffmpeg-core'
|
||||||
|
ffmpeg: 'true'
|
||||||
|
image-type: 'core'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:22.04"
|
||||||
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
# core-image-build:
|
# core-image-build:
|
||||||
# uses: ./.github/workflows/image_build.yml
|
# uses: ./.github/workflows/image_build.yml
|
||||||
# with:
|
# with:
|
||||||
|
|
.github/workflows/image.yml (167 changes, vendored)
@ -45,13 +45,13 @@ jobs:
|
||||||
- build-type: 'hipblas'
|
- build-type: 'hipblas'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'auto'
|
||||||
tag-suffix: '-hipblas-ffmpeg'
|
tag-suffix: '-hipblas-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
aio: "-aio-gpu-hipblas"
|
aio: "-aio-gpu-hipblas"
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
latest-image: 'latest-gpu-hipblas'
|
latest-image: 'latest-gpu-hipblas-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-hipblas'
|
latest-image-aio: 'latest-aio-gpu-hipblas'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
@ -59,32 +59,13 @@ jobs:
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-hipblas'
|
tag-suffix: '-hipblas'
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'extras'
|
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'hipblas'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-hipblas-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'hipblas'
|
latest-image: 'latest-gpu-hipblas'
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-hipblas-core'
|
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
self-hosted-jobs:
|
self-hosted-jobs:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
with:
|
with:
|
||||||
|
@ -114,110 +95,58 @@ jobs:
|
||||||
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
# Extra images
|
|
||||||
- build-type: ''
|
|
||||||
#platforms: 'linux/amd64,linux/arm64'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: ''
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: ''
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: '-ffmpeg'
|
|
||||||
ffmpeg: 'true'
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "11"
|
cuda-major-version: "11"
|
||||||
cuda-minor-version: "7"
|
cuda-minor-version: "7"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda11'
|
tag-suffix: '-cublas-cuda11-extras'
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "12"
|
|
||||||
cuda-minor-version: "0"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda12'
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "11"
|
|
||||||
cuda-minor-version: "7"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: '-cublas-cuda11-ffmpeg'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
aio: "-aio-gpu-nvidia-cuda-11"
|
aio: "-aio-gpu-nvidia-cuda-11"
|
||||||
latest-image: 'latest-gpu-nvidia-cuda-11'
|
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
|
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "12"
|
cuda-major-version: "12"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda12-ffmpeg'
|
tag-suffix: '-cublas-cuda12-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
aio: "-aio-gpu-nvidia-cuda-12"
|
aio: "-aio-gpu-nvidia-cuda-12"
|
||||||
latest-image: 'latest-gpu-nvidia-cuda-12'
|
latest-image: 'latest-gpu-nvidia-cuda-12-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
|
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: ''
|
|
||||||
#platforms: 'linux/amd64,linux/arm64'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: ''
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f16'
|
- build-type: 'sycl_f16'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f16-ffmpeg'
|
tag-suffix: '-sycl-f16-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
aio: "-aio-gpu-intel-f16"
|
aio: "-aio-gpu-intel-f16"
|
||||||
latest-image: 'latest-gpu-intel-f16'
|
latest-image: 'latest-gpu-intel-f16-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-intel-f16'
|
latest-image-aio: 'latest-aio-gpu-intel-f16'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'sycl_f32'
|
- build-type: 'sycl_f32'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f32-ffmpeg'
|
tag-suffix: '-sycl-f32-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
aio: "-aio-gpu-intel-f32"
|
aio: "-aio-gpu-intel-f32"
|
||||||
latest-image: 'latest-gpu-intel-f32'
|
latest-image: 'latest-gpu-intel-f32-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-intel-f32'
|
latest-image-aio: 'latest-aio-gpu-intel-f32'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# Core images
|
# Core images
|
||||||
|
@ -226,41 +155,23 @@ jobs:
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f16-core'
|
tag-suffix: '-sycl-f16'
|
||||||
ffmpeg: 'false'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-intel-f16'
|
||||||
- build-type: 'sycl_f32'
|
- build-type: 'sycl_f32'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f32-core'
|
tag-suffix: '-sycl-f32'
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'core'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f16'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
tag-suffix: '-sycl-f16-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
|
||||||
image-type: 'core'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f32'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
tag-suffix: '-sycl-f32-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-intel-f32'
|
||||||
|
|
||||||
core-image-build:
|
core-image-build:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
|
@ -293,7 +204,7 @@ jobs:
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
platforms: 'linux/amd64,linux/arm64'
|
platforms: 'linux/amd64,linux/arm64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'auto'
|
||||||
tag-suffix: '-ffmpeg-core'
|
tag-suffix: ''
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
|
@ -308,60 +219,38 @@ jobs:
|
||||||
cuda-minor-version: "7"
|
cuda-minor-version: "7"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda11-core'
|
tag-suffix: '-cublas-cuda11'
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
|
||||||
skip-drivers: 'false'
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "12"
|
|
||||||
cuda-minor-version: "0"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda12-core'
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
|
||||||
skip-drivers: 'false'
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "11"
|
|
||||||
cuda-minor-version: "7"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda11-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
|
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "12"
|
cuda-major-version: "12"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda12-ffmpeg-core'
|
tag-suffix: '-cublas-cuda12'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||||
- build-type: 'vulkan'
|
- build-type: 'vulkan'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-vulkan-ffmpeg-core'
|
tag-suffix: '-vulkan'
|
||||||
latest-image: 'latest-vulkan-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-vulkan'
|
||||||
gh-runner:
|
gh-runner:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
with:
|
with:
|
||||||
|
@ -394,8 +283,8 @@ jobs:
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/arm64'
|
platforms: 'linux/arm64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-nvidia-l4t-arm64-core'
|
tag-suffix: '-nvidia-l4t-arm64'
|
||||||
latest-image: 'latest-nvidia-l4t-arm64-core'
|
latest-image: 'latest-nvidia-l4t-arm64'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
|
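The image.yml matrix above reshuffles the published tag names: ffmpeg is now always enabled, the old -ffmpeg, -ffmpeg-core, and -core suffixes are dropped, images built with the Python extras gain an explicit -extras suffix (for example latest-gpu-nvidia-cuda-12-extras, latest-gpu-hipblas-extras, latest-gpu-intel-f16-extras), and the plain GPU tags now point at the core images. A hedged example of pulling under the new scheme, assuming the quay.io/go-skynet/local-ai registry path the project publishes to:

```bash
# Illustrative tag names taken from the matrix above; the registry path is an assumption.
docker pull quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12          # core CUDA 12 image, ffmpeg included
docker pull quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12-extras   # same, plus the Python extra backends
```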
.github/workflows/image_build.yml (5 changes, vendored)

@@ -310,6 +310,11 @@ jobs:
 tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
 labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
 
+- name: Cleanup
+run: |
+docker builder prune -f
+docker system prune --force --volumes --all
+
 - name: Latest tag
 # run this on branches, when it is a tag and there is a latest-image defined
 if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
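The new Cleanup step reclaims disk on the self-hosted runners after each image build by dropping the BuildKit cache and any unused images, containers, and volumes. The same commands can be run by hand on a runner that is low on space; `docker system df` is not part of this diff but is a convenient way to see how much a prune will reclaim first:

```bash
# Inspect reclaimable space, then prune the build cache and unused Docker objects,
# mirroring the workflow's Cleanup step.
docker system df
docker builder prune -f
docker system prune --force --volumes --all
```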
.github/workflows/notify-models.yaml (10 changes, vendored)
@ -8,7 +8,7 @@ jobs:
|
||||||
notify-discord:
|
notify-discord:
|
||||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||||
env:
|
env:
|
||||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
MODEL_NAME: gemma-3-12b-it
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@ -16,7 +16,7 @@ jobs:
|
||||||
fetch-depth: 0 # needed to checkout all branches for this Action to work
|
fetch-depth: 0 # needed to checkout all branches for this Action to work
|
||||||
- uses: mudler/localai-github-action@v1
|
- uses: mudler/localai-github-action@v1
|
||||||
with:
|
with:
|
||||||
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||||
# Check the PR diff using the current branch and the base branch of the PR
|
# Check the PR diff using the current branch and the base branch of the PR
|
||||||
- uses: GrantBirki/git-diff-action@v2.8.0
|
- uses: GrantBirki/git-diff-action@v2.8.0
|
||||||
id: git-diff-action
|
id: git-diff-action
|
||||||
|
@ -79,7 +79,7 @@ jobs:
|
||||||
args: ${{ steps.summarize.outputs.message }}
|
args: ${{ steps.summarize.outputs.message }}
|
||||||
- name: Setup tmate session if fails
|
- name: Setup tmate session if fails
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -87,7 +87,7 @@ jobs:
|
||||||
notify-twitter:
|
notify-twitter:
|
||||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||||
env:
|
env:
|
||||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
MODEL_NAME: gemma-3-12b-it
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@ -161,7 +161,7 @@ jobs:
|
||||||
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
||||||
- name: Setup tmate session if fails
|
- name: Setup tmate session if fails
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
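Both notification jobs above switch the summarizing model from hermes-2-theta-llama-3-8b to gemma-3-12b-it, installed through mudler/localai-github-action from the models.localai.io gallery. As a rough local equivalent (a sketch, assuming the local-ai CLI is installed and that its run command resolves gallery names the same way the action does):

```bash
# Sketch: fetch and serve the gallery model used by the notification workflows.
local-ai run gemma-3-12b-it
```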
.github/workflows/notify-releases.yaml (4 changes, vendored)

@@ -14,7 +14,7 @@ jobs:
 steps:
 - uses: mudler/localai-github-action@v1
 with:
-model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
 - name: Summarize
 id: summarize
 run: |
@@ -60,4 +60,4 @@ jobs:
 DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
 uses: Ilshidur/action-discord@master
 with:
 args: ${{ steps.summarize.outputs.message }}
.github/workflows/release.yaml (16 changes, vendored)
@ -36,6 +36,7 @@ jobs:
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
|
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
|
||||||
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
|
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
|
||||||
|
make install-go-tools
|
||||||
- name: Install CUDA Dependencies
|
- name: Install CUDA Dependencies
|
||||||
run: |
|
run: |
|
||||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
|
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
|
||||||
|
@ -123,7 +124,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -151,6 +152,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
|
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
|
||||||
|
make install-go-tools
|
||||||
- name: Intel Dependencies
|
- name: Intel Dependencies
|
||||||
run: |
|
run: |
|
||||||
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
||||||
|
@ -232,7 +234,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -253,8 +255,7 @@ jobs:
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc
|
brew install protobuf grpc
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
|
make install-go-tools
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: build
|
id: build
|
||||||
run: |
|
run: |
|
||||||
|
@ -275,7 +276,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -295,8 +296,7 @@ jobs:
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc libomp llvm
|
brew install protobuf grpc libomp llvm
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
make install-go-tools
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: build
|
id: build
|
||||||
run: |
|
run: |
|
||||||
|
@ -317,7 +317,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
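Several release.yaml jobs above replace their hand-written `go install` lines with a single `make install-go-tools` call. The target itself is not part of this diff; a minimal sketch of what it plausibly wraps, inferred from the `go install` commands it replaces here and from the matching lines that test.yml and the Dockerfile further down still run directly:

```bash
# Hypothetical equivalent of `make install-go-tools` (inferred, not the actual Makefile target).
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
```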
.github/workflows/secscan.yaml (2 changes, vendored)

@@ -18,7 +18,7 @@ jobs:
 if: ${{ github.actor != 'dependabot[bot]' }}
 - name: Run Gosec Security Scanner
 if: ${{ github.actor != 'dependabot[bot]' }}
-uses: securego/gosec@v2.22.0
+uses: securego/gosec@v2.22.4
 with:
 # we let the report trigger content trigger a failure using the GitHub Security features.
 args: '-no-fail -fmt sarif -out results.sarif ./...'
.github/workflows/test-extra.yml (20 changes, vendored)
@ -78,6 +78,26 @@ jobs:
|
||||||
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
||||||
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
||||||
|
|
||||||
|
#tests-vllm:
|
||||||
|
# runs-on: ubuntu-latest
|
||||||
|
# steps:
|
||||||
|
# - name: Clone
|
||||||
|
# uses: actions/checkout@v4
|
||||||
|
# with:
|
||||||
|
# submodules: true
|
||||||
|
# - name: Dependencies
|
||||||
|
# run: |
|
||||||
|
# sudo apt-get update
|
||||||
|
# sudo apt-get install -y build-essential ffmpeg
|
||||||
|
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||||
|
# sudo apt-get install -y libopencv-dev
|
||||||
|
# # Install UV
|
||||||
|
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
# pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||||
|
# - name: Test vllm backend
|
||||||
|
# run: |
|
||||||
|
# make --jobs=5 --output-sync=target -C backend/python/vllm
|
||||||
|
# make --jobs=5 --output-sync=target -C backend/python/vllm test
|
||||||
# tests-transformers-musicgen:
|
# tests-transformers-musicgen:
|
||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
|
|
.github/workflows/test.yml (11 changes, vendored)
@ -71,7 +71,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
||||||
sudo apt-get install -y libgmock-dev
|
sudo apt-get install -y libgmock-dev clang
|
||||||
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
|
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
|
||||||
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
|
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
|
||||||
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
|
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
|
||||||
|
@ -96,6 +96,7 @@ jobs:
|
||||||
|
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
|
|
||||||
# The python3-grpc-tools package in 22.04 is too old
|
# The python3-grpc-tools package in 22.04 is too old
|
||||||
pip install --user grpcio-tools
|
pip install --user grpcio-tools
|
||||||
|
@ -130,7 +131,7 @@ jobs:
|
||||||
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -183,6 +184,7 @@ jobs:
|
||||||
rm protoc.zip
|
rm protoc.zip
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||||
- name: Build images
|
- name: Build images
|
||||||
run: |
|
run: |
|
||||||
|
@ -194,7 +196,7 @@ jobs:
|
||||||
make run-e2e-aio
|
make run-e2e-aio
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -222,6 +224,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
||||||
pip install --user --no-cache-dir grpcio-tools
|
pip install --user --no-cache-dir grpcio-tools
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
export C_INCLUDE_PATH=/usr/local/include
|
export C_INCLUDE_PATH=/usr/local/include
|
||||||
|
@ -232,7 +235,7 @@ jobs:
|
||||||
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
Dockerfile (18 changes)
@ -15,7 +15,7 @@ ARG TARGETARCH
|
||||||
ARG TARGETVARIANT
|
ARG TARGETVARIANT
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
 apt-get install -y --no-install-recommends \
@@ -24,6 +24,7 @@ RUN apt-get update && \
 ca-certificates \
 curl libssl-dev \
 git \
+git-lfs \
 unzip upx-ucl && \
 apt-get clean && \
 rm -rf /var/lib/apt/lists/*
@@ -45,9 +46,10 @@ EOT
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

-# Install grpc compilers
+# Install grpc compilers and rice
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
+go install github.com/GeertJohan/go.rice/rice@latest

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -299,10 +301,9 @@ COPY .git .
 RUN make prepare

 ## Build the binary
-## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-## (both will use CUDA or hipblas for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
+## Otherwise just run the normal build
+RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
 SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
 else \
 make build; \
@@ -430,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
 make -C backend/python/vllm \
 ; fi && \
-if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-make -C backend/python/autogptq \
-; fi && \
 if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
 make -C backend/python/bark \
 ; fi && \
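A quick sketch of how the build switch above is typically driven (BUILD_TYPE and IMAGE_TYPE are build arguments already used in this Dockerfile; the image tags and the buildx platform flag are illustrative assumptions):

```bash
# CUDA build on amd64: all llama-cpp CPU variants are kept
docker build --build-arg BUILD_TYPE=cublas --build-arg IMAGE_TYPE=core -t localai:cublas-dev .

# arm64 build: the RUN step above skips the avx/avx2/avx512 llama-cpp variants to save space
docker buildx build --platform linux/arm64 --build-arg BUILD_TYPE=cublas -t localai:cublas-arm64 .
```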
Makefile (102 changes)
@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
+CPPLLAMA_VERSION?=6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c

 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
+WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
+WHISPER_CPP_VERSION?=d1f114da61b1ae1e70b03104fad42c9dd666feeb

 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
@@ -21,8 +21,11 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0

 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
+STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae

+# ONEAPI variables for SYCL
+export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh

 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -30,8 +33,12 @@ ONNX_OS?=linux

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=
+export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
 export BACKEND_LIBS?=
+export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
+export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
+export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src

 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
@@ -81,6 +88,7 @@ endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
 CMAKE_ARGS+=-DGGML_NATIVE=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif

 # Detect if we are running on arm64
@@ -108,13 +116,31 @@ ifeq ($(OS),Darwin)
 # disable metal if on Darwin and any other value is explicitly passed.
 else ifneq ($(BUILD_TYPE),metal)
 CMAKE_ARGS+=-DGGML_METAL=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
 export GGML_NO_ACCELERATE=1
 export GGML_NO_METAL=1
+GO_LDFLAGS_WHISPER+=-lggml-blas
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif

 ifeq ($(BUILD_TYPE),metal)
-# -lcblas removed: it seems to always be listed as a duplicate flag.
 CGO_LDFLAGS += -framework Accelerate
+CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
+CMAKE_ARGS+=-DGGML_METAL=ON
+CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+CMAKE_ARGS+=-DGGML_OPENMP=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
+WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
+else
+CGO_LDFLAGS_WHISPER+=-lggml-blas
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif
 else
 CGO_LDFLAGS_WHISPER+=-lgomp
@@ -126,21 +152,29 @@ ifeq ($(BUILD_TYPE),openblas)
 endif

 ifeq ($(BUILD_TYPE),cublas)
-CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
 export GGML_CUDA=1
-CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
+CMAKE_ARGS+=-DGGML_CUDA=ON
+WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
+CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
 endif

 ifeq ($(BUILD_TYPE),vulkan)
 CMAKE_ARGS+=-DGGML_VULKAN=1
+WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
+CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
 endif

 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 export GGML_SYCL=1
+CMAKE_ARGS+=-DGGML_SYCL=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
 export GGML_SYCL_F16=1
+CMAKE_ARGS+=-DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -151,7 +185,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 export CC=$(ROCM_HOME)/llvm/bin/clang
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
-GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
+GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
 AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
@@ -260,11 +294,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
-$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
-ifneq ($(UPX),)
-$(UPX) backend-assets/grpc/stablediffusion-ggml
-endif
+$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml

 sources/onnxruntime:
 mkdir -p sources/onnxruntime
@@ -290,8 +320,9 @@ sources/whisper.cpp:
 git checkout $(WHISPER_CPP_VERSION) && \
 git submodule update --init --recursive --depth 1 --single-branch

-sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
+sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
-cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
+cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
+cd sources/whisper.cpp/build && cmake --build . --config Release

 get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp

@@ -341,8 +372,14 @@ clean-tests:
 clean-dc: clean
 cp -r /build/backend-assets /workspace/backend-assets

+## Install Go tools
+install-go-tools:
+go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+go install github.com/GeertJohan/go.rice/rice@latest

 ## Build:
-build: prepare backend-assets grpcs ## Build the project
+build: prepare backend-assets grpcs install-go-tools ## Build the project
 $(info ${GREEN}I local-ai build info:${RESET})
 $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -352,7 +389,9 @@ ifneq ($(BACKEND_LIBS),)
 $(MAKE) backend-assets/lib
 cp -f $(BACKEND_LIBS) backend-assets/lib/
 endif
+rm -rf $(BINARY_NAME) || true
 CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
+rice append --exec $(BINARY_NAME)

 build-minimal:
 BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -424,6 +463,7 @@ prepare-test: grpcs
 cp -rf backend-assets core/http
 cp tests/models_fixtures/* test-models

+## Test targets
 test: prepare test-models/testmodel.ggml grpcs
 @echo 'Running tests'
 export GO_TAGS="tts debug"
@@ -498,7 +538,7 @@ protogen: protogen-go protogen-python
 protogen-clean: protogen-go-clean protogen-python-clean

 .PHONY: protogen-go
-protogen-go:
+protogen-go: install-go-tools
 mkdir -p pkg/grpc/proto
 protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
 backend/backend.proto
@@ -509,18 +549,10 @@ protogen-go-clean:
 $(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

-.PHONY: autogptq-protogen
-autogptq-protogen:
-$(MAKE) -C backend/python/autogptq protogen

-.PHONY: autogptq-protogen-clean
-autogptq-protogen-clean:
-$(MAKE) -C backend/python/autogptq protogen-clean

 .PHONY: bark-protogen
 bark-protogen:
@@ -597,7 +629,6 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
-$(MAKE) -C backend/python/autogptq
 $(MAKE) -C backend/python/bark
 $(MAKE) -C backend/python/coqui
 $(MAKE) -C backend/python/diffusers
@@ -611,10 +642,12 @@ prepare-extra-conda-environments: protogen-python
 prepare-test-extra: protogen-python
 $(MAKE) -C backend/python/transformers
 $(MAKE) -C backend/python/diffusers
+$(MAKE) -C backend/python/vllm

 test-extra: prepare-test-extra
 $(MAKE) -C backend/python/transformers test
 $(MAKE) -C backend/python/diffusers test
+$(MAKE) -C backend/python/vllm test

 backend-assets:
 mkdir -p backend-assets
@@ -756,8 +789,8 @@ ifneq ($(UPX),)
 $(UPX) backend-assets/grpc/silero-vad
 endif

-backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
+backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
+CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
 $(UPX) backend-assets/grpc/whisper
@@ -809,7 +842,8 @@ docker-aio-all:

 docker-image-intel:
 docker build \
---build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+--progress plain \
+--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 --build-arg GO_TAGS="none" \
 --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -817,7 +851,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 docker build \
---build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
 --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 --build-arg GO_TAGS="none" \
 --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
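As orientation for the new targets above, a minimal local workflow sketch (assuming Go and protoc are already installed; the target names are the ones defined in this Makefile):

```bash
# Install the pinned protoc plugins and go.rice, then regenerate the Go gRPC stubs
make install-go-tools
make protogen-go

# `build` now depends on install-go-tools and embeds assets with `rice append`
make build
```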
README.md (172 changes)
@@ -1,7 +1,6 @@
 <h1 align="center">
 <br>
-<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
+<img height="300" src="./core/http/static/logo.png"> <br>
-LocalAI
 <br>
 </h1>

@@ -31,7 +30,7 @@

 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
 </a>
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>

@@ -44,35 +43,154 @@

 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
+[](https://t.me/localaiofficial_bot)

 [](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-
+## 📚🆕 Local Stack Family

+🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:

+<table>
+<tr>
+<td width="50%" valign="top">
+<a href="https://github.com/mudler/LocalAGI">
+<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
+</a>
+</td>
+<td width="50%" valign="top">
+<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
+<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
+</td>
+</tr>
+<tr>
+<td width="50%" valign="top">
+<a href="https://github.com/mudler/LocalRecall">
+<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
+</a>
+</td>
+<td width="50%" valign="top">
+<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
+<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
+</td>
+</tr>
+</table>

+## Screenshots

+| Talk Interface | Generate Audio |
+| --- | --- |
+|  |  |

+| Models Overview | Generate Images |
+| --- | --- |
+|  |  |

+| Chat Interface | Home |
+| --- | --- |
+|  |  |

+| Login | Swarm |
+| --- | --- |
+| |  |

+## 💻 Quickstart

 Run the installer script:

 ```bash
+# Basic installation
 curl https://localai.io/install.sh | sh
 ```

-Or run with docker:
+For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
-```bash
-# CPU only image:
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu

-# Nvidia GPU:
+Or run with docker:

+### CPU only image:

+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+```

+### NVIDIA GPU Images:

+```bash
+# CUDA 12.0 with core features
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

-# CPU and GPU image (bigger size):
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+# CUDA 12.0 with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12-extras

-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+# CUDA 11.7 with core features
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11

+# CUDA 11.7 with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11-extras

+# NVIDIA Jetson (L4T) ARM64
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
 ```

+### AMD GPU Images (ROCm):

+```bash
+# ROCm with core features
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas

+# ROCm with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas-extras
+```

+### Intel GPU Images (oneAPI):

+```bash
+# Intel GPU with FP16 support
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16

+# Intel GPU with FP16 support and extra dependencies
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16-extras

+# Intel GPU with FP32 support
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32

+# Intel GPU with FP32 support and extra dependencies
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32-extras
+```

+### Vulkan GPU Images:

+```bash
+# Vulkan with core features
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
+```

+### AIO Images (pre-downloaded models):

+```bash
+# CPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

+# NVIDIA CUDA 12 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12

+# NVIDIA CUDA 11 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11

+# Intel GPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16

+# AMD GPU version
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
+```

+For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).

 To load models:

 ```bash
@@ -88,10 +206,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

-[💻 Getting started](https://localai.io/basics/getting_started/index.html)
+For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

+- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
+- Apr 2025: WebUI overhaul, AIO images updates
+- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -105,19 +226,6 @@ local-ai run oci://localai/phi-2:latest

 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-## 🔥🔥 Hot topics (looking for help):
-
-- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
-- Realtime API https://github.com/mudler/LocalAI/issues/3714
-- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
-- Backends v2: https://github.com/mudler/LocalAI/issues/1126
-- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
-- Assistant API: https://github.com/mudler/LocalAI/issues/1273
-- Vulkan: https://github.com/mudler/LocalAI/issues/1647
-- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
-
-If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -131,12 +239,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

-## 💻 Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

 ### 🔗 Community and integrations

@@ -212,7 +318,7 @@ A huge thank you to our generous sponsors who support this project covering CI e

 <p align="center">
 <a href="https://www.spectrocloud.com/" target="blank">
-<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
+<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
 </a>
 <a href="https://www.premai.io/" target="blank">
 <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
@@ -1,7 +1,7 @@
-name: text-embedding-ada-002
 embeddings: true
+name: text-embedding-ada-002
 parameters:
-model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
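The usage note in this config points at an OpenAI-style embeddings request; a hedged sketch of such a call (the exact snippet stored in the file is not shown in this diff, so the payload below is illustrative):

```bash
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
  "input": "Your text string goes here",
  "model": "text-embedding-ada-002"
}'
```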
@@ -1,101 +1,57 @@
-name: gpt-4
-mmap: true
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
+f16: true
-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
-replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+no_mixed_free_string: true
+schema_type: llama3.1 # or JSON is supported too (json)
+response_regex:
+- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
+mmap: true
+name: gpt-4
+parameters:
+model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- <|eot_id|>
+- <|end_of_text|>

 template:
 chat: |
-{{.Input -}}
-<|im_start|>assistant
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+{{.Input }}
+<|start_header_id|>assistant<|end_header_id|>
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-{{- if .FunctionCall }}
-<tool_call>
-{{- else if eq .RoleName "tool" }}
-<tool_response>
-{{- end }}
-{{- if .Content}}
-{{.Content }}
-{{- end }}
-{{- if .FunctionCall}}
-{{toJson .FunctionCall}}
-{{- end }}
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
+<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
+{{ if .FunctionCall -}}
+{{ else if eq .RoleName "tool" -}}
+The Function was executed and the response was:
+{{ end -}}
+{{ if .Content -}}
+{{.Content -}}
+{{ else if .FunctionCall -}}
+{{ range .FunctionCall }}
+[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
+{{ end }}
+{{ end -}}
+<|eot_id|>
 completion: |
 {{.Input}}
-function: |-
-<|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
-{{.Input -}}
-<|im_start|>assistant
+function: |
+<|start_header_id|>system<|end_header_id|>
+You are an expert in composing functions. You are given a question and a set of possible functions.
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
+If you decide to invoke any of the function(s), you MUST put it in the format as follows:
+[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
+You SHOULD NOT include any other text in the response.
+Here is a list of functions in JSON format that you can invoke.
+{{toJson .Functions}}
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+{{.Input}}
+<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+download_files:
+- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
+uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
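Since this profile wires the model for tool calling, a sketch of the kind of OpenAI-compatible request it is meant to serve (the get_current_weather function is a made-up example, not part of the config):

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "gpt-4",
  "messages": [{"role": "user", "content": "What is the weather like in Boston?"}],
  "tools": [{
    "type": "function",
    "function": {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
        "required": ["location"]
      }
    }
  }]
}'
```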
@@ -1,31 +1,49 @@
-backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: bakllava-mmproj.gguf
 parameters:
-model: bakllava.gguf
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: bakllava.gguf
-uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
-- filename: bakllava-mmproj.gguf
-uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
@@ -1,7 +1,7 @@
+embeddings: true
 name: text-embedding-ada-002
-backend: sentencetransformers
 parameters:
-model: all-MiniLM-L6-v2
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
@@ -1,101 +1,53 @@
-name: gpt-4
-mmap: true
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
-context_size: 8192
+context_size: 4096
+f16: true

-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true
+capture_llm_results:
+- (?s)<Thought>(.*?)</Thought>

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
+properties_order: name,arguments
+json_regex_match:
+- (?s)<Output>(.*?)</Output>

-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
 replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
+- key: (?s)<Thought>(.*?)</Thought>
 value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+mmap: true
+name: gpt-4
+parameters:
+model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>

 template:
 chat: |
 {{.Input -}}
 <|im_start|>assistant
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-{{- if .FunctionCall }}
-<tool_call>
-{{- else if eq .RoleName "tool" }}
-<tool_response>
-{{- end }}
-{{- if .Content}}
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
 {{.Content }}
-{{- end }}
-{{- if .FunctionCall}}
+{{ end -}}
+{{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
-{{- end }}
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
+{{ end -}}<|im_end|>
 completion: |
 {{.Input}}
-function: |-
+function: |
 <|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
+You are an AI assistant that executes function calls, and these are the tools at your disposal:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
+<|im_end|>
 {{.Input -}}
 <|im_start|>assistant

+download_files:
+- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
+uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
@@ -1,35 +1,49 @@
-backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-temperature: 0.2
-top_k: 40
-top_p: 0.95
-seed: -1
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-- filename: llava-v1.6-7b-mmproj-f16.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
@@ -1,7 +1,7 @@
+embeddings: true
 name: text-embedding-ada-002
-backend: sentencetransformers
 parameters:
-model: all-MiniLM-L6-v2
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
@ -1,103 +1,53 @@
|
||||||
name: gpt-4
|
context_size: 4096
|
||||||
mmap: false
|
+f16: true
-context_size: 8192

-f16: false
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true
+capture_llm_results:
+- (?s)<Thought>(.*?)</Thought>

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
+properties_order: name,arguments
+json_regex_match:
+- (?s)<Output>(.*?)</Output>

-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
 replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
+- key: (?s)<Thought>(.*?)</Thought>
 value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+mmap: true
+name: gpt-4
+parameters:
+model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>

 template:
 chat: |
 {{.Input -}}
 <|im_start|>assistant
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+<|im_start|>{{ .RoleName }}
-{{- if .FunctionCall }}
+{{ if .FunctionCall -}}
-<tool_call>
+Function call:
-{{- else if eq .RoleName "tool" }}
+{{ else if eq .RoleName "tool" -}}
-<tool_response>
+Function response:
-{{- end }}
+{{ end -}}
-{{- if .Content}}
+{{ if .Content -}}
 {{.Content }}
-{{- end }}
+{{ end -}}
-{{- if .FunctionCall}}
+{{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
-{{- end }}
+{{ end -}}<|im_end|>
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
 completion: |
 {{.Input}}
-function: |-
+function: |
 <|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
+You are an AI assistant that executes function calls, and these are the tools at your disposal:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
+<|im_end|>
 {{.Input -}}
 <|im_start|>assistant

+download_files:
+- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
+sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
+uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
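For reference, a request exercising the function-calling configuration above can be sent to the OpenAI-compatible chat endpoint. The sketch below is illustrative only: it assumes LocalAI is listening on localhost:8080, that the config is registered under the name gpt-4, and the get_weather tool is a made-up example rather than anything defined in this changeset.

```python
# Minimal sketch: call the function-calling model through the OpenAI-compatible
# chat endpoint. The "get_weather" tool is a hypothetical example.
import json
import requests

payload = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "What is the weather in Rome?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Return the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
}

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=120,
)
resp.raise_for_status()
# The reply should carry the tool call extracted via the json_regex_match /
# capture_llm_results rules configured above.
print(resp.json()["choices"][0]["message"])
```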
@@ -1,35 +1,50 @@
-backend: llama-cpp
 context_size: 4096
-mmap: false
-f16: false
+f16: true
+mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-temperature: 0.2
-top_k: 40
-top_p: 0.95
-seed: -1
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-- filename: llava-v1.6-7b-mmproj-f16.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
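The removed usage note above documented the raw curl call for the vision model; the same request can be issued from Python. A minimal sketch, assuming the OpenAI-compatible endpoint on localhost:8080 and the gpt-4o name used by this config:

```python
# Python equivalent of the removed curl usage example.
import requests

payload = {
    "model": "gpt-4o",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in the image?"},
            {"type": "image_url", "image_url": {
                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
            }},
        ],
    }],
    "temperature": 0.9,
}

resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```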
assets.go (15 changed lines)
@@ -1,6 +1,15 @@
 package main

-import "embed"
+import (
+rice "github.com/GeertJohan/go.rice"
+)

-//go:embed backend-assets/*
-var backendAssets embed.FS
+var backendAssets *rice.Box
+func init() {
+var err error
+backendAssets, err = rice.FindBox("backend-assets")
+if err != nil {
+panic(err)
+}
+}
@@ -14,6 +14,7 @@ service Backend {
 rpc PredictStream(PredictOptions) returns (stream Reply) {}
 rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
 rpc GenerateImage(GenerateImageRequest) returns (Result) {}
+rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
 rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
 rpc TTS(TTSRequest) returns (Result) {}
 rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -165,7 +166,6 @@ message Reply {

 message GrammarTrigger {
 string word = 1;
-bool at_start = 2;
 }

 message ModelOptions {
@@ -191,11 +191,7 @@ message ModelOptions {
 int32 NGQA = 20;
 string ModelFile = 21;

-// AutoGPTQ
-string Device = 22;
-bool UseTriton = 23;
-string ModelBaseName = 24;
-bool UseFastTokenizer = 25;

 // Diffusers
 string PipelineType = 26;
@@ -229,6 +225,11 @@ message ModelOptions {
 int32 MaxModelLen = 54;
 int32 TensorParallelSize = 55;
 string LoadFormat = 58;
+bool DisableLogStatus = 66;
+string DType = 67;
+int32 LimitImagePerPrompt = 68;
+int32 LimitVideoPerPrompt = 69;
+int32 LimitAudioPerPrompt = 70;

 string MMProj = 41;

@@ -301,6 +302,19 @@ message GenerateImageRequest {
 int32 CLIPSkip = 11;
 }

+message GenerateVideoRequest {
+string prompt = 1;
+string start_image = 2; // Path or base64 encoded image for the start frame
+string end_image = 3; // Path or base64 encoded image for the end frame
+int32 width = 4;
+int32 height = 5;
+int32 num_frames = 6; // Number of frames to generate
+int32 fps = 7; // Frames per second
+int32 seed = 8;
+float cfg_scale = 9; // Classifier-free guidance scale
+string dst = 10; // Output path for the generated video
+}

 message TTSRequest {
 string text = 1;
 string model = 2;
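For the new GenerateVideo RPC added above, a client call could look like the sketch below. It assumes Python stubs generated from backend.proto as backend_pb2 / backend_pb2_grpc (the naming the Python backends in this tree already use), the conventional BackendStub class name produced by grpcio-tools, and a backend listening on localhost:50051.

```python
# Sketch of calling the new GenerateVideo RPC over gRPC.
import grpc
import backend_pb2
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = backend_pb2_grpc.BackendStub(channel)

request = backend_pb2.GenerateVideoRequest(
    prompt="a boat crossing a lake at sunset",
    width=512,
    height=512,
    num_frames=16,       # Number of frames to generate
    fps=8,               # Frames per second
    seed=42,
    cfg_scale=6.0,       # Classifier-free guidance scale
    dst="/tmp/out.mp4",  # Output path for the generated video
)

result = stub.GenerateVideo(request)
print(result.success, result.message)
```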
@@ -1,17 +1,17 @@

 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
-set(TARGET myclip)
+# set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
-install(TARGETS ${TARGET} LIBRARY)
+# install(TARGETS ${TARGET} LIBRARY)
-target_include_directories(myclip PUBLIC .)
+# target_include_directories(myclip PUBLIC .)
-target_include_directories(myclip PUBLIC ../..)
+# target_include_directories(myclip PUBLIC ../..)
-target_include_directories(myclip PUBLIC ../../common)
+# target_include_directories(myclip PUBLIC ../../common)
-target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
+# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
+# if (NOT MSVC)
-target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-endif()
+# endif()
 # END CLIP hack

|
@ -74,8 +74,12 @@ add_library(hw_grpc_proto
|
||||||
${hw_proto_srcs}
|
${hw_proto_srcs}
|
||||||
${hw_proto_hdrs} )
|
${hw_proto_hdrs} )
|
||||||
|
|
||||||
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
|
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
|
||||||
|
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||||
|
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||||
|
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||||
absl::flags_parse
|
absl::flags_parse
|
||||||
gRPC::${_REFLECTION}
|
gRPC::${_REFLECTION}
|
||||||
gRPC::${_GRPC_GRPCPP}
|
gRPC::${_GRPC_GRPCPP}
|
||||||
|
|
|
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DCMAKE_CXX_FLAGS="-fsycl" \
+-DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DCMAKE_CXX_FLAGS="-fsycl"
 endif

 llama.cpp:
@@ -52,8 +59,8 @@ llama.cpp:
 	git checkout -b build $(LLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-llama.cpp/examples/grpc-server: llama.cpp
-	mkdir -p llama.cpp/examples/grpc-server
+llama.cpp/tools/grpc-server: llama.cpp
+	mkdir -p llama.cpp/tools/grpc-server
 	bash prepare.sh

 rebuild:
@@ -63,13 +70,13 @@ rebuild:

 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/examples/grpc-server
+	rm -rf llama.cpp/tools/grpc-server
 	rm -rf grpc-server

 clean: purge
 	rm -rf llama.cpp

-grpc-server: llama.cpp llama.cpp/examples/grpc-server
+grpc-server: llama.cpp llama.cpp/tools/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
 	cp llama.cpp/build/bin/grpc-server .
File diff suppressed because it is too large.
backend/cpp/llama/json.hpp (vendored, 24596 changed lines)
File diff suppressed because it is too large.
@@ -1,7 +1,7 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
 index 3cd0d2fa..6c5e811a 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
+--- a/tools/mtmd/clip.cpp
++++ b/tools/mtmd/clip.cpp
 @@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
 int* patches_data = (int*)malloc(ggml_nbytes(patches));
@@ -7,21 +7,46 @@ for patch in $(ls patches); do
 patch -d llama.cpp/ -p1 < patches/$patch
 done

-cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
-cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
-cp -rfv json.hpp llama.cpp/examples/grpc-server/
-cp -rfv utils.hpp llama.cpp/examples/grpc-server/
-if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
+set -e
+
+cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
+
+set +e
+if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
 echo "grpc-server already added"
 else
-echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
 fi
+set -e

-## XXX: In some versions of CMake clip wasn't being built before llama.
-## This is an hack for now, but it should be fixed in the future.
-cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
+# and remove the main function
+# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
+awk '
+/int[ \t]+main[ \t]*\(/ { # If the line starts the main function
+  in_main=1; # Set a flag
+  open_braces=0; # Track number of open braces
+}
+in_main {
+  open_braces += gsub(/\{/, "{"); # Count opening braces
+  open_braces -= gsub(/\}/, "}"); # Count closing braces
+  if (open_braces == 0) { # If all braces are closed
+    in_main=0; # End skipping
+  }
+  next; # Skip lines inside main
+}
+!in_main # Print lines not inside main
+' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
+
+# remove index.html.gz.hpp and loading.html.hpp includes
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  # macOS
+  sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+else
+  # Linux and others
+  sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+fi
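The awk filter added above removes the upstream main() by counting braces. For readers less comfortable with awk, the same idea is sketched below in Python; this is an illustrative re-implementation, not part of the changeset, and it assumes main()'s braces are balanced and not hidden inside strings or comments.

```python
# Illustrative re-implementation of the awk filter: copy a C++ source file
# while skipping the body of main(), by counting braces.
import re
import sys

def strip_main(lines):
    in_main = False
    open_braces = 0
    for line in lines:
        if not in_main and re.search(r"int[ \t]+main[ \t]*\(", line):
            in_main = True
            open_braces = 0
        if in_main:
            open_braces += line.count("{")
            open_braces -= line.count("}")
            if open_braces == 0 and ("{" in line or "}" in line):
                in_main = False
            continue  # skip lines belonging to main()
        yield line

if __name__ == "__main__":
    src, dst = sys.argv[1], sys.argv[2]
    with open(src) as f, open(dst, "w") as out:
        out.writelines(strip_main(f))
```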
backend/cpp/llama/utils.hpp (vendored, 483 changed lines)
@ -1,483 +0,0 @@
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <set>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
|
||||||
|
|
||||||
using json = nlohmann::json;
|
|
||||||
|
|
||||||
extern bool server_verbose;
|
|
||||||
|
|
||||||
#ifndef SERVER_VERBOSE
|
|
||||||
#define SERVER_VERBOSE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
#define LOG_VERBOSE(MSG, ...)
|
|
||||||
#else
|
|
||||||
#define LOG_VERBOSE(MSG, ...) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
if (server_verbose) \
|
|
||||||
{ \
|
|
||||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
|
|
||||||
//
|
|
||||||
// parallel
|
|
||||||
//
|
|
||||||
|
|
||||||
enum server_state {
|
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
|
||||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
|
||||||
};
|
|
||||||
|
|
||||||
enum task_type {
|
|
||||||
TASK_TYPE_COMPLETION,
|
|
||||||
TASK_TYPE_CANCEL,
|
|
||||||
TASK_TYPE_NEXT_RESPONSE
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_server {
|
|
||||||
int id = -1; // to be filled by llama_server_queue
|
|
||||||
int target_id;
|
|
||||||
task_type type;
|
|
||||||
json data;
|
|
||||||
bool infill_mode = false;
|
|
||||||
bool embedding_mode = false;
|
|
||||||
int multitask_id = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_result {
|
|
||||||
int id;
|
|
||||||
int multitask_id = -1;
|
|
||||||
bool stop;
|
|
||||||
bool error;
|
|
||||||
json result_json;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_multi {
|
|
||||||
int id;
|
|
||||||
std::set<int> subtasks_remaining{};
|
|
||||||
std::vector<task_result> results{};
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: can become bool if we can't find use of more states
|
|
||||||
enum slot_state
|
|
||||||
{
|
|
||||||
IDLE,
|
|
||||||
PROCESSING,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum slot_command
|
|
||||||
{
|
|
||||||
NONE,
|
|
||||||
LOAD_PROMPT,
|
|
||||||
RELEASE,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_params
|
|
||||||
{
|
|
||||||
bool stream = true;
|
|
||||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
|
||||||
|
|
||||||
uint32_t seed = -1; // RNG seed
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
|
||||||
|
|
||||||
json input_prefix;
|
|
||||||
json input_suffix;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_image
|
|
||||||
{
|
|
||||||
int32_t id;
|
|
||||||
|
|
||||||
bool request_encode_image = false;
|
|
||||||
float * image_embedding = nullptr;
|
|
||||||
int32_t image_tokens = 0;
|
|
||||||
|
|
||||||
clip_image_u8 * img_data;
|
|
||||||
|
|
||||||
std::string prefix_prompt; // before of this image
|
|
||||||
};
|
|
||||||
|
|
||||||
// completion token output with probabilities
|
|
||||||
struct completion_token_output
|
|
||||||
{
|
|
||||||
struct token_prob
|
|
||||||
{
|
|
||||||
llama_token tok;
|
|
||||||
float prob;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<token_prob> probs;
|
|
||||||
llama_token tok;
|
|
||||||
std::string text_to_send;
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline void server_log(const char *level, const char *function, int line,
|
|
||||||
const char *message, const nlohmann::ordered_json &extra)
|
|
||||||
{
|
|
||||||
nlohmann::ordered_json log
|
|
||||||
{
|
|
||||||
{"timestamp", time(nullptr)},
|
|
||||||
{"level", level},
|
|
||||||
{"function", function},
|
|
||||||
{"line", line},
|
|
||||||
{"message", message},
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!extra.empty())
|
|
||||||
{
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
|
||||||
printf("%.*s\n", (int)str.size(), str.data());
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// server utils
|
|
||||||
//
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
|
||||||
{
|
|
||||||
// Fallback null to default value
|
|
||||||
return body.contains(key) && !body.at(key).is_null()
|
|
||||||
? body.value(key, default_value)
|
|
||||||
: default_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::string format_chatml(std::vector<json> messages)
|
|
||||||
{
|
|
||||||
std::ostringstream chatml_msgs;
|
|
||||||
|
|
||||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
|
||||||
chatml_msgs << "<|im_start|>"
|
|
||||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
|
||||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
|
||||||
<< "<|im_end|>\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
|
||||||
|
|
||||||
return chatml_msgs.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// work queue utils
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_server_queue {
|
|
||||||
int id = 0;
|
|
||||||
std::mutex mutex_tasks;
|
|
||||||
// queues
|
|
||||||
std::vector<task_server> queue_tasks;
|
|
||||||
std::vector<task_server> queue_tasks_deferred;
|
|
||||||
std::vector<task_multi> queue_multitasks;
|
|
||||||
std::condition_variable condition_tasks;
|
|
||||||
// callback functions
|
|
||||||
std::function<void(task_server&)> callback_new_task;
|
|
||||||
std::function<void(task_multi&)> callback_finish_multitask;
|
|
||||||
std::function<void(void)> callback_all_task_finished;
|
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
|
||||||
int post(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (task.id == -1) {
|
|
||||||
task.id = id++;
|
|
||||||
}
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
condition_tasks.notify_one();
|
|
||||||
return task.id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a new task, but defer until one slot is available
|
|
||||||
void defer(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
queue_tasks_deferred.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the next id for creating anew task
|
|
||||||
int get_new_id() {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
return id++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a new task
|
|
||||||
void on_new_task(std::function<void(task_server&)> callback) {
|
|
||||||
callback_new_task = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a multitask
|
|
||||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
|
||||||
callback_finish_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to be called when the batch of tasks is finished
|
|
||||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
|
||||||
callback_all_task_finished = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call when the state of one slot is changed
|
|
||||||
void notify_slot_changed() {
|
|
||||||
// move deferred tasks back to main loop
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto & task : queue_tasks_deferred) {
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
queue_tasks_deferred.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start the main loop. This call is blocking
|
|
||||||
[[noreturn]]
|
|
||||||
void start_loop() {
|
|
||||||
while (true) {
|
|
||||||
// new task arrived
|
|
||||||
LOG_VERBOSE("have new task", {});
|
|
||||||
{
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
lock.unlock();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
task_server task = queue_tasks.front();
|
|
||||||
queue_tasks.erase(queue_tasks.begin());
|
|
||||||
lock.unlock();
|
|
||||||
LOG_VERBOSE("callback_new_task", {});
|
|
||||||
callback_new_task(task);
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("callback_all_task_finished", {});
|
|
||||||
// process and update all the multitasks
|
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
|
||||||
while (queue_iterator != queue_multitasks.end())
|
|
||||||
{
|
|
||||||
if (queue_iterator->subtasks_remaining.empty())
|
|
||||||
{
|
|
||||||
// all subtasks done == multitask is done
|
|
||||||
task_multi current_multitask = *queue_iterator;
|
|
||||||
callback_finish_multitask(current_multitask);
|
|
||||||
// remove this multitask
|
|
||||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++queue_iterator;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// all tasks in the current loop is finished
|
|
||||||
callback_all_task_finished();
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("wait for new task", {});
|
|
||||||
// wait for new task
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
condition_tasks.wait(lock, [&]{
|
|
||||||
return !queue_tasks.empty();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// functions to manage multitasks
|
|
||||||
//
|
|
||||||
|
|
||||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
|
||||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
task_multi multi;
|
|
||||||
multi.id = multitask_id;
|
|
||||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
|
||||||
queue_multitasks.push_back(multi);
|
|
||||||
}
|
|
||||||
|
|
||||||
// updatethe remaining subtasks, while appending results to multitask
|
|
||||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto& multitask : queue_multitasks)
|
|
||||||
{
|
|
||||||
if (multitask.id == multitask_id)
|
|
||||||
{
|
|
||||||
multitask.subtasks_remaining.erase(subtask_id);
|
|
||||||
multitask.results.push_back(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_server_response {
|
|
||||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
|
||||||
callback_multitask_t callback_update_multitask;
|
|
||||||
// for keeping track of all tasks waiting for the result
|
|
||||||
std::set<int> waiting_task_ids;
|
|
||||||
// the main result queue
|
|
||||||
std::vector<task_result> queue_results;
|
|
||||||
std::mutex mutex_results;
|
|
||||||
std::condition_variable condition_results;
|
|
||||||
|
|
||||||
void add_waiting_task_id(int task_id) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.insert(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_waiting_task_id(int task_id) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.erase(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function blocks the thread until there is a response for this task_id
|
|
||||||
task_result recv(int task_id) {
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
condition_results.wait(lock, [&]{
|
|
||||||
return !queue_results.empty();
|
|
||||||
});
|
|
||||||
LOG_VERBOSE("condition_results unblock", {});
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
|
||||||
{
|
|
||||||
if (queue_results[i].id == task_id)
|
|
||||||
{
|
|
||||||
assert(queue_results[i].multitask_id == -1);
|
|
||||||
task_result res = queue_results[i];
|
|
||||||
queue_results.erase(queue_results.begin() + i);
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// should never reach here
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to update multitask
|
|
||||||
void on_multitask_update(callback_multitask_t callback) {
|
|
||||||
callback_update_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Send a new result to a waiting task_id
|
|
||||||
void send(task_result result) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
LOG_VERBOSE("send new result", {});
|
|
||||||
for (auto& task_id : waiting_task_ids) {
|
|
||||||
// LOG_TEE("waiting task id %i \n", task_id);
|
|
||||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
|
||||||
if (result.multitask_id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("callback_update_multitask", {});
|
|
||||||
callback_update_multitask(task_id, result.id, result);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("queue_results.push_back", {});
|
|
||||||
queue_results.push_back(result);
|
|
||||||
condition_results.notify_one();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
|
||||||
// base64 utils (TODO: move to common in the future)
|
|
||||||
//
|
|
||||||
|
|
||||||
static const std::string base64_chars =
|
|
||||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
||||||
"abcdefghijklmnopqrstuvwxyz"
|
|
||||||
"0123456789+/";
|
|
||||||
|
|
||||||
static inline bool is_base64(uint8_t c)
|
|
||||||
{
|
|
||||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
|
||||||
{
|
|
||||||
int i = 0;
|
|
||||||
int j = 0;
|
|
||||||
int in_ = 0;
|
|
||||||
|
|
||||||
int in_len = encoded_string.size();
|
|
||||||
|
|
||||||
uint8_t char_array_4[4];
|
|
||||||
uint8_t char_array_3[3];
|
|
||||||
|
|
||||||
std::vector<uint8_t> ret;
|
|
||||||
|
|
||||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
|
||||||
{
|
|
||||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
|
||||||
if (i == 4)
|
|
||||||
{
|
|
||||||
for (i = 0; i <4; i++)
|
|
||||||
{
|
|
||||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (i = 0; (i < 3); i++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[i]);
|
|
||||||
}
|
|
||||||
i = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i)
|
|
||||||
{
|
|
||||||
for (j = i; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (j = 0; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (j = 0; (j < i - 1); j++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
|
@@ -48,7 +48,7 @@ int tts(char *text,int threads, char *dst ) {

 // generate audio
 if (!bark_generate_audio(c, text, threads)) {
-fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
+fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
 return 1;
 }

@@ -8,12 +8,19 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

+GOCMD?=go
+CGO_LDFLAGS?=
+# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
+CGO_LDFLAGS_SYCL=
+GO_TAGS?=
+LD_FLAGS?=

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-CMAKE_ARGS+=-DGGML_CUDA=ON
+CMAKE_ARGS+=-DSD_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
@@ -21,31 +28,50 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-CMAKE_ARGS+=-DGGML_HIP=ON
+CMAKE_ARGS+=-DSD_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 ifneq ($(BUILD_TYPE),metal)
-CMAKE_ARGS+=-DGGML_METAL=OFF
+CMAKE_ARGS+=-DSD_METAL=OFF
 else
-CMAKE_ARGS+=-DGGML_METAL=ON
+CMAKE_ARGS+=-DSD_METAL=ON
 CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 TARGET+=--target ggml-metal
 endif
 endif

-# ifeq ($(BUILD_TYPE),sycl_f16)
-# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f16)
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DSD_SYCL=ON \
+-DGGML_SYCL_F16=ON
+CC=icx
+CXX=icpx
+CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

-# ifeq ($(BUILD_TYPE),sycl_f32)
-# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f32)
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DSD_SYCL=ON
+CC=icx
+CXX=icpx
+CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

 # warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -86,11 +112,24 @@ endif
 	$(MAKE) $(COMBINED_LIB)

 gosd.o:
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	+bash -c "source $(ONEAPI_VARS); \
+	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
+else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
+endif

 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

+stablediffusion-ggml:
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
+	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
+ifneq ($(UPX),)
+	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
+endif

 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
@@ -35,6 +35,8 @@ const char* sample_method_str[] = {
 "ipndm",
 "ipndm_v",
 "lcm",
+"ddim_trailing",
+"tcd",
 };

 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -173,6 +175,7 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
 -1, //clip_skip
 cfg_scale, // sfg_scale
 3.5f,
+0, // eta
 width,
 height,
 sample_method,
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
 context.SetTranslate(true)
 }

-if err := context.Process(data, nil, nil); err != nil {
+if err := context.Process(data, nil, nil, nil); err != nil {
 return pb.TranscriptResult{}, err
 }

@@ -1,17 +0,0 @@
-.PHONY: autogptq
-autogptq: protogen
-	bash install.sh
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	rm -rf venv __pycache__
@@ -1,5 +0,0 @@
-# Creating a separate environment for the autogptq project
-
-```
-make autogptq
-```
@ -1,153 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
from concurrent import futures
|
|
||||||
import argparse
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import base64
|
|
||||||
|
|
||||||
import grpc
|
|
||||||
import backend_pb2
|
|
||||||
import backend_pb2_grpc
|
|
||||||
|
|
||||||
from auto_gptq import AutoGPTQForCausalLM
|
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
||||||
from transformers import TextGenerationPipeline
|
|
||||||
|
|
||||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
|
||||||
|
|
||||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
|
||||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
|
||||||
|
|
||||||
# Implement the BackendServicer class with the service methods
|
|
||||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|
||||||
def Health(self, request, context):
|
|
||||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
|
||||||
def LoadModel(self, request, context):
|
|
||||||
try:
|
|
||||||
device = "cuda:0"
|
|
||||||
if request.Device != "":
|
|
||||||
device = request.Device
|
|
||||||
|
|
||||||
# support loading local model files
|
|
||||||
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
|
|
||||||
|
|
||||||
# support model `Qwen/Qwen-VL-Chat-Int4`
|
|
||||||
if "qwen-vl" in request.Model.lower():
|
|
||||||
self.model_name = "Qwen-VL-Chat"
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
|
||||||
trust_remote_code=request.TrustRemoteCode,
|
|
||||||
device_map="auto").eval()
|
|
||||||
else:
|
|
||||||
model = AutoGPTQForCausalLM.from_quantized(model_path,
|
|
||||||
model_basename=request.ModelBaseName,
|
|
||||||
use_safetensors=True,
|
|
||||||
trust_remote_code=request.TrustRemoteCode,
|
|
||||||
device=device,
|
|
||||||
use_triton=request.UseTriton,
|
|
||||||
quantize_config=None)
|
|
||||||
|
|
||||||
self.model = model
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
except Exception as err:
|
|
||||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
|
||||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
|
||||||
|
|
||||||
def Predict(self, request, context):
|
|
||||||
penalty = 1.0
|
|
||||||
if request.Penalty != 0.0:
|
|
||||||
penalty = request.Penalty
|
|
||||||
tokens = 512
|
|
||||||
if request.Tokens != 0:
|
|
||||||
tokens = request.Tokens
|
|
||||||
top_p = 0.95
|
|
||||||
if request.TopP != 0.0:
|
|
||||||
top_p = request.TopP
|
|
||||||
|
|
||||||
|
|
||||||
prompt_images = self.recompile_vl_prompt(request)
|
|
||||||
compiled_prompt = prompt_images[0]
|
|
||||||
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
|
|
||||||
|
|
||||||
# Implement Predict RPC
|
|
||||||
pipeline = TextGenerationPipeline(
|
|
||||||
model=self.model,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
max_new_tokens=tokens,
|
|
||||||
temperature=request.Temperature,
|
|
||||||
top_p=top_p,
|
|
||||||
repetition_penalty=penalty,
|
|
||||||
)
|
|
||||||
t = pipeline(compiled_prompt)[0]["generated_text"]
|
|
||||||
print(f"generated_text: {t}", file=sys.stderr)
|
|
||||||
|
|
||||||
if compiled_prompt in t:
|
|
||||||
t = t.replace(compiled_prompt, "")
|
|
||||||
# house keeping. Remove the image files from /tmp folder
|
|
||||||
for img_path in prompt_images[1]:
|
|
||||||
try:
|
|
||||||
os.remove(img_path)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
|
||||||
|
|
||||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
|
||||||
|
|
||||||
def PredictStream(self, request, context):
|
|
||||||
# Implement PredictStream RPC
|
|
||||||
#for reply in some_data_generator():
|
|
||||||
# yield reply
|
|
||||||
# Not implemented yet
|
|
||||||
return self.Predict(request, context)
|
|
||||||
|
|
||||||
def recompile_vl_prompt(self, request):
|
|
||||||
prompt = request.Prompt
|
|
||||||
image_paths = []
|
|
||||||
|
|
||||||
if "qwen-vl" in self.model_name.lower():
|
|
||||||
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
|
|
||||||
# Then, save the image file paths to an array "image_paths".
|
|
||||||
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
|
|
||||||
for i, img in enumerate(request.Images):
|
|
||||||
timestamp = str(int(time.time() * 1000)) # Generate timestamp
|
|
||||||
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
|
|
||||||
with open(img_path, "wb") as f:
|
|
||||||
f.write(base64.b64decode(img))
|
|
||||||
image_paths.append(img_path)
|
|
||||||
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
|
|
||||||
else:
|
|
||||||
prompt = request.Prompt
|
|
||||||
return (prompt, image_paths)
|
|
||||||
|
|
||||||
def serve(address):
|
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
|
||||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
|
||||||
server.add_insecure_port(address)
|
|
||||||
server.start()
|
|
||||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
|
||||||
|
|
||||||
# Define the signal handler function
|
|
||||||
def signal_handler(sig, frame):
|
|
||||||
print("Received termination signal. Shutting down...")
|
|
||||||
server.stop(0)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Set the signal handlers for SIGINT and SIGTERM
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
signal.signal(signal.SIGTERM, signal_handler)
|
|
||||||
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
server.stop(0)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
serve(args.addr)
|
|
|
@ -1,14 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
|
|
||||||
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
|
|
||||||
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
|
|
||||||
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
|
|
||||||
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
|
||||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
|
||||||
fi
|
|
||||||
|
|
||||||
installRequirements
|
|
|
@ -1,2 +0,0 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
|
||||||
torch==2.4.1+cu118
|
|
|
@ -1 +0,0 @@
|
||||||
torch==2.4.1
|
|
|
@ -1,2 +0,0 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
|
||||||
torch==2.4.1+rocm6.0
|
|
|
@ -1,6 +0,0 @@
|
||||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
|
||||||
intel-extension-for-pytorch==2.3.110+xpu
|
|
||||||
torch==2.3.1+cxx11.abi
|
|
||||||
oneccl_bind_pt==2.3.100+xpu
|
|
||||||
optimum[openvino]
|
|
||||||
setuptools
|
|
|
@ -1,6 +0,0 @@
|
||||||
accelerate
|
|
||||||
auto-gptq==0.7.1
|
|
||||||
grpcio==1.70.0
|
|
||||||
protobuf
|
|
||||||
certifi
|
|
||||||
transformers
|
|
|
@ -1,4 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
startBackend $@
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
runUnittests
|
|
|
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
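The server side above raises the gRPC message caps to 50 MB; by default gRPC clients cap received messages at 4 MB, so callers that push large payloads (for example base64-encoded images) may want matching channel options. A minimal sketch, assuming the backend_pb2_grpc stubs used throughout this tree and a backend on localhost:50051:

```python
# Sketch: open a client channel sized to match the 50MB server-side limits.
import grpc
import backend_pb2_grpc

MAX_MSG = 50 * 1024 * 1024  # 50MB, mirroring the server options above

channel = grpc.insecure_channel(
    "localhost:50051",
    options=[
        ("grpc.max_send_message_length", MAX_MSG),
        ("grpc.max_receive_message_length", MAX_MSG),
    ],
)
stub = backend_pb2_grpc.BackendStub(channel)
```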
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 grpcio-tools
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -1,4 +1,4 @@
-transformers
+transformers==4.48.3
 accelerate
 torch==2.4.1
 coqui-tts
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 torchaudio==2.4.1+cu118
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,5 +1,5 @@
 torch==2.4.1
 torchaudio==2.4.1
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 torchaudio==2.4.1+rocm6.0
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 packaging==24.1
@@ -19,7 +19,7 @@ import grpc

 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
     EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType

@@ -168,9 +168,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # We are storing all the options in a dict so we can use it later when
             # generating the images
             for opt in options:
+                if ":" not in opt:
+                    continue
                 key, value = opt.split(":")
                 self.options[key] = value

+            print(f"Options: {self.options}", file=sys.stderr)
+
             local = False
             modelFile = request.Model
@@ -287,6 +291,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

             if request.LowVRAM:
                 self.pipe.enable_model_cpu_offload()
+        elif request.PipelineType == "Lumina2Text2ImgPipeline":
+            self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
+                request.Model,
+                torch_dtype=torch.bfloat16)
+            if request.LowVRAM:
+                self.pipe.enable_model_cpu_offload()
         elif request.PipelineType == "SanaPipeline":
             self.pipe = SanaPipeline.from_pretrained(
                 request.Model,
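Note: Lumina2Text2ImgPipeline is wired in above with the same from_pretrained/LowVRAM pattern as the other diffusers pipelines. A hedged usage sketch follows; the checkpoint id and prompt are illustrative assumptions, and it presumes a diffusers release that ships this pipeline class plus a GPU for the CPU-offload path.

import torch
from diffusers import Lumina2Text2ImgPipeline

# Hypothetical checkpoint id; substitute whatever Lumina 2 model you actually use.
pipe = Lumina2Text2ImgPipeline.from_pretrained(
    "your-org/lumina-image-2.0", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # same low-VRAM path the backend enables above

image = pipe(prompt="a watercolor lighthouse at dawn").images[0]
image.save("lumina2_sample.png")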
@@ -516,7 +526,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.70.0
+grpcio==1.72.0
 pillow
 protobuf
 certifi

@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 wheel

@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.TranscriptResult(segments=resultSegments, text=text)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 grpcio-tools

@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 phonemizer
 scipy

@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.RerankResult(usage=usage, results=results)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi

@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 setuptools
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }

         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
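Note: the limit_mm_per_prompt block above only takes effect when at least one limit is set, and each unset modality falls back to 1, which the in-diff comment attributes to the vLLM default. A tiny illustrative sketch of just that defaulting rule (the values are made up):

# Illustrative only: unset (0) per-modality limits fall back to 1.
def mm_limits(image=0, video=0, audio=0):
    return {
        "image": max(image, 1),
        "video": max(video, 1),
        "audio": max(audio, 1),
    }

print(mm_limits(image=4))  # {'image': 4, 'video': 1, 'audio': 1}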
@@ -183,27 +194,40 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             await iterations.aclose()

     async def _predict(self, request, context, streaming=False):
+        # Build the sampling parameters
+        # NOTE: this must stay in sync with the vllm backend
+        request_to_sampling_params = {
+            "N": "n",
+            "PresencePenalty": "presence_penalty",
+            "FrequencyPenalty": "frequency_penalty",
+            "RepetitionPenalty": "repetition_penalty",
+            "Temperature": "temperature",
+            "TopP": "top_p",
+            "TopK": "top_k",
+            "MinP": "min_p",
+            "Seed": "seed",
+            "StopPrompts": "stop",
+            "StopTokenIds": "stop_token_ids",
+            "BadWords": "bad_words",
+            "IncludeStopStrInOutput": "include_stop_str_in_output",
+            "IgnoreEOS": "ignore_eos",
+            "Tokens": "max_tokens",
+            "MinTokens": "min_tokens",
+            "Logprobs": "logprobs",
+            "PromptLogprobs": "prompt_logprobs",
+            "SkipSpecialTokens": "skip_special_tokens",
+            "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
+            "TruncatePromptTokens": "truncate_prompt_tokens",
+            "GuidedDecoding": "guided_decoding",
+        }
+
-        # Build sampling parameters
         sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
-        if request.TopP != 0:
-            sampling_params.top_p = request.TopP
-        if request.Tokens > 0:
-            sampling_params.max_tokens = request.Tokens
-        if request.Temperature != 0:
-            sampling_params.temperature = request.Temperature
-        if request.TopK != 0:
-            sampling_params.top_k = request.TopK
-        if request.PresencePenalty != 0:
-            sampling_params.presence_penalty = request.PresencePenalty
-        if request.FrequencyPenalty != 0:
-            sampling_params.frequency_penalty = request.FrequencyPenalty
-        if request.StopPrompts:
-            sampling_params.stop = request.StopPrompts
-        if request.IgnoreEOS:
-            sampling_params.ignore_eos = request.IgnoreEOS
-        if request.Seed != 0:
-            sampling_params.seed = request.Seed
+        for request_field, param_field in request_to_sampling_params.items():
+            if hasattr(request, request_field):
+                value = getattr(request, request_field)
+                if value not in (None, 0, [], False):
+                    setattr(sampling_params, param_field, value)

         # Extract image paths and process images
         prompt = request.Prompt
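Note: the refactor above swaps a long chain of per-field if statements for a declarative field map applied with getattr/setattr, skipping zero-valued (unset) protobuf fields. A self-contained sketch of that pattern, with SimpleNamespace standing in for both the protobuf request and vLLM's SamplingParams (neither is constructed here):

# Sketch of the generic request -> sampling-params mapping used above.
from types import SimpleNamespace

request_to_sampling_params = {
    "TopP": "top_p",
    "TopK": "top_k",
    "Temperature": "temperature",
    "Tokens": "max_tokens",
}

request = SimpleNamespace(TopP=0.8, TopK=0, Temperature=0.7, Tokens=50)
sampling_params = SimpleNamespace(top_p=0.9, max_tokens=200)

for request_field, param_field in request_to_sampling_params.items():
    if hasattr(request, request_field):
        value = getattr(request, request_field)
        if value not in (None, 0, [], False):  # unset proto fields arrive as zero values
            setattr(sampling_params, param_field, value)

# top_p and max_tokens are overridden, temperature is added, TopK=0 is ignored.
print(sampling_params)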
@@ -269,7 +293,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_image(self, image_path: str):
         """
         Load an image from the given file path or base64 encoded data.

         Args:
             image_path (str): The path to the image file or base64 encoded data.

@@ -288,7 +312,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_video(self, video_path: str):
         """
         Load a video from the given file path.

         Args:
             video_path (str): The path to the image file.

@@ -309,7 +333,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address

@@ -335,4 +364,4 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

     asyncio.run(serve(args.addr))

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 setuptools
@@ -75,6 +75,53 @@ class TestBackendServicer(unittest.TestCase):
         finally:
             self.tearDown()

+    def test_sampling_params(self):
+        """
+        This method tests if all sampling parameters are correctly processed
+        NOTE: this does NOT test for correctness, just that we received a compatible response
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+
+                req = backend_pb2.PredictOptions(
+                    Prompt="The capital of France is",
+                    TopP=0.8,
+                    Tokens=50,
+                    Temperature=0.7,
+                    TopK=40,
+                    PresencePenalty=0.1,
+                    FrequencyPenalty=0.2,
+                    RepetitionPenalty=1.1,
+                    MinP=0.05,
+                    Seed=42,
+                    StopPrompts=["\n"],
+                    StopTokenIds=[50256],
+                    BadWords=["badword"],
+                    IncludeStopStrInOutput=True,
+                    IgnoreEOS=True,
+                    MinTokens=5,
+                    Logprobs=5,
+                    PromptLogprobs=5,
+                    SkipSpecialTokens=True,
+                    SpacesBetweenSpecialTokens=True,
+                    TruncatePromptTokens=10,
+                    GuidedDecoding=True,
+                    N=2,
+                )
+                resp = stub.Predict(req)
+                self.assertIsNotNone(resp.message)
+                self.assertIsNotNone(resp.logprobs)
+        except Exception as err:
+            print(err)
+            self.fail("sampling params service failed")
+        finally:
+            self.tearDown()
+
     def test_embedding(self):
         """
         This method tests if the embeddings are generated successfully
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
     return &Application{
         backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-        modelLoader:        model.NewModelLoader(appConfig.ModelPath),
+        modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
         applicationConfig:  appConfig,
         templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
     }

@@ -43,18 +43,12 @@ func New(opts ...config.AppOption) (*Application, error) {
     if err != nil {
         return nil, fmt.Errorf("unable to create ModelPath: %q", err)
     }
-    if options.ImageDir != "" {
-        err := os.MkdirAll(options.ImageDir, 0750)
+    if options.GeneratedContentDir != "" {
+        err := os.MkdirAll(options.GeneratedContentDir, 0750)
         if err != nil {
             return nil, fmt.Errorf("unable to create ImageDir: %q", err)
         }
     }
-    if options.AudioDir != "" {
-        err := os.MkdirAll(options.AudioDir, 0750)
-        if err != nil {
-            return nil, fmt.Errorf("unable to create AudioDir: %q", err)
-        }
-    }
     if options.UploadDir != "" {
         err := os.MkdirAll(options.UploadDir, 0750)
         if err != nil {

@@ -143,7 +137,7 @@ func New(opts ...config.AppOption) (*Application, error) {
         }()
     }

-    if options.LoadToMemory != nil {
+    if options.LoadToMemory != nil && !options.SingleBackend {
         for _, m := range options.LoadToMemory {
             cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
             if err != nil {
@@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     var fn func() ([]float32, error)
     switch model := inferenceModel.(type) {

@@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     fn := func() error {
         _, err := inferenceModel.GenerateImage(

@@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     var protoMessages []*proto.Message
     // if we are using the tokenizer template, we need to convert the messages to proto messages

@@ -116,6 +117,11 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
         }

         if tokenCallback != nil {
+
+            if c.TemplateConfig.ReplyPrefix != "" {
+                tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
+            }
+
             ss := ""

             var partialRune []byte

@@ -165,8 +171,13 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
         tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
         tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

+        response := string(reply.Message)
+        if c.TemplateConfig.ReplyPrefix != "" {
+            response = c.TemplateConfig.ReplyPrefix + response
+        }
+
         return LLMResponse{
-            Response: string(reply.Message),
+            Response: response,
             Usage:    tokenUsage,
         }, err
     }
@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
     grpcOpts := grpcModelOpts(c)
     defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

-    if so.SingleBackend {
-        defOpts = append(defOpts, model.WithSingleActiveBackend())
-    }
-
     if so.ParallelBackendRequests {
         defOpts = append(defOpts, model.EnableParallelRequests)
     }

@@ -103,7 +99,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
         mmap = *c.MMap
     }

-    ctxSize := 1024
+    ctxSize := 4096
     if c.ContextSize != nil {
         ctxSize = *c.ContextSize
     }

@@ -121,8 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
     triggers := make([]*pb.GrammarTrigger, 0)
     for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
         triggers = append(triggers, &pb.GrammarTrigger{
             Word:    t.Word,
-            AtStart: t.AtStart,
         })

     }
@@ -159,35 +154,36 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
         SwapSpace:          int32(c.SwapSpace),
         MaxModelLen:        int32(c.MaxModelLen),
         TensorParallelSize: int32(c.TensorParallelSize),
-        MMProj:             c.MMProj,
-        FlashAttention:     c.FlashAttention,
-        CacheTypeKey:       c.CacheTypeK,
-        CacheTypeValue:     c.CacheTypeV,
-        NoKVOffload:        c.NoKVOffloading,
-        YarnExtFactor:      c.YarnExtFactor,
-        YarnAttnFactor:     c.YarnAttnFactor,
-        YarnBetaFast:       c.YarnBetaFast,
-        YarnBetaSlow:       c.YarnBetaSlow,
-        NGQA:               c.NGQA,
-        RMSNormEps:         c.RMSNormEps,
-        MLock:              mmlock,
-        RopeFreqBase:       c.RopeFreqBase,
-        RopeScaling:        c.RopeScaling,
-        Type:               c.ModelType,
-        RopeFreqScale:      c.RopeFreqScale,
-        NUMA:               c.NUMA,
-        Embeddings:         embeddings,
-        LowVRAM:            lowVRAM,
-        NGPULayers:         int32(nGPULayers),
-        MMap:               mmap,
-        MainGPU:            c.MainGPU,
-        Threads:            int32(*c.Threads),
-        TensorSplit:        c.TensorSplit,
-        // AutoGPTQ
-        ModelBaseName:    c.AutoGPTQ.ModelBaseName,
-        Device:           c.AutoGPTQ.Device,
-        UseTriton:        c.AutoGPTQ.Triton,
-        UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+        DisableLogStatus:    c.DisableLogStatus,
+        DType:               c.DType,
+        // LimitMMPerPrompt vLLM
+        LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+        LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+        LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+        MMProj:              c.MMProj,
+        FlashAttention:      c.FlashAttention,
+        CacheTypeKey:        c.CacheTypeK,
+        CacheTypeValue:      c.CacheTypeV,
+        NoKVOffload:         c.NoKVOffloading,
+        YarnExtFactor:       c.YarnExtFactor,
+        YarnAttnFactor:      c.YarnAttnFactor,
+        YarnBetaFast:        c.YarnBetaFast,
+        YarnBetaSlow:        c.YarnBetaSlow,
+        NGQA:                c.NGQA,
+        RMSNormEps:          c.RMSNormEps,
+        MLock:               mmlock,
+        RopeFreqBase:        c.RopeFreqBase,
+        RopeScaling:         c.RopeScaling,
+        Type:                c.ModelType,
+        RopeFreqScale:       c.RopeFreqScale,
+        NUMA:                c.NUMA,
+        Embeddings:          embeddings,
+        LowVRAM:             lowVRAM,
+        NGPULayers:          int32(nGPULayers),
+        MMap:                mmap,
+        MainGPU:             c.MainGPU,
+        Threads:             int32(*c.Threads),
+        TensorSplit:         c.TensorSplit,
         // RWKV
         Tokenizer: c.Tokenizer,
     }
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
     opts := ModelOptions(backendConfig, appConfig)
     rerankModel, err := loader.Load(opts...)
-
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     if rerankModel == nil {
         return nil, fmt.Errorf("could not load rerank model")

@@ -26,21 +26,26 @@ func SoundGeneration(

     opts := ModelOptions(backendConfig, appConfig)
     soundGenModel, err := loader.Load(opts...)
-
     if err != nil {
         return "", nil, err
     }
+    defer loader.Close()

     if soundGenModel == nil {
         return "", nil, fmt.Errorf("could not load sound generation model")
     }

-    if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
+    if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil {
         return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
     }

-    fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
-    filePath := filepath.Join(appConfig.AudioDir, fileName)
+    audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+    if err := os.MkdirAll(audioDir, 0750); err != nil {
+        return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
+    }
+
+    fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
+    filePath := filepath.Join(audioDir, fileName)

     res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
         Text: text,
@@ -20,6 +20,7 @@ func TokenMetrics(
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     if model == nil {
         return nil, fmt.Errorf("could not loadmodel model")

@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

     opts := ModelOptions(backendConfig, appConfig)
     inferenceModel, err = loader.Load(opts...)
-
     if err != nil {
         return schema.TokenizeResponse{}, err
     }
+    defer loader.Close()

     predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
     predictOptions.Prompt = s

@@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
     if err != nil {
         return nil, err
     }
+    defer ml.Close()

     if transcriptionModel == nil {
         return nil, fmt.Errorf("could not load transcription model")
@@ -23,21 +23,22 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
     opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
     ttsModel, err := loader.Load(opts...)
-
     if err != nil {
         return "", nil, err
     }
+    defer loader.Close()

     if ttsModel == nil {
         return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
     }

-    if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
+    audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+    if err := os.MkdirAll(audioDir, 0750); err != nil {
         return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
     }

-    fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
-    filePath := filepath.Join(appConfig.AudioDir, fileName)
+    fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
+    filePath := filepath.Join(audioDir, fileName)

     // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
     // This should be addressed in a follow up PR soon.

@@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
     if err != nil {
         return nil, err
     }
+    defer ml.Close()
+
     req := proto.VADRequest{
         Audio: request.Audio,
     }

core/backend/video.go (new file, +36 lines)
@@ -0,0 +1,36 @@
+package backend
+
+import (
+    "github.com/mudler/LocalAI/core/config"
+
+    "github.com/mudler/LocalAI/pkg/grpc/proto"
+    model "github.com/mudler/LocalAI/pkg/model"
+)
+
+func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
+
+    opts := ModelOptions(backendConfig, appConfig)
+    inferenceModel, err := loader.Load(
+        opts...,
+    )
+    if err != nil {
+        return nil, err
+    }
+    defer loader.Close()
+
+    fn := func() error {
+        _, err := inferenceModel.GenerateVideo(
+            appConfig.Context,
+            &proto.GenerateVideoRequest{
+                Height:     height,
+                Width:      width,
+                Prompt:     prompt,
+                StartImage: startImage,
+                EndImage:   endImage,
+                Dst:        dst,
+            })
+        return err
+    }
+
+    return fn, nil
+}
@@ -1,11 +1,13 @@
 package cliContext

-import "embed"
+import (
+    rice "github.com/GeertJohan/go.rice"
+)

 type Context struct {
     Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
     LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`

     // This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
-    BackendAssets embed.FS `kong:"-"`
+    BackendAssets *rice.Box `kong:"-"`
 }

@@ -21,8 +21,7 @@ type RunCMD struct {

     ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
     BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
-    ImagePath         string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
-    AudioPath         string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
+    GeneratedContentPath string `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"`
     UploadPath        string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
     ConfigPath        string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
     LocalaiConfigDir  string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
@@ -38,7 +37,7 @@ type RunCMD struct {

     F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
     Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
-    ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
+    ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`

     Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
     CORS    bool   `env:"LOCALAI_CORS,CORS" help:"" group:"api"`

@@ -47,7 +46,7 @@ type RunCMD struct {
     CSRF                   bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
     UploadLimit            int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
     APIKeys                []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-    DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+    DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"`
     DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
     OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
     UseSubtleKeyComparison bool     `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
@@ -81,8 +80,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
         config.WithModelPath(r.ModelsPath),
         config.WithContextSize(r.ContextSize),
         config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
-        config.WithImageDir(r.ImagePath),
-        config.WithAudioDir(r.AudioPath),
+        config.WithGeneratedContentDir(r.GeneratedContentPath),
         config.WithUploadDir(r.UploadPath),
         config.WithConfigsDir(r.ConfigPath),
         config.WithDynamicConfigDir(r.LocalaiConfigDir),
@@ -70,11 +70,11 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
     opts := &config.ApplicationConfig{
         ModelPath:            t.ModelsPath,
         Context:              context.Background(),
-        AudioDir:             outputDir,
+        GeneratedContentDir:  outputDir,
         AssetsDestination:    t.BackendAssetsPath,
         ExternalGRPCBackends: externalBackends,
     }
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

     defer func() {
         err := ml.StopAllGRPC()

@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
     }

     cl := config.NewBackendConfigLoader(t.ModelsPath)
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
     if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
         return err
     }

@@ -36,12 +36,12 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
     text := strings.Join(t.Text, " ")

     opts := &config.ApplicationConfig{
         ModelPath:           t.ModelsPath,
         Context:             context.Background(),
-        AudioDir:            outputDir,
+        GeneratedContentDir: outputDir,
         AssetsDestination:   t.BackendAssetsPath,
     }
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

     defer func() {
         err := ml.StopAllGRPC()
@@ -7,11 +7,11 @@ import (

     "github.com/rs/zerolog/log"

+    gguf "github.com/gpustack/gguf-parser-go"
     cliContext "github.com/mudler/LocalAI/core/cli/context"
     "github.com/mudler/LocalAI/core/config"
     "github.com/mudler/LocalAI/core/gallery"
     "github.com/mudler/LocalAI/pkg/downloader"
-    gguf "github.com/thxcode/gguf-parser-go"
 )

 type UtilCMD struct {

@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
     log.Info().
         Any("eosTokenID", f.Tokenizer().EOSTokenID).
         Any("bosTokenID", f.Tokenizer().BOSTokenID).
-        Any("modelName", f.Model().Name).
+        Any("modelName", f.Metadata().Name).
         Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])

     log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
@@ -2,11 +2,11 @@ package config

 import (
     "context"
-    "embed"
     "encoding/json"
     "regexp"
     "time"

+    rice "github.com/GeertJohan/go.rice"
     "github.com/mudler/LocalAI/pkg/xsysinfo"
     "github.com/rs/zerolog/log"
 )

@@ -19,20 +19,21 @@ type ApplicationConfig struct {
     UploadLimitMB, Threads, ContextSize int
     F16                                 bool
     Debug                               bool
-    ImageDir                            string
-    AudioDir                            string
-    UploadDir                           string
-    ConfigsDir                          string
-    DynamicConfigsDir                   string
-    DynamicConfigsDirPollInterval       time.Duration
-    CORS                                bool
-    CSRF                                bool
-    PreloadJSONModels                   string
-    PreloadModelsFromPath               string
-    CORSAllowOrigins                    string
-    ApiKeys                             []string
-    P2PToken                            string
-    P2PNetworkID                        string
+    GeneratedContentDir                 string
+    ConfigsDir                          string
+    UploadDir                           string
+    DynamicConfigsDir                   string
+    DynamicConfigsDirPollInterval       time.Duration
+    CORS                                bool
+    CSRF                                bool
+    PreloadJSONModels                   string
+    PreloadModelsFromPath               string
+    CORSAllowOrigins                    string
+    ApiKeys                             []string
+    P2PToken                            string
+    P2PNetworkID                        string

     DisableWebUI            bool
     EnforcePredownloadScans bool
@@ -46,7 +47,7 @@ type ApplicationConfig struct {

     Galleries []Gallery

-    BackendAssets     embed.FS
+    BackendAssets     *rice.Box
     AssetsDestination string

     ExternalGRPCBackends map[string]string

@@ -197,7 +198,7 @@ func WithBackendAssetsOutput(out string) AppOption {
     }
 }

-func WithBackendAssets(f embed.FS) AppOption {
+func WithBackendAssets(f *rice.Box) AppOption {
     return func(o *ApplicationConfig) {
         o.BackendAssets = f
     }

@@ -279,15 +280,9 @@ func WithDebug(debug bool) AppOption {
     }
 }

-func WithAudioDir(audioDir string) AppOption {
+func WithGeneratedContentDir(generatedContentDir string) AppOption {
     return func(o *ApplicationConfig) {
-        o.AudioDir = audioDir
-    }
-}
-
-func WithImageDir(imageDir string) AppOption {
-    return func(o *ApplicationConfig) {
-        o.ImageDir = imageDir
+        o.GeneratedContentDir = generatedContentDir
     }
 }
@@ -50,9 +50,6 @@ type BackendConfig struct {
     // LLM configs (GPT4ALL, Llama.cpp, ...)
     LLMConfig `yaml:",inline"`

-    // AutoGPTQ specifics
-    AutoGPTQ AutoGPTQ `yaml:"autogptq"`
-
     // Diffusers
     Diffusers Diffusers `yaml:"diffusers"`
     Step      int       `yaml:"step"`

@@ -130,25 +127,28 @@ type LLMConfig struct {
     TrimSpace  []string `yaml:"trimspace"`
     TrimSuffix []string `yaml:"trimsuffix"`

     ContextSize          *int      `yaml:"context_size"`
     NUMA                 bool      `yaml:"numa"`
     LoraAdapter          string    `yaml:"lora_adapter"`
     LoraBase             string    `yaml:"lora_base"`
     LoraAdapters         []string  `yaml:"lora_adapters"`
     LoraScales           []float32 `yaml:"lora_scales"`
     LoraScale            float32   `yaml:"lora_scale"`
     NoMulMatQ            bool      `yaml:"no_mulmatq"`
     DraftModel           string    `yaml:"draft_model"`
     NDraft               int32     `yaml:"n_draft"`
     Quantization         string    `yaml:"quantization"`
     LoadFormat           string    `yaml:"load_format"`
     GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
     TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
     EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
     SwapSpace            int       `yaml:"swap_space"`             // vLLM
     MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
     TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-    MMProj               string    `yaml:"mmproj"`
+    DisableLogStatus     bool      `yaml:"disable_log_stats"`      // vLLM
+    DType                string    `yaml:"dtype"`                  // vLLM
+    LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
+    MMProj               string    `yaml:"mmproj"`

     FlashAttention bool `yaml:"flash_attention"`
     NoKVOffloading bool `yaml:"no_kv_offloading"`

@@ -166,12 +166,11 @@ type LLMConfig struct {
     CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }

-// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
-type AutoGPTQ struct {
-    ModelBaseName    string `yaml:"model_base_name"`
-    Device           string `yaml:"device"`
-    Triton           bool   `yaml:"triton"`
-    UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+    LimitImagePerPrompt int `yaml:"image"`
+    LimitVideoPerPrompt int `yaml:"video"`
+    LimitAudioPerPrompt int `yaml:"audio"`
 }

 // TemplateConfig is a struct that holds the configuration of the templating system

@@ -203,6 +202,8 @@ type TemplateConfig struct {
     Multimodal string `yaml:"multimodal"`

     JinjaTemplate bool `yaml:"jinja_template"`
+
+    ReplyPrefix string `yaml:"reply_prefix"`
 }

 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
@@ -212,7 +213,15 @@ func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
         return err
     }
     *c = BackendConfig(aux)

     c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
+    // Make sure the usecases are valid, we rewrite with what we identified
+    c.KnownUsecaseStrings = []string{}
+    for k, usecase := range GetAllBackendConfigUsecases() {
+        if c.HasUsecases(usecase) {
+            c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
+        }
+    }
     return nil
 }
@@ -295,9 +304,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
     defaultTFZ := 1.0
     defaultZero := 0
-
-    // Try to offload all GPU layers (if GPU is found)
-    defaultHigh := 99999999

     trueV := true
     falseV := false

@@ -357,9 +363,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
     if cfg.MirostatTAU == nil {
         cfg.MirostatTAU = &defaultMirostatTAU
     }
-    if cfg.NGPULayers == nil {
-        cfg.NGPULayers = &defaultHigh
-    }

     if cfg.LowVRAM == nil {
         cfg.LowVRAM = &falseV

@@ -369,16 +372,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
         cfg.Embeddings = &falseV
     }

-    // Value passed by the top level are treated as default (no implicit defaults)
-    // defaults are set by the user
-    if ctx == 0 {
-        ctx = 1024
-    }
-
-    if cfg.ContextSize == nil {
-        cfg.ContextSize = &ctx
-    }
-
     if threads == 0 {
         // Threads can't be 0
         threads = 4

@@ -400,7 +393,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
         cfg.Debug = &trueV
     }

-    guessDefaultsFromFile(cfg, lo.modelPath)
+    guessDefaultsFromFile(cfg, lo.modelPath, ctx)
 }

 func (c *BackendConfig) Validate() bool {
@@ -437,18 +430,19 @@ func (c *BackendConfig) HasTemplate() bool {
 type BackendConfigUsecases int

 const (
-    FLAG_ANY              BackendConfigUsecases = 0b00000000000
-    FLAG_CHAT             BackendConfigUsecases = 0b00000000001
-    FLAG_COMPLETION       BackendConfigUsecases = 0b00000000010
-    FLAG_EDIT             BackendConfigUsecases = 0b00000000100
-    FLAG_EMBEDDINGS       BackendConfigUsecases = 0b00000001000
-    FLAG_RERANK           BackendConfigUsecases = 0b00000010000
-    FLAG_IMAGE            BackendConfigUsecases = 0b00000100000
-    FLAG_TRANSCRIPT       BackendConfigUsecases = 0b00001000000
-    FLAG_TTS              BackendConfigUsecases = 0b00010000000
-    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
-    FLAG_TOKENIZE         BackendConfigUsecases = 0b01000000000
-    FLAG_VAD              BackendConfigUsecases = 0b10000000000
+    FLAG_ANY              BackendConfigUsecases = 0b000000000000
+    FLAG_CHAT             BackendConfigUsecases = 0b000000000001
+    FLAG_COMPLETION       BackendConfigUsecases = 0b000000000010
+    FLAG_EDIT             BackendConfigUsecases = 0b000000000100
+    FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000000001000
+    FLAG_RERANK           BackendConfigUsecases = 0b000000010000
+    FLAG_IMAGE            BackendConfigUsecases = 0b000000100000
+    FLAG_TRANSCRIPT       BackendConfigUsecases = 0b000001000000
+    FLAG_TTS              BackendConfigUsecases = 0b000010000000
+    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b000100000000
+    FLAG_TOKENIZE         BackendConfigUsecases = 0b001000000000
+    FLAG_VAD              BackendConfigUsecases = 0b010000000000
+    FLAG_VIDEO            BackendConfigUsecases = 0b100000000000

     // Common Subsets
     FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT

@@ -469,9 +463,14 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
         "FLAG_TOKENIZE": FLAG_TOKENIZE,
         "FLAG_VAD":      FLAG_VAD,
         "FLAG_LLM":      FLAG_LLM,
+        "FLAG_VIDEO":    FLAG_VIDEO,
     }
 }

+func stringToFlag(s string) string {
+    return "FLAG_" + strings.ToUpper(s)
+}
+
 func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
     if len(input) == 0 {
         return nil

@@ -479,7 +478,7 @@ func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
     result := FLAG_ANY
     flags := GetAllBackendConfigUsecases()
     for _, str := range input {
-        flag, exists := flags["FLAG_"+strings.ToUpper(str)]
+        flag, exists := flags[stringToFlag(str)]
         if exists {
             result |= flag
         }
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if (u & FLAG_VIDEO) == FLAG_VIDEO {
|
||||||
|
videoBackends := []string{"diffusers", "stablediffusion"}
|
||||||
|
if !slices.Contains(videoBackends, c.Backend) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if (u & FLAG_RERANK) == FLAG_RERANK {
|
if (u & FLAG_RERANK) == FLAG_RERANK {
|
||||||
if c.Backend != "rerankers" {
|
if c.Backend != "rerankers" {
|
||||||
|
@ -541,7 +551,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (u & FLAG_TTS) == FLAG_TTS {
|
if (u & FLAG_TTS) == FLAG_TTS {
|
||||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
|
||||||
if !slices.Contains(ttsBackends, c.Backend) {
|
if !slices.Contains(ttsBackends, c.Backend) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
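Aside (not part of the diff): the usecase flags above are plain bit masks, so widening every literal from eleven to twelve binary digits only adds a leading zero to make room for FLAG_VIDEO; the existing values are unchanged. A model's declared usecases combine with bitwise OR and are tested with the same mask-and-compare check GuessUsecases uses. A minimal, self-contained Go sketch, with a handful of the constants redeclared locally purely for illustration:

package main

import "fmt"

// Redeclared locally for this sketch only; the real constants live in
// core/config/backend_config.go.
type BackendConfigUsecases int

const (
	FLAG_ANY   BackendConfigUsecases = 0b000000000000
	FLAG_CHAT  BackendConfigUsecases = 0b000000000001
	FLAG_IMAGE BackendConfigUsecases = 0b000000100000
	FLAG_VIDEO BackendConfigUsecases = 0b100000000000
)

func main() {
	// A hypothetical diffusers-backed model that serves image and video requests.
	usecases := FLAG_IMAGE | FLAG_VIDEO

	// Mask-and-compare: a flag is set when masking leaves it intact.
	fmt.Println((usecases & FLAG_VIDEO) == FLAG_VIDEO) // true
	fmt.Println((usecases & FLAG_CHAT) == FLAG_CHAT)   // false

	// FLAG_ANY is zero, so every configuration matches it.
	fmt.Println((usecases & FLAG_ANY) == FLAG_ANY) // true
}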
core/config/gguf.go (new file, 296 lines)
@@ -0,0 +1,296 @@
+package config
+
+import (
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/rs/zerolog/log"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+)
+
+type familyType uint8
+
+const (
+	Unknown familyType = iota
+	LLaMa3
+	CommandR
+	Phi3
+	ChatML
+	Mistral03
+	Gemma
+	DeepSeek2
+)
+
+const (
+	defaultContextSize = 1024
+	defaultNGPULayers  = 99999999
+)
+
+type settingsConfig struct {
+	StopWords      []string
+	TemplateConfig TemplateConfig
+	RepeatPenalty  float64
+}
+
+// default settings to adopt with a given model family
+var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
+	Gemma: {
+		RepeatPenalty: 1.0,
+		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input }}\n<start_of_turn>model\n",
+			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
+			Completion:  "{{.Input}}",
+		},
+	},
+	DeepSeek2: {
+		StopWords: []string{"<|end▁of▁sentence|>"},
+		TemplateConfig: TemplateConfig{
+			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
+{{ end -}}
+{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
+{{if eq .RoleName "system" -}}{{.Content}}
+{{end -}}`,
+			Chat: "{{.Input -}}\nAssistant: ",
+		},
+	},
+	LLaMa3: {
+		StopWords: []string{"<|eot_id|>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
+			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
+		},
+	},
+	CommandR: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+You are a function calling AI model, you can call the following functions:
+## Available Tools
+{{range .Functions}}
+- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+{{end}}
+When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "system" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "assistant" -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "tool" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if .FunctionCall -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
+{{- end -}}`,
+		},
+		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
+	},
+	Phi3: {
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input}}\n<|assistant|>",
+			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
+			Completion:  "{{.Input}}",
+		},
+		StopWords: []string{"<|end|>", "<|endoftext|>"},
+	},
+	ChatML: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}\n<|im_start|>assistant",
+			Functions: `<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant`,
+			ChatMessage: `<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
+	},
+	Mistral03: {
+		TemplateConfig: TemplateConfig{
+			Chat:      "{{.Input -}}",
+			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+[INST] {{.Content }} [/INST]
+{{- else if .FunctionCall -}}
+[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+{{- else if eq .RoleName "tool" -}}
+[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+{{- else -}}
+{{ .Content -}}
+{{ end -}}`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
+	},
+}
+
+// this maps well known template used in HF to model families defined above
+var knownTemplates = map[string]familyType{
+	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
+	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
+}
+
+func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
+
+	if defaultCtx == 0 && cfg.ContextSize == nil {
+		ctxSize := f.EstimateLLaMACppRun().ContextSize
+		if ctxSize > 0 {
+			cSize := int(ctxSize)
+			cfg.ContextSize = &cSize
+		} else {
+			defaultCtx = defaultContextSize
+			cfg.ContextSize = &defaultCtx
+		}
+	}
+
+	// GPU options
+	if cfg.Options == nil {
+		if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
+			cfg.Options = []string{"gpu"}
+		}
+	}
+
+	// vram estimation
+	vram, err := xsysinfo.TotalAvailableVRAM()
+	if err != nil {
+		log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
+	} else if vram > 0 {
+		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
+		if err != nil {
+			log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
+		} else {
+			if estimate.IsFullOffload {
+				log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
+			}
+
+			if estimate.EstimatedVRAM > vram {
+				log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
+			}
+
+			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
+				log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
+				cfg.NGPULayers = &estimate.EstimatedLayers
+			}
+		}
+	}
+
+	if cfg.NGPULayers == nil {
+		// we assume we want to offload all layers
+		defaultHigh := defaultNGPULayers
+		cfg.NGPULayers = &defaultHigh
+	}
+
+	log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")
+
+	// template estimations
+	if cfg.HasTemplate() {
+		// nothing to guess here
+		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
+		return
+	}
+
+	log.Debug().
+		Any("eosTokenID", f.Tokenizer().EOSTokenID).
+		Any("bosTokenID", f.Tokenizer().BOSTokenID).
+		Any("modelName", f.Metadata().Name).
+		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
+
+	// guess the name
+	if cfg.Name == "" {
+		cfg.Name = f.Metadata().Name
+	}
+
+	family := identifyFamily(f)
+
+	if family == Unknown {
+		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
+		return
+	}
+
+	// identify template
+	settings, ok := defaultsSettings[family]
+	if ok {
+		cfg.TemplateConfig = settings.TemplateConfig
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
+		if len(cfg.StopWords) == 0 {
+			cfg.StopWords = settings.StopWords
+		}
+		if cfg.RepeatPenalty == 0.0 {
+			cfg.RepeatPenalty = settings.RepeatPenalty
+		}
+	} else {
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
+	}
+
+	if cfg.HasTemplate() {
+		return
+	}
+
+	// identify from well known templates first, otherwise use the raw jinja template
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found {
+		// try to use the jinja template
+		cfg.TemplateConfig.JinjaTemplate = true
+		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
+	}
+
+}
+
+func identifyFamily(f *gguf.GGUFFile) familyType {
+
+	// identify from well known templates first
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found && chatTemplate.ValueString() != "" {
+		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
+			return family
+		}
+	}
+
+	// otherwise try to identify from the model properties
+	arch := f.Architecture().Architecture
+	eosTokenID := f.Tokenizer().EOSTokenID
+	bosTokenID := f.Tokenizer().BOSTokenID
+
+	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
+	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
+
+	llama3 := arch == "llama" && eosTokenID == 128009
+	commandR := arch == "command-r" && eosTokenID == 255001
+	qwen2 := arch == "qwen2"
+	phi3 := arch == "phi-3"
+	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
+	deepseek2 := arch == "deepseek2"
+
+	switch {
+	case deepseek2:
+		return DeepSeek2
+	case gemma:
+		return Gemma
+	case llama3:
+		return LLaMa3
+	case commandR:
+		return CommandR
+	case phi3:
+		return Phi3
+	case qwen2, isYI:
+		return ChatML
+	default:
+		return Unknown
+	}
+}
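Aside (not part of the diff): everything guessGGUFFromFile decides is derived from GGUF metadata read through gguf-parser-go. A rough standalone sketch that prints the same fields the guesser inspects for a local model file; the command-line handling and output formatting are illustrative only:

package main

import (
	"fmt"
	"os"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: ggufinfo <model.gguf>")
		os.Exit(1)
	}

	// Parse the GGUF file's header and metadata.
	f, err := gguf.ParseGGUFFile(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, "not a GGUF file:", err)
		os.Exit(1)
	}

	// The same properties identifyFamily uses above.
	fmt.Println("name:        ", f.Metadata().Name)
	fmt.Println("architecture:", f.Architecture().Architecture)
	fmt.Println("bos token id:", f.Tokenizer().BOSTokenID)
	fmt.Println("eos token id:", f.Tokenizer().EOSTokenID)

	// Raw chat template, used as the Jinja fallback when no family matches.
	if tmpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
		fmt.Println("chat template:", tmpl.ValueString())
	}
}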
@@ -3,147 +3,12 @@ package config
 import (
 	"os"
 	"path/filepath"
-	"strings"
 
+	gguf "github.com/gpustack/gguf-parser-go"
 	"github.com/rs/zerolog/log"
-
-	gguf "github.com/thxcode/gguf-parser-go"
 )
 
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<|end▁of▁sentence|>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
-- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
-func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
+func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
 	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
 		return
@@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
 		return
 	}
 
-	if cfg.HasTemplate() {
-		// nothing to guess here
-		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
-		return
-	}
-
 	// We try to guess only if we don't have a template defined already
 	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
 
+	// try to parse the gguf file
 	f, err := gguf.ParseGGUFFile(guessPath)
-	if err != nil {
-		// Only valid for gguf files
-		log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
+	if err == nil {
+		guessGGUFFromFile(cfg, f, defaultCtx)
 		return
 	}
 
-	log.Debug().
-		Any("eosTokenID", f.Tokenizer().EOSTokenID).
-		Any("bosTokenID", f.Tokenizer().BOSTokenID).
-		Any("modelName", f.Model().Name).
-		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
-
-	// guess the name
-	if cfg.Name == "" {
-		cfg.Name = f.Model().Name
-	}
-
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
+	if cfg.ContextSize == nil {
+		if defaultCtx == 0 {
+			defaultCtx = defaultContextSize
+		}
+		cfg.ContextSize = &defaultCtx
+	}
 }
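Aside (not part of the diff): the net effect of this refactor is that context-size defaulting moves out of SetDefaults and into the guesser: GGUF models receive an estimated context size inside guessGGUFFromFile, while anything else falls back to the caller-supplied default or 1024. A minimal sketch of that fallback branch, with the constant and a stripped-down config struct redeclared locally so it runs on its own:

package main

import "fmt"

// Mirrors defaultContextSize from core/config/gguf.go.
const defaultContextSize = 1024

// Stripped-down stand-in for BackendConfig, for this sketch only.
type backendConfig struct {
	ContextSize *int
}

// fallbackContextSize mirrors the non-GGUF branch of guessDefaultsFromFile:
// fill ContextSize only when it is unset, preferring the caller-supplied default.
func fallbackContextSize(cfg *backendConfig, defaultCtx int) {
	if cfg.ContextSize == nil {
		if defaultCtx == 0 {
			defaultCtx = defaultContextSize
		}
		cfg.ContextSize = &defaultCtx
	}
}

func main() {
	var a, b backendConfig
	fallbackContextSize(&a, 0)    // no default passed in
	fallbackContextSize(&b, 4096) // caller-supplied default wins
	fmt.Println(*a.ContextSize, *b.ContextSize) // 1024 4096
}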
Some files were not shown because too many files have changed in this diff.