Mirror of https://github.com/mudler/LocalAI.git (synced 2025-05-20 02:24:59 +00:00)

Compare commits: 179 commits
Commits (SHA1), in the order listed:

f8fbfd4fa3, 41e239c67e, 587827e779, 456b4982ef, 159388cce8, cfc73c7773, 6d5bde860b, 6ef383033b,
cd494089d9, 3033845f94, 0f365ac204, 525cf198be, 658c2a4f55, c987de090d, 04365843e6, 1dc5781679,
30704292de, e52c66c76e, cb28aef93b, 029f97c2a2, 3be71be696, 6adb019f8f, fcaa0a2f01, fd17a3312c,
12d0fe610b, 11c67d16b8, 63f7c86c4d, ac89bf77bf, 0395cc02fb, 616972fca0, 942fbff62d, 2612a0c910,
2dcb6d7247, 6978eec69f, 2fcfe54466, 4e7506a3be, 2a46217f90, 31ff9dbd52, 9483abef03, ce3e8b3e31,
f3bb84c9a7, ecb1297582, 73fc702b3c, e3af62ae1a, dc21604741, 5433f1a70e, d5e032bdcd, de786f6586,
8b9bc4aa6e, e6cea7d28e, 7d7d56f2ce, 1caae91ab6, e90f2cb0ca, 5a4291fadd, 91ef58ee5a, a86e8c78f1,
adb24214c6, f03a0430aa, 73bc12abc0, 7fa437bbcc, 4a27c99928, 6ce94834b6, 84a26458dc, 7aa377b6a9,
64e66dda4a, a085f61fdc, 21bdfe5fa4, 7ebd7b2454, 6984749ea1, c0a206bc7a, 01bbb31fb3, 72111c597d,
b2f9fc870b, 1fc6d469ac, 05848b2027, 1da0644aa3, c087cd1377, c621412f6a, 5a8b1892cd, 5b20426863,
5c6cd50ed6, bace6516f1, 3baadf6f27, 8804c701b8, 7b3ceb19bb, e7f3effea1, 61694a2ffb, 573a3f104c,
0e8af53a5b, 960ffa808c, 92719568e5, 163939af71, 399f1241dc, 58c9ade2e8, 6e1c93d84f, 4076ea0494,
26cbf77c0d, 640790d628, 4132adea2f, 2b2d907a3a, 6e8f4f584b, 662cfc2b48, a25d355d66, 6d1cfdbefc,
5ecc478968, aef5c4291b, c059f912b9, bc1e059259, 38dc07793a, da6ef0967d, 7a011e60bd, e13dd5b09f,
86ee303bd6, 978ee96fd3, 3ad5691db6, 0027681090, 8cba990edc, 88857696d4, 23f347e687, b6e3dc5f02,
69667521e2, 2a92effc5d, a65e012aa2, 8e9b41d05f, 078da5c2f0, c5af5d139c, 2c9279a542, a67d22f5f2,
dc7c51dcc7, 98df65c7aa, 1559b6b522, a0244e3fb4, d66396201a, 9628860c0e, cae9bf1308, 5bb5da0760,
867973a850, 701cd6b6d5, 7f61d397d5, 1ae0b896fa, 3937407cb3, 0e34ae4f3f, a38b99ecb6, a4a4358182,
4bc39c2db3, cc3df759f8, 378161060c, f2f788fe60, 9fa8ed6b1e, 7fc37c5e29, 4bc4b1e8bc, e495b89f18,
ba09eaea1b, 61cc76c455, 8abecb4a18, 8b3f76d8e6, 4e0497f1a6, ba88c9f451, a598285825, cb7a172897,
771be28dfb, 7d6b3eb42d, 0bb33fab55, e3bf7f77f7, bd1707d339, 0474804541, 72693b3917, a03b70010f,
e3717e5c1a, c8f6858218, 06d7cc43ae, f2147cb850, 75bb9f4c28, a2ef4b1e07, 161c9fe2db, 7547463f81,
32e4dfd47b, f67e5dec68, 297d54acea
125 changed files with 4078 additions and 28557 deletions
.env (2 changed lines)

@@ -76,7 +76,7 @@

### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
# LLAMACPP_GRPC_SERVERS=""

### Enable to run parallel requests
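As a side note, the commented LLAMACPP_GRPC_SERVERS setting above takes a comma-separated list of host:port pairs pointing at llama.cpp RPC workers. A minimal sketch, assuming two workers started elsewhere (the addresses are placeholders, not values from this diff):

```bash
# Hypothetical example: distribute llama-cpp work across two RPC workers
# (both addresses are illustrative placeholders)
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
```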
.github/dependabot.yml (4 changed lines)

@@ -29,10 +29,6 @@ updates:
schedule:
# Check for updates to GitHub Actions every weekday
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/autogptq"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/bark"
schedule:
.github/workflows/bump_deps.yaml (2 changed lines)

@@ -12,7 +12,7 @@ jobs:
- repository: "ggml-org/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "ggerganov/whisper.cpp"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
- repository: "PABannier/bark.cpp"
.github/workflows/dependabot_auto.yml (2 changed lines)

@@ -14,7 +14,7 @@ jobs:
steps:
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v2.3.0
uses: dependabot/fetch-metadata@v2.4.0
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
skip-commit-verification: true
.github/workflows/deploy-explorer.yaml (2 changed lines)

@@ -42,7 +42,7 @@ jobs:
script: |
sudo rm -rf local-ai/ || true
- name: copy file via ssh
uses: appleboy/scp-action@v0.1.7
uses: appleboy/scp-action@v1.0.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
.github/workflows/image-pr.yml (50 changed lines)

@@ -33,6 +33,7 @@ jobs:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
fail-fast: false
matrix:
include:
# This is basically covered by the AIO test

@@ -56,26 +57,35 @@ jobs:
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'hipblas'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# tag-suffix: '-hipblas'
# ffmpeg: 'false'
# image-type: 'extras'
# base-image: "rocm/dev-ubuntu-22.04:6.1"
# grpc-base-image: "ubuntu:22.04"
# runs-on: 'arc-runner-set'
# makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'sycl_f16'
# platforms: 'linux/amd64'
# tag-latest: 'false'
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
# grpc-base-image: "ubuntu:22.04"
# tag-suffix: 'sycl-f16-ffmpeg'
# ffmpeg: 'true'
# image-type: 'extras'
# runs-on: 'arc-runner-set'
# makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
# core-image-build:
# uses: ./.github/workflows/image_build.yml
# with:
.github/workflows/image.yml (167 changed lines)

@@ -45,13 +45,13 @@ jobs:
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-hipblas-ffmpeg'
tag-suffix: '-hipblas-extras'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas'
latest-image: 'latest-gpu-hipblas-extras'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"

@@ -59,32 +59,13 @@ jobs:
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-hipblas'
self-hosted-jobs:
uses: ./.github/workflows/image_build.yml
with:

@@ -114,110 +95,58 @@ jobs:
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
matrix:
include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
tag-suffix: '-cublas-cuda11-extras'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-extras'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image: 'latest-gpu-nvidia-cuda-12-extras'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg'
tag-suffix: '-sycl-f16-extras'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16'
latest-image: 'latest-gpu-intel-f16-extras'
latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg'
tag-suffix: '-sycl-f32-extras'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32'
latest-image: 'latest-gpu-intel-f32-extras'
latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target"
# Core images

@@ -226,41 +155,23 @@ jobs:
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
tag-suffix: '-sycl-f16'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f16'
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
tag-suffix: '-sycl-f32'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f32'

core-image-build:
uses: ./.github/workflows/image_build.yml

@@ -293,7 +204,7 @@ jobs:
- build-type: ''
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg-core'
tag-suffix: ''
ffmpeg: 'true'
image-type: 'core'
base-image: "ubuntu:22.04"

@@ -308,60 +219,38 @@ jobs:
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-core'
ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
tag-suffix: '-cublas-cuda11'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
tag-suffix: '-cublas-cuda12'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
tag-suffix: '-vulkan'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-vulkan'
gh-runner:
uses: ./.github/workflows/image_build.yml
with:

@@ -394,8 +283,8 @@ jobs:
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'false'
tag-suffix: '-nvidia-l4t-arm64-core'
latest-image: 'latest-nvidia-l4t-arm64-core'
tag-suffix: '-nvidia-l4t-arm64'
latest-image: 'latest-nvidia-l4t-arm64'
ffmpeg: 'true'
image-type: 'core'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
.github/workflows/notify-models.yaml (10 changed lines)

@@ -8,7 +8,7 @@ jobs:
notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

@@ -16,7 +16,7 @@ jobs:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
id: git-diff-action

@@ -79,7 +79,7 @@ jobs:
args: ${{ steps.summarize.outputs.message }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -87,7 +87,7 @@ jobs:
notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

@@ -161,7 +161,7 @@ jobs:
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
.github/workflows/notify-releases.yaml (4 changed lines)

@@ -14,7 +14,7 @@ jobs:
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |

@@ -60,4 +60,4 @@ jobs:
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
args: ${{ steps.summarize.outputs.message }}
.github/workflows/release.yaml (16 changed lines)

@@ -36,6 +36,7 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
make install-go-tools
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb

@@ -123,7 +124,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -151,6 +152,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
make install-go-tools
- name: Intel Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null

@@ -232,7 +234,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -253,8 +255,7 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
make install-go-tools
- name: Build
id: build
run: |

@@ -275,7 +276,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -295,8 +296,7 @@ jobs:
- name: Dependencies
run: |
brew install protobuf grpc libomp llvm
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
make install-go-tools
- name: Build
id: build
run: |

@@ -317,7 +317,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
.github/workflows/secscan.yaml (2 changed lines)

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.3
uses: securego/gosec@v2.22.4
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'
.github/workflows/test-extra.yml (20 changed lines)

@@ -78,6 +78,26 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test

#tests-vllm:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install -y build-essential ffmpeg
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test vllm backend
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
# tests-transformers-musicgen:
# runs-on: ubuntu-latest
# steps:
.github/workflows/test.yml (11 changed lines)

@@ -71,7 +71,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
sudo apt-get install -y libgmock-dev
sudo apt-get install -y libgmock-dev clang
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \

@@ -96,6 +96,7 @@ jobs:

go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest

# The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools

@@ -130,7 +131,7 @@ jobs:
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -183,6 +184,7 @@ jobs:
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |

@@ -194,7 +196,7 @@ jobs:
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180

@@ -222,6 +224,7 @@ jobs:
run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools
go install github.com/GeertJohan/go.rice/rice@latest
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include

@@ -232,7 +235,7 @@ jobs:
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
Dockerfile (17 changed lines)

@@ -15,7 +15,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"

RUN apt-get update && \
apt-get install -y --no-install-recommends \
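For context, EXTERNAL_GRPC_BACKENDS above is a comma-separated list of name:launcher pairs. A hedged sketch of overriding it when starting the container, assuming the variable is also honoured at runtime (the backend name and its run.sh path are hypothetical):

```bash
# Hypothetical override: expose only one external Python backend
# ("mybackend" and its run.sh path are illustrative placeholders)
docker run -ti --name local-ai -p 8080:8080 \
  -e EXTERNAL_GRPC_BACKENDS="mybackend:/build/backend/python/mybackend/run.sh" \
  localai/localai:latest
```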
@@ -46,9 +46,10 @@ EOT
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

# Install grpc compilers
# Install grpc compilers and rice
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
go install github.com/GeertJohan/go.rice/rice@latest

COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates

@@ -300,10 +301,9 @@ COPY .git .
RUN make prepare

## Build the binary
## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
## (both will use CUDA or hipblas for the actual computation)
RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
## Otherwise just run the normal build
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \
make build; \

@@ -431,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \
; fi && \
Makefile (87 changed lines)

@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true

# llama.cpp versions
CPPLLAMA_VERSION?=d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33
CPPLLAMA_VERSION?=6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c

# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=d1f114da61b1ae1e70b03104fad42c9dd666feeb

# go-piper version
PIPER_REPO?=https://github.com/mudler/go-piper

@@ -24,14 +24,21 @@ BARKCPP_VERSION?=v1.0.0
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae

# ONEAPI variables for SYCL
export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh

ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
ONNX_OS?=linux

export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=
export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export BACKEND_LIBS?=
export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src

CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?=

@@ -81,6 +88,7 @@ endif
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif

# Detect if we are running on arm64

@@ -108,13 +116,31 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
export GGML_NO_ACCELERATE=1
export GGML_NO_METAL=1
GO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif

ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate
CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
else
CGO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif
else
CGO_LDFLAGS_WHISPER+=-lgomp

@@ -126,21 +152,29 @@ ifeq ($(BUILD_TYPE),openblas)
endif

ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
export GGML_CUDA=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
CMAKE_ARGS+=-DGGML_CUDA=ON
WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
endif

ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=1
WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
endif

ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export GGML_SYCL=1
CMAKE_ARGS+=-DGGML_SYCL=ON
endif

ifeq ($(BUILD_TYPE),sycl_f16)
export GGML_SYCL_F16=1
CMAKE_ARGS+=-DGGML_SYCL_F16=ON
endif

ifeq ($(BUILD_TYPE),hipblas)

@@ -151,7 +185,7 @@ ifeq ($(BUILD_TYPE),hipblas)
export CC=$(ROCM_HOME)/llvm/bin/clang
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
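Because GPU_TARGETS is assigned with ?=, it can be overridden per build. A minimal sketch, assuming a hipblas build narrowed to a single ROCm architecture (gfx1100 is only an example target, not taken from this diff):

```bash
# Hypothetical invocation: hipblas build for one GPU architecture only
make build BUILD_TYPE=hipblas GPU_TARGETS=gfx1100
```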
@@ -286,8 +320,9 @@ sources/whisper.cpp:
git checkout $(WHISPER_CPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch

sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
cd sources/whisper.cpp/build && cmake --build . --config Release

get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp

@@ -337,8 +372,14 @@ clean-tests:
clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets

## Install Go tools
install-go-tools:
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install github.com/GeertJohan/go.rice/rice@latest

## Build:
build: prepare backend-assets grpcs ## Build the project
build: prepare backend-assets grpcs install-go-tools ## Build the project
$(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})

@@ -348,7 +389,9 @@ ifneq ($(BACKEND_LIBS),)
$(MAKE) backend-assets/lib
cp -f $(BACKEND_LIBS) backend-assets/lib/
endif
rm -rf $(BINARY_NAME) || true
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
rice append --exec $(BINARY_NAME)

build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build

@@ -420,6 +463,7 @@ prepare-test: grpcs
cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models

## Test targets
test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests'
export GO_TAGS="tts debug"

@@ -494,7 +538,7 @@ protogen: protogen-go protogen-python
protogen-clean: protogen-go-clean protogen-python-clean

.PHONY: protogen-go
protogen-go:
protogen-go: install-go-tools
mkdir -p pkg/grpc/proto
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

@@ -505,18 +549,10 @@ protogen-go-clean:
$(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen

.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

.PHONY: bark-protogen
bark-protogen:

@@ -593,7 +629,6 @@ vllm-protogen-clean:
## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers

@@ -607,10 +642,12 @@ prepare-extra-conda-environments: protogen-python
prepare-test-extra: protogen-python
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/vllm

test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/diffusers test
$(MAKE) -C backend/python/vllm test

backend-assets:
mkdir -p backend-assets

@@ -752,8 +789,8 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad
endif

backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper
README.md (90 changed lines)

@@ -30,7 +30,7 @@

<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
</a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>

@@ -43,7 +43,8 @@

> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
[](https://t.me/localaiofficial_bot)

[](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

@@ -103,28 +104,93 @@
Run the installer script:

```bash
# Basic installation
curl https://localai.io/install.sh | sh
```

For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).

Or run with docker:

### CPU only image:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
```
### Nvidia GPU:
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
```
### CPU and GPU image (bigger size):

```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```
### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)

### NVIDIA GPU Images:

```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# CUDA 12.0 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

# CUDA 12.0 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12-extras

# CUDA 11.7 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11

# CUDA 11.7 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11-extras

# NVIDIA Jetson (L4T) ARM64
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
```

### AMD GPU Images (ROCm):

```bash
# ROCm with core features
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas

# ROCm with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas-extras
```

### Intel GPU Images (oneAPI):

```bash
# Intel GPU with FP16 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16

# Intel GPU with FP16 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16-extras

# Intel GPU with FP32 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32

# Intel GPU with FP32 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32-extras
```

### Vulkan GPU Images:

```bash
# Vulkan with core features
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```

### AIO Images (pre-downloaded models):

```bash
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

# NVIDIA CUDA 12 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12

# NVIDIA CUDA 11 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11

# Intel GPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16

# AMD GPU version
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
```

For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).

To load models:

```bash
```
@@ -48,6 +48,6 @@ template:
<|im_start|>assistant

download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
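The huggingface:// URIs above can, assuming the usual LocalAI CLI behaviour, also be passed directly when starting the server; treat the command below as an illustrative sketch rather than something taken from this diff:

```bash
# Hypothetical sketch: run LocalAI with the model file referenced above
local-ai run huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
```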
assets.go (15 changed lines)

@@ -1,6 +1,15 @@
package main

import "embed"
import (
rice "github.com/GeertJohan/go.rice"
)

//go:embed backend-assets/*
var backendAssets embed.FS
var backendAssets *rice.Box

func init() {
var err error
backendAssets, err = rice.FindBox("backend-assets")
if err != nil {
panic(err)
}
}
@@ -14,6 +14,7 @@ service Backend {
rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}

@@ -190,11 +191,7 @@ message ModelOptions {
int32 NGQA = 20;
string ModelFile = 21;

// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;

// Diffusers
string PipelineType = 26;

@@ -305,6 +302,19 @@ message GenerateImageRequest {
int32 CLIPSkip = 11;
}

message GenerateVideoRequest {
string prompt = 1;
string start_image = 2; // Path or base64 encoded image for the start frame
string end_image = 3; // Path or base64 encoded image for the end frame
int32 width = 4;
int32 height = 5;
int32 num_frames = 6; // Number of frames to generate
int32 fps = 7; // Frames per second
int32 seed = 8;
float cfg_scale = 9; // Classifier-free guidance scale
string dst = 10; // Output path for the generated video
}

message TTSRequest {
string text = 1;
string model = 2;
@@ -1,17 +1,17 @@

## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
# set(TARGET myclip)
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
# install(TARGETS ${TARGET} LIBRARY)
# target_include_directories(myclip PUBLIC .)
# target_include_directories(myclip PUBLIC ../..)
# target_include_directories(myclip PUBLIC ../../common)
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
# if (NOT MSVC)
# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
# endif()
# END CLIP hack
@@ -74,8 +74,12 @@ add_library(hw_grpc_proto
${hw_proto_srcs}
${hw_proto_hdrs} )

add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)

target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse
gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP}
@@ -59,8 +59,8 @@ llama.cpp:
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch

llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh

rebuild:

@@ -70,13 +70,13 @@ rebuild:

purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/examples/grpc-server
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server

clean: purge
rm -rf llama.cpp

grpc-server: llama.cpp llama.cpp/examples/grpc-server
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
(File diff suppressed because it is too large.)

backend/cpp/llama/json.hpp (24596 changed lines)
(File diff suppressed because it is too large.)
@@ -1,7 +1,7 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3cd0d2fa..6c5e811a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
@ -7,22 +7,46 @@ for patch in $(ls patches); do
|
|||
patch -d llama.cpp/ -p1 < patches/$patch
|
||||
done
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv json.hpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
|
||||
|
||||
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
|
||||
set -e
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
|
||||
|
||||
set +e
|
||||
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
|
||||
echo "grpc-server already added"
|
||||
else
|
||||
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
|
||||
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
|
||||
fi
|
||||
set -e
|
||||
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
|
||||
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
|
||||
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
|
||||
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
|
||||
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
|
||||
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
|
||||
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
|
||||
# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
|
||||
# and remove the main function
|
||||
# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
|
||||
awk '
|
||||
/int[ \t]+main[ \t]*\(/ { # If the line starts the main function
|
||||
in_main=1; # Set a flag
|
||||
open_braces=0; # Track number of open braces
|
||||
}
|
||||
in_main {
|
||||
open_braces += gsub(/\{/, "{"); # Count opening braces
|
||||
open_braces -= gsub(/\}/, "}"); # Count closing braces
|
||||
if (open_braces == 0) { # If all braces are closed
|
||||
in_main=0; # End skipping
|
||||
}
|
||||
next; # Skip lines inside main
|
||||
}
|
||||
!in_main # Print lines not inside main
|
||||
' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
|
||||
|
||||
# remove index.html.gz.hpp and loading.html.hpp includes
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# macOS
|
||||
sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
|
||||
else
|
||||
# Linux and others
|
||||
sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
|
||||
fi
|
backend/cpp/llama/utils.hpp (vendored, 483 lines removed)
@@ -1,483 +0,0 @@
|
|||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
||||
#if SERVER_VERBOSE != 1
|
||||
#define LOG_VERBOSE(MSG, ...)
|
||||
#else
|
||||
#define LOG_VERBOSE(MSG, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
//
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
||||
};
|
||||
|
||||
enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
int id = -1; // to be filled by llama_server_queue
|
||||
int target_id;
|
||||
task_type type;
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
int id;
|
||||
int multitask_id = -1;
|
||||
bool stop;
|
||||
bool error;
|
||||
json result_json;
|
||||
};
|
||||
|
||||
struct task_multi {
|
||||
int id;
|
||||
std::set<int> subtasks_remaining{};
|
||||
std::vector<task_result> results{};
|
||||
};
|
||||
|
||||
// TODO: can become bool if we can't find use of more states
|
||||
enum slot_state
|
||||
{
|
||||
IDLE,
|
||||
PROCESSING,
|
||||
};
|
||||
|
||||
enum slot_command
|
||||
{
|
||||
NONE,
|
||||
LOAD_PROMPT,
|
||||
RELEASE,
|
||||
};
|
||||
|
||||
struct slot_params
|
||||
{
|
||||
bool stream = true;
|
||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||
|
||||
uint32_t seed = -1; // RNG seed
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
json input_prefix;
|
||||
json input_suffix;
|
||||
};
|
||||
|
||||
struct slot_image
|
||||
{
|
||||
int32_t id;
|
||||
|
||||
bool request_encode_image = false;
|
||||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // prompt text that comes before this image
|
||||
};
|
||||
|
||||
// completion token output with probabilities
|
||||
struct completion_token_output
|
||||
{
|
||||
struct token_prob
|
||||
{
|
||||
llama_token tok;
|
||||
float prob;
|
||||
};
|
||||
|
||||
std::vector<token_prob> probs;
|
||||
llama_token tok;
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
//
|
||||
// server utils
|
||||
//
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||
{
|
||||
// Fallback null to default value
|
||||
return body.contains(key) && !body.at(key).is_null()
|
||||
? body.value(key, default_value)
|
||||
: default_value;
|
||||
}
|
||||
|
||||
inline std::string format_chatml(std::vector<json> messages)
|
||||
{
|
||||
std::ostringstream chatml_msgs;
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
chatml_msgs << "<|im_start|>"
|
||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||
<< "<|im_end|>\n";
|
||||
}
|
||||
|
||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||
|
||||
return chatml_msgs.str();
|
||||
}
|
||||
|
||||
//
|
||||
// work queue utils
|
||||
//
|
||||
|
||||
struct llama_server_queue {
|
||||
int id = 0;
|
||||
std::mutex mutex_tasks;
|
||||
// queues
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_server> queue_tasks_deferred;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::condition_variable condition_tasks;
|
||||
// callback functions
|
||||
std::function<void(task_server&)> callback_new_task;
|
||||
std::function<void(task_multi&)> callback_finish_multitask;
|
||||
std::function<void(void)> callback_all_task_finished;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
}
|
||||
|
||||
// Add a new task, but defer until one slot is available
|
||||
void defer(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
}
|
||||
|
||||
// Get the next id for creating a new task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(task_server&)> callback) {
|
||||
callback_new_task = callback;
|
||||
}
|
||||
|
||||
// Register function to process a multitask
|
||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
||||
callback_finish_multitask = callback;
|
||||
}
|
||||
|
||||
// Register the function to be called when the batch of tasks is finished
|
||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
||||
callback_all_task_finished = callback;
|
||||
}
|
||||
|
||||
// Call when the state of one slot is changed
|
||||
void notify_slot_changed() {
|
||||
// move deferred tasks back to main loop
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : queue_tasks_deferred) {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
queue_tasks_deferred.clear();
|
||||
}
|
||||
|
||||
// Start the main loop. This call is blocking
|
||||
[[noreturn]]
|
||||
void start_loop() {
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
lock.unlock();
|
||||
break;
|
||||
}
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
// process and update all the multitasks
|
||||
auto queue_iterator = queue_multitasks.begin();
|
||||
while (queue_iterator != queue_multitasks.end())
|
||||
{
|
||||
if (queue_iterator->subtasks_remaining.empty())
|
||||
{
|
||||
// all subtasks done == multitask is done
|
||||
task_multi current_multitask = *queue_iterator;
|
||||
callback_finish_multitask(current_multitask);
|
||||
// remove this multitask
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
else
|
||||
{
|
||||
++queue_iterator;
|
||||
}
|
||||
}
|
||||
// all tasks in the current loop is finished
|
||||
callback_all_task_finished();
|
||||
}
|
||||
LOG_VERBOSE("wait for new task", {});
|
||||
// wait for new task
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// functions to manage multitasks
|
||||
//
|
||||
|
||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_multi multi;
|
||||
multi.id = multitask_id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
}
|
||||
|
||||
// update the remaining subtasks, while appending results to multitask
|
||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
for (auto& multitask : queue_multitasks)
|
||||
{
|
||||
if (multitask.id == multitask_id)
|
||||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_server_response {
|
||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
||||
callback_multitask_t callback_update_multitask;
|
||||
// for keeping track of all tasks waiting for the result
|
||||
std::set<int> waiting_task_ids;
|
||||
// the main result queue
|
||||
std::vector<task_result> queue_results;
|
||||
std::mutex mutex_results;
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
|
||||
// This function blocks the thread until there is a response for this task_id
|
||||
task_result recv(int task_id) {
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
if (queue_results[i].id == task_id)
|
||||
{
|
||||
assert(queue_results[i].multitask_id == -1);
|
||||
task_result res = queue_results[i];
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here
|
||||
}
|
||||
|
||||
// Register the function to update multitask
|
||||
void on_multitask_update(callback_multitask_t callback) {
|
||||
callback_update_multitask = callback;
|
||||
}
|
||||
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
|
||||
static inline bool is_base64(uint8_t c)
|
||||
{
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int in_ = 0;
|
||||
|
||||
int in_len = encoded_string.size();
|
||||
|
||||
uint8_t char_array_4[4];
|
||||
uint8_t char_array_3[3];
|
||||
|
||||
std::vector<uint8_t> ret;
|
||||
|
||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
||||
{
|
||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||
if (i == 4)
|
||||
{
|
||||
for (i = 0; i <4; i++)
|
||||
{
|
||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (i = 0; (i < 3); i++)
|
||||
{
|
||||
ret.push_back(char_array_3[i]);
|
||||
}
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (i)
|
||||
{
|
||||
for (j = i; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = 0;
|
||||
}
|
||||
|
||||
for (j = 0; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (j = 0; (j < i - 1); j++)
|
||||
{
|
||||
ret.push_back(char_array_3[j]);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
|
@@ -48,7 +48,7 @@ int tts(char *text,int threads, char *dst ) {

    // generate audio
    if (!bark_generate_audio(c, text, threads)) {
        fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
        fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
        return 1;
    }

@ -20,7 +20,7 @@ CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
|||
|
||||
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
|
||||
ifeq ($(BUILD_TYPE),cublas)
|
||||
CMAKE_ARGS+=-DGGML_CUDA=ON
|
||||
CMAKE_ARGS+=-DSD_CUDA=ON
|
||||
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||
# to CMAKE_ARGS automatically
|
||||
else ifeq ($(BUILD_TYPE),openblas)
|
||||
|
@ -30,14 +30,14 @@ else ifeq ($(BUILD_TYPE),clblas)
|
|||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON
|
||||
CMAKE_ARGS+=-DSD_HIPBLAS=ON
|
||||
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
|
||||
# But if it's OSX without metal, disable it here
|
||||
else ifeq ($(OS),Darwin)
|
||||
ifneq ($(BUILD_TYPE),metal)
|
||||
CMAKE_ARGS+=-DGGML_METAL=OFF
|
||||
CMAKE_ARGS+=-DSD_METAL=OFF
|
||||
else
|
||||
CMAKE_ARGS+=-DGGML_METAL=ON
|
||||
CMAKE_ARGS+=-DSD_METAL=ON
|
||||
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
|
||||
TARGET+=--target ggml-metal
|
||||
endif
|
||||
|
|
|
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
        context.SetTranslate(true)
    }

    if err := context.Process(data, nil, nil); err != nil {
    if err := context.Process(data, nil, nil, nil); err != nil {
        return pb.TranscriptResult{}, err
    }

@ -1,17 +0,0 @@
|
|||
.PHONY: autogptq
|
||||
autogptq: protogen
|
||||
bash install.sh
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
rm -rf venv __pycache__
|
|
@ -1,5 +0,0 @@
|
|||
# Creating a separate environment for the autogptq project
|
||||
|
||||
```
|
||||
make autogptq
|
||||
```
|
|
@ -1,153 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
from concurrent import futures
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import base64
|
||||
|
||||
import grpc
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
|
||||
from auto_gptq import AutoGPTQForCausalLM
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
from transformers import TextGenerationPipeline
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
device = "cuda:0"
|
||||
if request.Device != "":
|
||||
device = request.Device
|
||||
|
||||
# support loading local model files
|
||||
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
|
||||
|
||||
# support model `Qwen/Qwen-VL-Chat-Int4`
|
||||
if "qwen-vl" in request.Model.lower():
|
||||
self.model_name = "Qwen-VL-Chat"
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device_map="auto").eval()
|
||||
else:
|
||||
model = AutoGPTQForCausalLM.from_quantized(model_path,
|
||||
model_basename=request.ModelBaseName,
|
||||
use_safetensors=True,
|
||||
trust_remote_code=request.TrustRemoteCode,
|
||||
device=device,
|
||||
use_triton=request.UseTriton,
|
||||
quantize_config=None)
|
||||
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.0
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
top_p = 0.95
|
||||
if request.TopP != 0.0:
|
||||
top_p = request.TopP
|
||||
|
||||
|
||||
prompt_images = self.recompile_vl_prompt(request)
|
||||
compiled_prompt = prompt_images[0]
|
||||
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
|
||||
|
||||
# Implement Predict RPC
|
||||
pipeline = TextGenerationPipeline(
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
max_new_tokens=tokens,
|
||||
temperature=request.Temperature,
|
||||
top_p=top_p,
|
||||
repetition_penalty=penalty,
|
||||
)
|
||||
t = pipeline(compiled_prompt)[0]["generated_text"]
|
||||
print(f"generated_text: {t}", file=sys.stderr)
|
||||
|
||||
if compiled_prompt in t:
|
||||
t = t.replace(compiled_prompt, "")
|
||||
# house keeping. Remove the image files from /tmp folder
|
||||
for img_path in prompt_images[1]:
|
||||
try:
|
||||
os.remove(img_path)
|
||||
except Exception as e:
|
||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
def recompile_vl_prompt(self, request):
|
||||
prompt = request.Prompt
|
||||
image_paths = []
|
||||
|
||||
if "qwen-vl" in self.model_name.lower():
|
||||
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
|
||||
# Then, save the image file paths to an array "image_paths".
|
||||
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
|
||||
for i, img in enumerate(request.Images):
|
||||
timestamp = str(int(time.time() * 1000)) # Generate timestamp
|
||||
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
|
||||
with open(img_path, "wb") as f:
|
||||
f.write(base64.b64decode(img))
|
||||
image_paths.append(img_path)
|
||||
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
|
||||
else:
|
||||
prompt = request.Prompt
|
||||
return (prompt, image_paths)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
|
@ -1,14 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
|
||||
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
|
||||
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
|
||||
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
|
||||
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
||||
fi
|
||||
|
||||
installRequirements
|
|
@ -1,2 +0,0 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch==2.4.1+cu118
|
|
@ -1 +0,0 @@
|
|||
torch==2.4.1
|
|
@ -1,2 +0,0 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
||||
torch==2.4.1+rocm6.0
|
|
@ -1,6 +0,0 @@
|
|||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
||||
intel-extension-for-pytorch==2.3.110+xpu
|
||||
torch==2.3.1+cxx11.abi
|
||||
oneccl_bind_pt==2.3.100+xpu
|
||||
optimum[openvino]
|
||||
setuptools
|
|
@ -1,6 +0,0 @@
|
|||
accelerate
|
||||
auto-gptq==0.7.1
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
transformers
|
|
@ -1,4 +0,0 @@
|
|||
#!/bin/bash
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
|
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
                         options=[
                             ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
                             ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
                             ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
                         ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
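Worth noting for anyone wiring up a client against these backends: grpc-go's default receive limit is 4 MB per message, so a Go caller that needs to move similarly large payloads has to raise its own per-call limits to match the 50 MB server options above. A minimal sketch, assuming an illustrative address and using standard grpc-go dial options:

```go
package main

import (
    "log"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"
)

func main() {
    const maxMsg = 50 * 1024 * 1024 // 50MB, mirroring the server-side options above

    // "127.0.0.1:50051" is used only for illustration here.
    conn, err := grpc.NewClient("127.0.0.1:50051", // grpc-go >= 1.63; use grpc.Dial on older versions
        grpc.WithTransportCredentials(insecure.NewCredentials()),
        grpc.WithDefaultCallOptions(
            grpc.MaxCallRecvMsgSize(maxMsg),
            grpc.MaxCallSendMsgSize(maxMsg),
        ),
    )
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()
    // conn can now carry payloads (e.g. base64 images or audio) up to 50MB in either direction.
}
```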
@ -1,4 +1,4 @@
|
|||
bark==0.1.5
|
||||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
|
@ -1,3 +1,3 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
grpcio-tools
|
|
@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
packaging==24.1
|
|
@ -168,9 +168,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
# We are storing all the options in a dict so we can use it later when
|
||||
# generating the images
|
||||
for opt in options:
|
||||
if ":" not in opt:
|
||||
continue
|
||||
key, value = opt.split(":")
|
||||
self.options[key] = value
|
||||
|
||||
print(f"Options: {self.options}", file=sys.stderr)
|
||||
|
||||
local = False
|
||||
modelFile = request.Model
|
||||
|
||||
|
@ -522,7 +526,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
setuptools
|
||||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
pillow
|
||||
protobuf
|
||||
certifi
|
||||
|
|
|
@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
wheel
|
||||
|
|
|
@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
grpcio-tools
|
|
@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
return backend_pb2.Result(success=True)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
phonemizer
|
||||
scipy
|
||||
|
|
|
@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
return backend_pb2.RerankResult(usage=usage, results=results)
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
|
@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
# Add the servicer to the server
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
# Bind the server to the address
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
||||
|
|
|
@ -194,27 +194,40 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
await iterations.aclose()
|
||||
|
||||
async def _predict(self, request, context, streaming=False):
|
||||
# Build the sampling parameters
|
||||
# NOTE: this must stay in sync with the vllm backend
|
||||
request_to_sampling_params = {
|
||||
"N": "n",
|
||||
"PresencePenalty": "presence_penalty",
|
||||
"FrequencyPenalty": "frequency_penalty",
|
||||
"RepetitionPenalty": "repetition_penalty",
|
||||
"Temperature": "temperature",
|
||||
"TopP": "top_p",
|
||||
"TopK": "top_k",
|
||||
"MinP": "min_p",
|
||||
"Seed": "seed",
|
||||
"StopPrompts": "stop",
|
||||
"StopTokenIds": "stop_token_ids",
|
||||
"BadWords": "bad_words",
|
||||
"IncludeStopStrInOutput": "include_stop_str_in_output",
|
||||
"IgnoreEOS": "ignore_eos",
|
||||
"Tokens": "max_tokens",
|
||||
"MinTokens": "min_tokens",
|
||||
"Logprobs": "logprobs",
|
||||
"PromptLogprobs": "prompt_logprobs",
|
||||
"SkipSpecialTokens": "skip_special_tokens",
|
||||
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
|
||||
"TruncatePromptTokens": "truncate_prompt_tokens",
|
||||
"GuidedDecoding": "guided_decoding",
|
||||
}
|
||||
|
||||
# Build sampling parameters
|
||||
sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
|
||||
if request.TopP != 0:
|
||||
sampling_params.top_p = request.TopP
|
||||
if request.Tokens > 0:
|
||||
sampling_params.max_tokens = request.Tokens
|
||||
if request.Temperature != 0:
|
||||
sampling_params.temperature = request.Temperature
|
||||
if request.TopK != 0:
|
||||
sampling_params.top_k = request.TopK
|
||||
if request.PresencePenalty != 0:
|
||||
sampling_params.presence_penalty = request.PresencePenalty
|
||||
if request.FrequencyPenalty != 0:
|
||||
sampling_params.frequency_penalty = request.FrequencyPenalty
|
||||
if request.StopPrompts:
|
||||
sampling_params.stop = request.StopPrompts
|
||||
if request.IgnoreEOS:
|
||||
sampling_params.ignore_eos = request.IgnoreEOS
|
||||
if request.Seed != 0:
|
||||
sampling_params.seed = request.Seed
|
||||
|
||||
for request_field, param_field in request_to_sampling_params.items():
|
||||
if hasattr(request, request_field):
|
||||
value = getattr(request, request_field)
|
||||
if value not in (None, 0, [], False):
|
||||
setattr(sampling_params, param_field, value)
|
||||
|
||||
# Extract image paths and process images
|
||||
prompt = request.Prompt
|
||||
|
@ -320,7 +333,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|||
|
||||
async def serve(address):
|
||||
# Start asyncio gRPC server
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
|
||||
options=[
|
||||
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
|
||||
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
|
||||
])
|
||||
# Add the servicer to the server
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
# Bind the server to the address
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
grpcio==1.71.0
|
||||
grpcio==1.72.0
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
|
@ -75,6 +75,53 @@ class TestBackendServicer(unittest.TestCase):
|
|||
finally:
|
||||
self.tearDown()
|
||||
|
||||
def test_sampling_params(self):
|
||||
"""
|
||||
This method tests if all sampling parameters are correctly processed
|
||||
NOTE: this does NOT test for correctness, just that we received a compatible response
|
||||
"""
|
||||
try:
|
||||
self.setUp()
|
||||
with grpc.insecure_channel("localhost:50051") as channel:
|
||||
stub = backend_pb2_grpc.BackendStub(channel)
|
||||
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
|
||||
self.assertTrue(response.success)
|
||||
|
||||
req = backend_pb2.PredictOptions(
|
||||
Prompt="The capital of France is",
|
||||
TopP=0.8,
|
||||
Tokens=50,
|
||||
Temperature=0.7,
|
||||
TopK=40,
|
||||
PresencePenalty=0.1,
|
||||
FrequencyPenalty=0.2,
|
||||
RepetitionPenalty=1.1,
|
||||
MinP=0.05,
|
||||
Seed=42,
|
||||
StopPrompts=["\n"],
|
||||
StopTokenIds=[50256],
|
||||
BadWords=["badword"],
|
||||
IncludeStopStrInOutput=True,
|
||||
IgnoreEOS=True,
|
||||
MinTokens=5,
|
||||
Logprobs=5,
|
||||
PromptLogprobs=5,
|
||||
SkipSpecialTokens=True,
|
||||
SpacesBetweenSpecialTokens=True,
|
||||
TruncatePromptTokens=10,
|
||||
GuidedDecoding=True,
|
||||
N=2,
|
||||
)
|
||||
resp = stub.Predict(req)
|
||||
self.assertIsNotNone(resp.message)
|
||||
self.assertIsNotNone(resp.logprobs)
|
||||
except Exception as err:
|
||||
print(err)
|
||||
self.fail("sampling params service failed")
|
||||
finally:
|
||||
self.tearDown()
|
||||
|
||||
|
||||
def test_embedding(self):
|
||||
"""
|
||||
This method tests if the embeddings are generated successfully
|
||||
|
|
|
@ -43,18 +43,12 @@ func New(opts ...config.AppOption) (*Application, error) {
|
|||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create ModelPath: %q", err)
|
||||
}
|
||||
if options.ImageDir != "" {
|
||||
err := os.MkdirAll(options.ImageDir, 0750)
|
||||
if options.GeneratedContentDir != "" {
|
||||
err := os.MkdirAll(options.GeneratedContentDir, 0750)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create ImageDir: %q", err)
|
||||
}
|
||||
}
|
||||
if options.AudioDir != "" {
|
||||
err := os.MkdirAll(options.AudioDir, 0750)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to create AudioDir: %q", err)
|
||||
}
|
||||
}
|
||||
if options.UploadDir != "" {
|
||||
err := os.MkdirAll(options.UploadDir, 0750)
|
||||
if err != nil {
|
||||
|
|
|
@ -99,7 +99,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
|||
mmap = *c.MMap
|
||||
}
|
||||
|
||||
ctxSize := 1024
|
||||
ctxSize := 4096
|
||||
if c.ContextSize != nil {
|
||||
ctxSize = *c.ContextSize
|
||||
}
|
||||
|
@ -184,11 +184,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
|
|||
MainGPU: c.MainGPU,
|
||||
Threads: int32(*c.Threads),
|
||||
TensorSplit: c.TensorSplit,
|
||||
// AutoGPTQ
|
||||
ModelBaseName: c.AutoGPTQ.ModelBaseName,
|
||||
Device: c.AutoGPTQ.Device,
|
||||
UseTriton: c.AutoGPTQ.Triton,
|
||||
UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
|
||||
// RWKV
|
||||
Tokenizer: c.Tokenizer,
|
||||
}
|
||||
|
|
|
@ -35,12 +35,17 @@ func SoundGeneration(
|
|||
return "", nil, fmt.Errorf("could not load sound generation model")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
|
||||
if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
|
||||
filePath := filepath.Join(appConfig.AudioDir, fileName)
|
||||
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
|
||||
if err := os.MkdirAll(audioDir, 0750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
|
||||
filePath := filepath.Join(audioDir, fileName)
|
||||
|
||||
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
|
||||
Text: text,
|
||||
|
|
|
@ -32,12 +32,13 @@ func ModelTTS(
|
|||
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
|
||||
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
|
||||
if err := os.MkdirAll(audioDir, 0750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
|
||||
filePath := filepath.Join(appConfig.AudioDir, fileName)
|
||||
fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
|
||||
filePath := filepath.Join(audioDir, fileName)
|
||||
|
||||
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
|
||||
// This should be addressed in a follow up PR soon.
|
||||
|
|
core/backend/video.go (new file, 36 lines)
@@ -0,0 +1,36 @@
package backend

import (
    "github.com/mudler/LocalAI/core/config"

    "github.com/mudler/LocalAI/pkg/grpc/proto"
    model "github.com/mudler/LocalAI/pkg/model"
)

func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {

    opts := ModelOptions(backendConfig, appConfig)
    inferenceModel, err := loader.Load(
        opts...,
    )
    if err != nil {
        return nil, err
    }
    defer loader.Close()

    fn := func() error {
        _, err := inferenceModel.GenerateVideo(
            appConfig.Context,
            &proto.GenerateVideoRequest{
                Height:     height,
                Width:      width,
                Prompt:     prompt,
                StartImage: startImage,
                EndImage:   endImage,
                Dst:        dst,
            })
        return err
    }

    return fn, nil
}
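A brief usage sketch of the new helper; loader, cfg and appCfg are assumed to be initialized elsewhere, and the prompt and destination path are illustrative. The point is that VideoGeneration only prepares a closure, so the actual request runs when the returned fn is invoked (for example from a job queue):

```go
package backendexample // illustrative placement only

import (
    "github.com/mudler/LocalAI/core/backend"
    "github.com/mudler/LocalAI/core/config"
    model "github.com/mudler/LocalAI/pkg/model"
)

// generateClip shows the intended call pattern for the new helper.
func generateClip(loader *model.ModelLoader, cfg config.BackendConfig, appCfg *config.ApplicationConfig) error {
    fn, err := backend.VideoGeneration(
        512, 512, // height, width
        "a timelapse of drifting clouds", // prompt (illustrative)
        "", "", // optional start/end images
        "/tmp/generated/video/out.mp4", // destination path (illustrative)
        loader, cfg, appCfg,
    )
    if err != nil {
        return err
    }
    // The closure performs the actual generation; run it when convenient.
    return fn()
}
```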
@@ -1,11 +1,13 @@
package cliContext

import "embed"
import (
    rice "github.com/GeertJohan/go.rice"
)

type Context struct {
    Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
    LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`

    // This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
    BackendAssets embed.FS `kong:"-"`
    BackendAssets *rice.Box `kong:"-"`
}
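Unlike embed.FS, a go.rice box is looked up by name at runtime (with an optional appended or embedded archive), so callers fetch assets from the box rather than from a compile-time filesystem. A minimal sketch, with an assumed box name and asset path used purely for illustration:

```go
package main

import (
    "log"

    rice "github.com/GeertJohan/go.rice"
)

func main() {
    // "backend-assets" and the asset path are illustrative names only.
    box, err := rice.FindBox("backend-assets")
    if err != nil {
        log.Fatal(err)
    }
    data, err := box.Bytes("grpc/example-asset")
    if err != nil {
        log.Fatal(err)
    }
    log.Printf("loaded %d bytes from the rice box", len(data))
}
```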
@ -21,8 +21,7 @@ type RunCMD struct {
|
|||
|
||||
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
|
||||
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
|
||||
ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
|
||||
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
|
||||
GeneratedContentPath string `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"`
|
||||
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
|
||||
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
|
||||
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
|
||||
|
@ -47,7 +46,7 @@ type RunCMD struct {
|
|||
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
|
||||
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
|
||||
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
|
||||
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
|
||||
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"`
|
||||
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
|
||||
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
|
||||
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
|
||||
|
@ -81,8 +80,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
|||
config.WithModelPath(r.ModelsPath),
|
||||
config.WithContextSize(r.ContextSize),
|
||||
config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
|
||||
config.WithImageDir(r.ImagePath),
|
||||
config.WithAudioDir(r.AudioPath),
|
||||
config.WithGeneratedContentDir(r.GeneratedContentPath),
|
||||
config.WithUploadDir(r.UploadPath),
|
||||
config.WithConfigsDir(r.ConfigPath),
|
||||
config.WithDynamicConfigDir(r.LocalaiConfigDir),
|
||||
|
|
|
@ -70,7 +70,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
|
|||
opts := &config.ApplicationConfig{
|
||||
ModelPath: t.ModelsPath,
|
||||
Context: context.Background(),
|
||||
AudioDir: outputDir,
|
||||
GeneratedContentDir: outputDir,
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
ExternalGRPCBackends: externalBackends,
|
||||
}
|
||||
|
|
|
@ -36,10 +36,10 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
|
|||
text := strings.Join(t.Text, " ")
|
||||
|
||||
opts := &config.ApplicationConfig{
|
||||
ModelPath: t.ModelsPath,
|
||||
Context: context.Background(),
|
||||
AudioDir: outputDir,
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
ModelPath: t.ModelsPath,
|
||||
Context: context.Background(),
|
||||
GeneratedContentDir: outputDir,
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
}
|
||||
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
|
||||
|
||||
|
|
|
@ -7,11 +7,11 @@ import (
|
|||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
gguf "github.com/thxcode/gguf-parser-go"
|
||||
)
|
||||
|
||||
type UtilCMD struct {
|
||||
|
@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
|
|||
log.Info().
|
||||
Any("eosTokenID", f.Tokenizer().EOSTokenID).
|
||||
Any("bosTokenID", f.Tokenizer().BOSTokenID).
|
||||
Any("modelName", f.Model().Name).
|
||||
Any("modelName", f.Metadata().Name).
|
||||
Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
|
||||
|
||||
log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
|
||||
|
|
|
@ -2,11 +2,11 @@ package config
|
|||
|
||||
import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
rice "github.com/GeertJohan/go.rice"
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
@ -19,20 +19,21 @@ type ApplicationConfig struct {
|
|||
UploadLimitMB, Threads, ContextSize int
|
||||
F16 bool
|
||||
Debug bool
|
||||
ImageDir string
|
||||
AudioDir string
|
||||
UploadDir string
|
||||
ConfigsDir string
|
||||
DynamicConfigsDir string
|
||||
DynamicConfigsDirPollInterval time.Duration
|
||||
CORS bool
|
||||
CSRF bool
|
||||
PreloadJSONModels string
|
||||
PreloadModelsFromPath string
|
||||
CORSAllowOrigins string
|
||||
ApiKeys []string
|
||||
P2PToken string
|
||||
P2PNetworkID string
|
||||
GeneratedContentDir string
|
||||
|
||||
ConfigsDir string
|
||||
UploadDir string
|
||||
|
||||
DynamicConfigsDir string
|
||||
DynamicConfigsDirPollInterval time.Duration
|
||||
CORS bool
|
||||
CSRF bool
|
||||
PreloadJSONModels string
|
||||
PreloadModelsFromPath string
|
||||
CORSAllowOrigins string
|
||||
ApiKeys []string
|
||||
P2PToken string
|
||||
P2PNetworkID string
|
||||
|
||||
DisableWebUI bool
|
||||
EnforcePredownloadScans bool
|
||||
|
@ -46,7 +47,7 @@ type ApplicationConfig struct {
|
|||
|
||||
Galleries []Gallery
|
||||
|
||||
BackendAssets embed.FS
|
||||
BackendAssets *rice.Box
|
||||
AssetsDestination string
|
||||
|
||||
ExternalGRPCBackends map[string]string
|
||||
|
@ -197,7 +198,7 @@ func WithBackendAssetsOutput(out string) AppOption {
|
|||
}
|
||||
}
|
||||
|
||||
func WithBackendAssets(f embed.FS) AppOption {
|
||||
func WithBackendAssets(f *rice.Box) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.BackendAssets = f
|
||||
}
|
||||
|
@ -279,15 +280,9 @@ func WithDebug(debug bool) AppOption {
|
|||
}
|
||||
}
|
||||
|
||||
func WithAudioDir(audioDir string) AppOption {
|
||||
func WithGeneratedContentDir(generatedContentDir string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.AudioDir = audioDir
|
||||
}
|
||||
}
|
||||
|
||||
func WithImageDir(imageDir string) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.ImageDir = imageDir
|
||||
o.GeneratedContentDir = generatedContentDir
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -50,9 +50,6 @@ type BackendConfig struct {
|
|||
// LLM configs (GPT4ALL, Llama.cpp, ...)
|
||||
LLMConfig `yaml:",inline"`
|
||||
|
||||
// AutoGPTQ specifics
|
||||
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
|
||||
|
||||
// Diffusers
|
||||
Diffusers Diffusers `yaml:"diffusers"`
|
||||
Step int `yaml:"step"`
|
||||
|
@ -176,14 +173,6 @@ type LimitMMPerPrompt struct {
|
|||
LimitAudioPerPrompt int `yaml:"audio"`
|
||||
}
|
||||
|
||||
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
|
||||
type AutoGPTQ struct {
|
||||
ModelBaseName string `yaml:"model_base_name"`
|
||||
Device string `yaml:"device"`
|
||||
Triton bool `yaml:"triton"`
|
||||
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
|
||||
}
|
||||
|
||||
// TemplateConfig is a struct that holds the configuration of the templating system
|
||||
type TemplateConfig struct {
|
||||
// Chat is the template used in the chat completion endpoint
|
||||
|
@ -315,9 +304,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||
defaultTFZ := 1.0
|
||||
defaultZero := 0
|
||||
|
||||
// Try to offload all GPU layers (if GPU is found)
|
||||
defaultHigh := 99999999
|
||||
|
||||
trueV := true
|
||||
falseV := false
|
||||
|
||||
|
@ -377,9 +363,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||
if cfg.MirostatTAU == nil {
|
||||
cfg.MirostatTAU = &defaultMirostatTAU
|
||||
}
|
||||
if cfg.NGPULayers == nil {
|
||||
cfg.NGPULayers = &defaultHigh
|
||||
}
|
||||
|
||||
if cfg.LowVRAM == nil {
|
||||
cfg.LowVRAM = &falseV
|
||||
|
@ -447,18 +430,19 @@ func (c *BackendConfig) HasTemplate() bool {
|
|||
type BackendConfigUsecases int
|
||||
|
||||
const (
|
||||
FLAG_ANY BackendConfigUsecases = 0b00000000000
|
||||
FLAG_CHAT BackendConfigUsecases = 0b00000000001
|
||||
FLAG_COMPLETION BackendConfigUsecases = 0b00000000010
|
||||
FLAG_EDIT BackendConfigUsecases = 0b00000000100
|
||||
FLAG_EMBEDDINGS BackendConfigUsecases = 0b00000001000
|
||||
FLAG_RERANK BackendConfigUsecases = 0b00000010000
|
||||
FLAG_IMAGE BackendConfigUsecases = 0b00000100000
|
||||
FLAG_TRANSCRIPT BackendConfigUsecases = 0b00001000000
|
||||
FLAG_TTS BackendConfigUsecases = 0b00010000000
|
||||
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
|
||||
FLAG_TOKENIZE BackendConfigUsecases = 0b01000000000
|
||||
FLAG_VAD BackendConfigUsecases = 0b10000000000
|
||||
FLAG_ANY BackendConfigUsecases = 0b000000000000
|
||||
FLAG_CHAT BackendConfigUsecases = 0b000000000001
|
||||
FLAG_COMPLETION BackendConfigUsecases = 0b000000000010
|
||||
FLAG_EDIT BackendConfigUsecases = 0b000000000100
|
||||
FLAG_EMBEDDINGS BackendConfigUsecases = 0b000000001000
|
||||
FLAG_RERANK BackendConfigUsecases = 0b000000010000
|
||||
FLAG_IMAGE BackendConfigUsecases = 0b000000100000
|
||||
FLAG_TRANSCRIPT BackendConfigUsecases = 0b000001000000
|
||||
FLAG_TTS BackendConfigUsecases = 0b000010000000
|
||||
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b000100000000
|
||||
FLAG_TOKENIZE BackendConfigUsecases = 0b001000000000
|
||||
FLAG_VAD BackendConfigUsecases = 0b010000000000
|
||||
FLAG_VIDEO BackendConfigUsecases = 0b100000000000
|
||||
|
||||
// Common Subsets
|
||||
FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
|
||||
|
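The usecase flags above form a plain bitmask: each capability occupies a distinct bit, subsets such as FLAG_LLM are built with bitwise OR, and the GuessUsecases checks below test membership with bitwise AND. A tiny self-contained sketch of the same pattern (constants redeclared here only for illustration):

```go
package main

import "fmt"

type BackendConfigUsecases int

// Redeclared only to keep the sketch self-contained.
const (
    FLAG_CHAT       BackendConfigUsecases = 0b01
    FLAG_COMPLETION BackendConfigUsecases = 0b10
    FLAG_LLM                              = FLAG_CHAT | FLAG_COMPLETION
)

func main() {
    wanted := FLAG_LLM
    fmt.Println((wanted & FLAG_CHAT) == FLAG_CHAT) // true: chat is part of the LLM subset
    fmt.Println((wanted & FLAG_COMPLETION) != 0)   // true
}
```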
@ -479,6 +463,7 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
|
|||
"FLAG_TOKENIZE": FLAG_TOKENIZE,
|
||||
"FLAG_VAD": FLAG_VAD,
|
||||
"FLAG_LLM": FLAG_LLM,
|
||||
"FLAG_VIDEO": FLAG_VIDEO,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -543,6 +528,17 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
}
|
||||
if (u & FLAG_VIDEO) == FLAG_VIDEO {
|
||||
videoBackends := []string{"diffusers", "stablediffusion"}
|
||||
if !slices.Contains(videoBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
|
||||
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
}
|
||||
if (u & FLAG_RERANK) == FLAG_RERANK {
|
||||
if c.Backend != "rerankers" {
|
||||
|
@ -555,7 +551,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
|||
}
|
||||
}
|
||||
if (u & FLAG_TTS) == FLAG_TTS {
|
||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
||||
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
|
||||
if !slices.Contains(ttsBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -3,9 +3,10 @@ package config
|
|||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
gguf "github.com/thxcode/gguf-parser-go"
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
)
|
||||
|
||||
type familyType uint8
|
||||
|
@ -23,6 +24,7 @@ const (
|
|||
|
||||
const (
|
||||
defaultContextSize = 1024
|
||||
defaultNGPULayers = 99999999
|
||||
)
|
||||
|
||||
type settingsConfig struct {
|
||||
|
@ -147,7 +149,7 @@ var knownTemplates = map[string]familyType{
|
|||
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
|
||||
|
||||
if defaultCtx == 0 && cfg.ContextSize == nil {
|
||||
ctxSize := f.EstimateLLaMACppUsage().ContextSize
|
||||
ctxSize := f.EstimateLLaMACppRun().ContextSize
|
||||
if ctxSize > 0 {
|
||||
cSize := int(ctxSize)
|
||||
cfg.ContextSize = &cSize
|
||||
|
@ -157,6 +159,46 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|||
}
|
||||
}
|
||||
|
||||
// GPU options
|
||||
if cfg.Options == nil {
|
||||
if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
|
||||
cfg.Options = []string{"gpu"}
|
||||
}
|
||||
}
|
||||
|
||||
// vram estimation
|
||||
vram, err := xsysinfo.TotalAvailableVRAM()
|
||||
if err != nil {
|
||||
log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
|
||||
} else if vram > 0 {
|
||||
estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
|
||||
if err != nil {
|
||||
log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
|
||||
} else {
|
||||
if estimate.IsFullOffload {
|
||||
log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
|
||||
}
|
||||
|
||||
if estimate.EstimatedVRAM > vram {
|
||||
log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
|
||||
}
|
||||
|
||||
if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
|
||||
log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
|
||||
cfg.NGPULayers = &estimate.EstimatedLayers
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.NGPULayers == nil {
|
||||
// we assume we want to offload all layers
|
||||
defaultHigh := defaultNGPULayers
|
||||
cfg.NGPULayers = &defaultHigh
|
||||
}
|
||||
|
||||
log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")
|
||||
|
||||
// template estimations
|
||||
if cfg.HasTemplate() {
|
||||
// nothing to guess here
|
||||
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
|
||||
|
@ -166,12 +208,12 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|||
log.Debug().
|
||||
Any("eosTokenID", f.Tokenizer().EOSTokenID).
|
||||
Any("bosTokenID", f.Tokenizer().BOSTokenID).
|
||||
Any("modelName", f.Model().Name).
|
||||
Any("modelName", f.Metadata().Name).
|
||||
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
|
||||
|
||||
// guess the name
|
||||
if cfg.Name == "" {
|
||||
cfg.Name = f.Model().Name
|
||||
cfg.Name = f.Metadata().Name
|
||||
}
|
||||
|
||||
family := identifyFamily(f)
|
||||
|
@ -207,6 +249,7 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
|
|||
cfg.TemplateConfig.JinjaTemplate = true
|
||||
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func identifyFamily(f *gguf.GGUFFile) familyType {
|
||||
|
@ -231,7 +274,7 @@ func identifyFamily(f *gguf.GGUFFile) familyType {
|
|||
commandR := arch == "command-r" && eosTokenID == 255001
|
||||
qwen2 := arch == "qwen2"
|
||||
phi3 := arch == "phi-3"
|
||||
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
|
||||
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
|
||||
deepseek2 := arch == "deepseek2"
|
||||
|
||||
switch {
|
||||
|
|
|
@ -4,8 +4,8 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
"github.com/rs/zerolog/log"
|
||||
gguf "github.com/thxcode/gguf-parser-go"
|
||||
)
|
||||
|
||||
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
|
||||
|
|
|
@ -5,6 +5,8 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/dave-gray101/v2keyauth"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
|
@ -153,12 +155,19 @@ func API(application *application.Application) (*fiber.App, error) {
|
|||
Browse: true,
|
||||
}))
|
||||
|
||||
if application.ApplicationConfig().ImageDir != "" {
|
||||
router.Static("/generated-images", application.ApplicationConfig().ImageDir)
|
||||
}
|
||||
if application.ApplicationConfig().GeneratedContentDir != "" {
|
||||
os.MkdirAll(application.ApplicationConfig().GeneratedContentDir, 0750)
|
||||
audioPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "audio")
|
||||
imagePath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "images")
|
||||
videoPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "videos")
|
||||
|
||||
if application.ApplicationConfig().AudioDir != "" {
|
||||
router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
|
||||
os.MkdirAll(audioPath, 0750)
|
||||
os.MkdirAll(imagePath, 0750)
|
||||
os.MkdirAll(videoPath, 0750)
|
||||
|
||||
router.Static("/generated-audio", audioPath)
|
||||
router.Static("/generated-images", imagePath)
|
||||
router.Static("/generated-videos", videoPath)
|
||||
}
|
||||
|
||||
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
|
||||
|
|
|
@ -3,7 +3,6 @@ package http_test
|
|||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
|
@ -24,6 +23,7 @@ import (
|
|||
. "github.com/onsi/gomega"
|
||||
"gopkg.in/yaml.v3"
|
||||
|
||||
rice "github.com/GeertJohan/go.rice"
|
||||
openaigo "github.com/otiai10/openaigo"
|
||||
"github.com/sashabaranov/go-openai"
|
||||
"github.com/sashabaranov/go-openai/jsonschema"
|
||||
|
@ -264,8 +264,15 @@ func getRequest(url string, header http.Header) (error, int, []byte) {
|
|||
|
||||
const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
|
||||
|
||||
//go:embed backend-assets/*
|
||||
var backendAssets embed.FS
|
||||
var backendAssets *rice.Box
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
backendAssets, err = rice.FindBox("backend-assets")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
var _ = Describe("API test", func() {
|
||||
|
||||
|
@ -629,8 +636,7 @@ var _ = Describe("API test", func() {
|
|||
application, err := application.New(
|
||||
append(commonOpts,
|
||||
config.WithContext(c),
|
||||
config.WithAudioDir(tmpdir),
|
||||
config.WithImageDir(tmpdir),
|
||||
config.WithGeneratedContentDir(tmpdir),
|
||||
config.WithGalleries(galleries),
|
||||
config.WithModelPath(modelDir),
|
||||
config.WithBackendAssets(backendAssets),
|
||||
|
|
|
@ -32,7 +32,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
log.Debug().Str("modelName", input.ModelID).Msg("elevenlabs TTS request recieved")
|
||||
log.Debug().Str("modelName", input.ModelID).Msg("elevenlabs TTS request received")
|
||||
|
||||
filePath, _, err := backend.ModelTTS(input.Text, voiceID, input.LanguageCode, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
|
|
|
@ -30,7 +30,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
|||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
log.Debug().Str("model", input.Model).Msg("JINA Rerank Request recieved")
|
||||
log.Debug().Str("model", input.Model).Msg("JINA Rerank Request received")
|
||||
|
||||
request := &proto.RerankRequest{
|
||||
Query: input.Query,
|
||||
|
|
|
@ -34,7 +34,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
log.Debug().Str("model", input.Model).Msg("LocalAI TTS Request recieved")
|
||||
log.Debug().Str("model", input.Model).Msg("LocalAI TTS Request received")
|
||||
|
||||
if cfg.Backend == "" {
|
||||
if input.Backend != "" {
|
||||
|
|
|
@ -28,7 +28,7 @@ func VADEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
|
|||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
log.Debug().Str("model", input.Model).Msg("LocalAI VAD Request recieved")
|
||||
log.Debug().Str("model", input.Model).Msg("LocalAI VAD Request received")
|
||||
|
||||
resp, err := backend.VAD(input, c.Context(), ml, appConfig, *cfg)
|
||||
|
||||
|
|
core/http/endpoints/localai/video.go (new file, 205 lines)
|
@ -0,0 +1,205 @@
|
|||
package localai
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/http/middleware"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
func downloadFile(url string) (string, error) {
|
||||
// Get the data
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Create the file
|
||||
out, err := os.CreateTemp("", "video")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
// Write the body to file
|
||||
_, err = io.Copy(out, resp.Body)
|
||||
return out.Name(), err
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
/*
|
||||
*
|
||||
|
||||
curl http://localhost:8080/video \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "A cute baby sea otter",
    "width": 512,
    "height": 512
  }'
|
||||
|
||||
*
|
||||
*/
|
||||
// VideoEndpoint
|
||||
// @Summary Creates a video given a prompt.
|
||||
// @Param request body schema.OpenAIRequest true "query params"
|
||||
// @Success 200 {object} schema.OpenAIResponse "Response"
|
||||
// @Router /video [post]
|
||||
func VideoEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.VideoRequest)
|
||||
if !ok || input.Model == "" {
|
||||
log.Error().Msg("Video Endpoint - Invalid Input")
|
||||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
|
||||
if !ok || config == nil {
|
||||
log.Error().Msg("Video Endpoint - Invalid Config")
|
||||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
src := ""
|
||||
if input.StartImage != "" {
|
||||
|
||||
var fileData []byte
|
||||
var err error
|
||||
// check if input.StartImage is a URL; if so, download it and save it
// to a temporary file
|
||||
if strings.HasPrefix(input.StartImage, "http://") || strings.HasPrefix(input.StartImage, "https://") {
|
||||
out, err := downloadFile(input.StartImage)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed downloading file:%w", err)
|
||||
}
|
||||
defer os.RemoveAll(out)
|
||||
|
||||
fileData, err = os.ReadFile(out)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading file:%w", err)
|
||||
}
|
||||
|
||||
} else {
|
||||
// base 64 decode the file and write it somewhere
|
||||
// that we will cleanup
|
||||
fileData, err = base64.StdEncoding.DecodeString(input.StartImage)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write the base64 result
|
||||
writer := bufio.NewWriter(outputFile)
|
||||
_, err = writer.Write(fileData)
|
||||
if err != nil {
|
||||
outputFile.Close()
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
src = outputFile.Name()
|
||||
defer os.RemoveAll(src)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Parameter Config: %+v", config)
|
||||
|
||||
switch config.Backend {
|
||||
case "stablediffusion":
|
||||
config.Backend = model.StableDiffusionGGMLBackend
|
||||
case "":
|
||||
config.Backend = model.StableDiffusionGGMLBackend
|
||||
}
|
||||
|
||||
width := input.Width
|
||||
height := input.Height
|
||||
|
||||
if width == 0 {
|
||||
width = 512
|
||||
}
|
||||
if height == 0 {
|
||||
height = 512
|
||||
}
|
||||
|
||||
b64JSON := input.ResponseFormat == "b64_json"
|
||||
|
||||
tempDir := ""
|
||||
if !b64JSON {
|
||||
tempDir = filepath.Join(appConfig.GeneratedContentDir, "videos")
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(tempDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
|
||||
// TODO: use mime type to determine the extension
|
||||
output := outputFile.Name() + ".mp4"
|
||||
|
||||
// Rename the temporary file
|
||||
err = os.Rename(outputFile.Name(), output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
baseURL := c.BaseURL()
|
||||
|
||||
fn, err := backend.VideoGeneration(height, width, input.Prompt, src, input.EndImage, output, ml, *config, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := fn(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
item := &schema.Item{}
|
||||
|
||||
if b64JSON {
|
||||
defer os.RemoveAll(output)
|
||||
data, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
item.B64JSON = base64.StdEncoding.EncodeToString(data)
|
||||
} else {
|
||||
base := filepath.Base(output)
|
||||
item.URL = baseURL + "/generated-videos/" + base
|
||||
}
|
||||
|
||||
id := uuid.New().String()
|
||||
created := int(time.Now().Unix())
|
||||
resp := &schema.OpenAIResponse{
|
||||
ID: id,
|
||||
Created: created,
|
||||
Data: []schema.Item{*item},
|
||||
}
|
||||
|
||||
jsonResult, _ := json.Marshal(resp)
|
||||
log.Debug().Msgf("Response: %s", jsonResult)
|
||||
|
||||
// Return the prediction in the response body
|
||||
return c.JSON(resp)
|
||||
}
|
||||
}
|
|
@ -72,7 +72,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
|
|||
log.Error().Msg("Image Endpoint - Invalid Input")
|
||||
return fiber.ErrBadRequest
|
||||
}
|
||||
|
||||
|
||||
config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
|
||||
if !ok || config == nil {
|
||||
log.Error().Msg("Image Endpoint - Invalid Config")
|
||||
|
@ -108,7 +108,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
|
|||
}
|
||||
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(appConfig.ImageDir, "b64")
|
||||
outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -184,7 +184,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
|
|||
|
||||
tempDir := ""
|
||||
if !b64JSON {
|
||||
tempDir = appConfig.ImageDir
|
||||
tempDir = filepath.Join(appConfig.GeneratedContentDir, "images")
|
||||
}
|
||||
// Create a temporary file
|
||||
outputFile, err := os.CreateTemp(tempDir, "b64")
|
||||
|
@ -192,6 +192,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
|
|||
return err
|
||||
}
|
||||
outputFile.Close()
|
||||
|
||||
output := outputFile.Name() + ".png"
|
||||
// Rename the temporary file
|
||||
err = os.Rename(outputFile.Name(), output)
|
||||
|
|
|
@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
|
|||
config.Diffusers.ClipSkip = input.ClipSkip
|
||||
}
|
||||
|
||||
if input.ModelBaseName != "" {
|
||||
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
|
||||
}
|
||||
|
||||
if input.NegativePromptScale != 0 {
|
||||
config.NegativePromptScale = input.NegativePromptScale
|
||||
}
|
||||
|
||||
if input.UseFastTokenizer {
|
||||
config.UseFastTokenizer = input.UseFastTokenizer
|
||||
}
|
||||
|
||||
if input.NegativePrompt != "" {
|
||||
config.NegativePrompt = input.NegativePrompt
|
||||
}
|
||||
|
|
|
@ -59,6 +59,11 @@ func RegisterLocalAIRoutes(router *fiber.App,
|
|||
router.Get("/metrics", localai.LocalAIMetricsEndpoint())
|
||||
}
|
||||
|
||||
router.Post("/video",
|
||||
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_VIDEO)),
|
||||
requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.VideoRequest) }),
|
||||
localai.VideoEndpoint(cl, ml, appConfig))
|
||||
|
||||
// Backend Statistics Module
|
||||
// TODO: Should these use standard middlewares? Refactor later, they are extremely simple.
|
||||
backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
|
||||
|
|
|
@ -115,6 +115,7 @@ async function sendTextToChatGPT(text) {
|
|||
|
||||
const response = await fetch('v1/chat/completions', {
|
||||
method: 'POST',
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify({
|
||||
model: getModel(),
|
||||
messages: conversationHistory
|
||||
|
|
|
@ -24,6 +24,20 @@ type GalleryResponse struct {
|
|||
StatusURL string `json:"status"`
|
||||
}
|
||||
|
||||
type VideoRequest struct {
|
||||
BasicModelRequest
|
||||
Prompt string `json:"prompt" yaml:"prompt"`
|
||||
StartImage string `json:"start_image" yaml:"start_image"`
|
||||
EndImage string `json:"end_image" yaml:"end_image"`
|
||||
Width int32 `json:"width" yaml:"width"`
|
||||
Height int32 `json:"height" yaml:"height"`
|
||||
NumFrames int32 `json:"num_frames" yaml:"num_frames"`
|
||||
FPS int32 `json:"fps" yaml:"fps"`
|
||||
Seed int32 `json:"seed" yaml:"seed"`
|
||||
CFGScale float32 `json:"cfg_scale" yaml:"cfg_scale"`
|
||||
ResponseFormat string `json:"response_format" yaml:"response_format"`
|
||||
}
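For illustration, a request against the new `/video` route registered above might look like the sketch below (the model name is a placeholder, the `model` field is assumed to come from `BasicModelRequest`, and any omitted dimension falls back to the endpoint's 512x512 default):

```bash
curl http://localhost:8080/video \
  -H "Content-Type: application/json" \
  -d '{
    "model": "my-video-model",
    "prompt": "A cute baby sea otter swimming",
    "width": 512,
    "height": 512,
    "num_frames": 16,
    "fps": 8,
    "response_format": "url"
  }'
```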
|
||||
|
||||
// @Description TTS request body
|
||||
type TTSRequest struct {
|
||||
BasicModelRequest
|
||||
|
|
|
@ -202,7 +202,6 @@ type OpenAIRequest struct {
|
|||
|
||||
Backend string `json:"backend" yaml:"backend"`
|
||||
|
||||
// AutoGPTQ
|
||||
ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
|
||||
}
|
||||
|
||||
|
|
|
@ -41,8 +41,6 @@ type PredictionOptions struct {
|
|||
RopeFreqBase float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
|
||||
RopeFreqScale float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
|
||||
NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
|
||||
// AutoGPTQ
|
||||
UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
|
||||
|
||||
// Diffusers
|
||||
ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
|
||||
|
|
|
@ -268,14 +268,6 @@ yarn_ext_factor: 0
|
|||
yarn_attn_factor: 0
|
||||
yarn_beta_fast: 0
|
||||
yarn_beta_slow: 0
|
||||
|
||||
# AutoGPT-Q settings, for configurations specific to GPT models.
|
||||
autogptq:
|
||||
model_base_name: "" # Base name of the model.
|
||||
device: "" # Device to run the model on.
|
||||
triton: false # Whether to use Triton Inference Server.
|
||||
use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
|
||||
|
||||
# configuration for diffusers model
|
||||
diffusers:
|
||||
cuda: false # Whether to use CUDA
|
||||
|
@ -489,8 +481,7 @@ In the help text below, BASEPATH is the location that local-ai is being executed
|
|||
|-----------|---------|-------------|----------------------|
|
||||
| --models-path | BASEPATH/models | Path containing models used for inferencing | $LOCALAI_MODELS_PATH |
|
||||
| --backend-assets-path |/tmp/localai/backend_data | Path used to extract libraries that are required by some of the backends in runtime | $LOCALAI_BACKEND_ASSETS_PATH |
|
||||
| --image-path | /tmp/generated/images | Location for images generated by backends (e.g. stablediffusion) | $LOCALAI_IMAGE_PATH |
|
||||
| --audio-path | /tmp/generated/audio | Location for audio generated by backends (e.g. piper) | $LOCALAI_AUDIO_PATH |
|
||||
| --generated-content-path | /tmp/generated/content | Location for assets generated by backends (e.g. stablediffusion) | $LOCALAI_GENERATED_CONTENT_PATH |
|
||||
| --upload-path | /tmp/localai/upload | Path to store uploads from files api | $LOCALAI_UPLOAD_PATH |
|
||||
| --config-path | /tmp/localai/config | | $LOCALAI_CONFIG_PATH |
|
||||
| --localai-config-dir | BASEPATH/configuration | Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json) | $LOCALAI_CONFIG_DIR |
|
||||
|
@ -523,6 +514,7 @@ In the help text below, BASEPATH is the location that local-ai is being executed
|
|||
| --upload-limit | 15 | Default upload-limit in MB | $LOCALAI_UPLOAD_LIMIT |
|
||||
| --api-keys | API-KEYS,... | List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys | $LOCALAI_API_KEY |
|
||||
| --disable-welcome | | Disable welcome pages | $LOCALAI_DISABLE_WELCOME |
|
||||
| --disable-webui | false | Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface | $LOCALAI_DISABLE_WEBUI |
|
||||
| --machine-tag | | If not empty - put that string to Machine-Tag header in each response. Useful to track response from different machines using multiple P2P federated nodes | $LOCALAI_MACHINE_TAG |
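As a sketch of how these fit together (values are placeholders, and it assumes the `local-ai` binary starts the API server when invoked directly), the same options can be supplied through the environment variables listed above:

```bash
LOCALAI_MODELS_PATH=$PWD/models \
LOCALAI_GENERATED_CONTENT_PATH=/tmp/generated/content \
LOCALAI_API_KEY=my-secret-key \
LOCALAI_MACHINE_TAG=node-1 \
local-ai
```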
|
||||
|
||||
#### Backend Flags
|
||||
|
|
|
@ -23,8 +23,9 @@ List of the Environment Variables:
|
|||
|----------------------|--------------------------------------------------------------|
|
||||
| **DOCKER_INSTALL** | Set to "true" to enable the installation of Docker images. |
|
||||
| **USE_AIO** | Set to "true" to use the all-in-one LocalAI Docker image. |
|
||||
| **USE_EXTRAS** | Set to "true" to use images with extra Python dependencies. |
|
||||
| **USE_VULKAN** | Set to "true" to use Vulkan GPU support. |
|
||||
| **API_KEY** | Specify an API key for accessing LocalAI, if required. |
|
||||
| **CORE_IMAGES** | Set to "true" to download core LocalAI images. |
|
||||
| **PORT** | Specifies the port on which LocalAI will run (default is 8080). |
|
||||
| **THREADS** | Number of processor threads the application should use. Defaults to the number of logical cores minus one. |
|
||||
| **VERSION** | Specifies the version of LocalAI to install. Defaults to the latest available version. |
|
||||
|
@ -34,4 +35,20 @@ List of the Environment Variables:
|
|||
| **FEDERATED** | Set to "true" to share the instance with the federation (a p2p token is required; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}})) |
| **FEDERATED_SERVER** | Set to "true" to run the instance as a federation server which forwards requests to the federation (a p2p token is required; see the [documentation]({{%relref "docs/features/distributed_inferencing" %}})) |
|
||||
|
||||
## Image Selection
|
||||
|
||||
The installer will automatically detect your GPU and select the appropriate image. By default, it uses the standard images without extra Python dependencies. You can customize the image selection using the following environment variables:
|
||||
|
||||
- `USE_EXTRAS=true`: Use images with extra Python dependencies (larger images, ~17GB)
|
||||
- `USE_AIO=true`: Use all-in-one images that include all dependencies
|
||||
- `USE_VULKAN=true`: Use Vulkan GPU support instead of vendor-specific GPU support
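For example, several of these variables can be combined in a single installation, following the invocation style documented for the installer (values are placeholders):

```bash
DOCKER_INSTALL=true USE_EXTRAS=true PORT=9090 API_KEY=my-key curl https://localai.io/install.sh | sh
```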
|
||||
|
||||
## Uninstallation
|
||||
|
||||
To uninstall, run:
|
||||
|
||||
```
|
||||
curl https://localai.io/install.sh | sh -s -- --uninstall
|
||||
```
|
||||
|
||||
We are looking into improving the installer, and as this is a first iteration any feedback is welcome! Open up an [issue](https://github.com/mudler/LocalAI/issues/new/choose) if something doesn't work for you!
|
|
@ -57,12 +57,14 @@ diffusers:
|
|||
|
||||
Requirement: nvidia-container-toolkit (installation instructions [1](https://www.server-world.info/en/note?os=Ubuntu_22.04&p=nvidia&f=2) [2](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
|
||||
|
||||
If using a system with SELinux, ensure you have the policies installed, such as those [provided by nvidia](https://github.com/NVIDIA/dgx-selinux/).

To check which CUDA version you need, you can run either `nvidia-smi` or `nvcc --version`.
|
||||
|
||||
Alternatively, you can also check nvidia-smi with docker:
|
||||
|
||||
```
|
||||
docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi
|
||||
docker run --runtime=nvidia --rm nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi
|
||||
```
|
||||
|
||||
To use CUDA, use the images with the `cublas` tag, for example.
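A minimal run with a CUDA 12 image might look like the sketch below (the `latest-gpu-nvidia-cuda-12` tag is taken from the container images documentation; adjust it to the tag matching your CUDA version):

```bash
docker run -p 8080:8080 --gpus all \
  -v $PWD/models:/build/models \
  localai/localai:latest-gpu-nvidia-cuda-12
```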
|
||||
|
@ -112,7 +114,7 @@ llama_init_from_file: kv self size = 512.00 MB
|
|||
|
||||
## ROCM(AMD) acceleration
|
||||
|
||||
There are a limited number of tested configurations for ROCm systems; however, most newer dedicated consumer-grade GPU devices seem to be supported under the current ROCm6 implementation.
|
||||
|
||||
Due to the nature of ROCm, it is best to run all implementations in containers, as this limits the number of packages required for installation on the host system. Compatibility and package versions for dependencies across all variations of OS must be tested independently if desired; please refer to the [build]({{%relref "docs/getting-started/build#Acceleration" %}}) documentation.
|
||||
|
||||
|
@ -137,7 +139,7 @@ LocalAI hipblas images are built against the following targets: gfx900,gfx906,gf
|
|||
|
||||
If your device is not one of these, you must specify the corresponding `GPU_TARGETS` and set `REBUILD=true`. Otherwise, you don't need to specify these in the commands below.
|
||||
|
||||
### Verified
|
||||
### Verified
|
||||
|
||||
The devices in the following list have been tested with `hipblas` images running `ROCm 6.0.0`
|
||||
|
||||
|
@ -147,7 +149,6 @@ The devices in the following list have been tested with `hipblas` images running
|
|||
| diffusers | yes | Radeon VII (gfx906) |
|
||||
| piper | yes | Radeon VII (gfx906) |
|
||||
| whisper | no | none |
|
||||
| autogptq | no | none |
|
||||
| bark | no | none |
|
||||
| coqui | no | none |
|
||||
| transformers | no | none |
|
||||
|
@ -166,7 +167,7 @@ The devices in the following list have been tested with `hipblas` images running
|
|||
1. Check that your GPU LLVM target is compatible with the version of ROCm. This can be found in the [LLVM Docs](https://llvm.org/docs/AMDGPUUsage.html).
2. Check which ROCm version is compatible with your LLVM target and your chosen OS (pay special attention to supported kernel versions). See the compatibility notes for [ROCm 6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/reference/system-requirements.html) or [ROCm 6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html)
3. Install your chosen version of the `dkms` and `rocm` packages (it is recommended to use the native package manager for this process on any OS, as version changes are executed more easily via this method if updates are required). Take care to restart after installing `amdgpu-dkms` and before installing `rocm`; for details, see the installation documentation for your chosen OS ([6.0.2](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/native-install/index.html) or [6.0.0](https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.0.0/how-to/native-install/index.html))
|
||||
4. Deploy. Yes it's that easy.
|
||||
4. Deploy. Yes it's that easy.
|
||||
|
||||
#### Setup Example (Docker/containerd)
|
||||
|
||||
|
@ -248,7 +249,7 @@ This configuration has been tested on a 'custom' cluster managed by SUSE Rancher
|
|||
|
||||
- When installing the ROCm kernel driver on your system, ensure that you are installing an equal or newer version than the one currently implemented in LocalAI (6.0.0 at the time of writing).
- AMD documentation indicates that this will ensure functionality; however, your mileage may vary depending on the GPU and distro you are using.
- If you encounter an `Error 413` when attempting to upload an audio file or image for whisper or llava/bakllava on a k8s deployment, note that the ingress for your deployment may require the annotation `nginx.ingress.kubernetes.io/proxy-body-size: "25m"` to allow larger uploads. This may be included in future versions of the helm chart.
|
||||
|
||||
## Intel acceleration (sycl)
|
||||
|
||||
|
@ -279,3 +280,36 @@ docker run --rm -ti --device /dev/dri -p 8080:8080 -e DEBUG=true -e MODELS_PATH=
|
|||
```
|
||||
|
||||
Note also that sycl has a known issue where it hangs with `mmap: true`; if it is explicitly enabled, you have to disable it in the model configuration.
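For example (a sketch; the file name is a placeholder), the setting can be turned off in the model's YAML config:

```bash
# explicitly disable mmap for a model served from the sycl images
echo "mmap: false" >> models/my-model.yaml
```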
|
||||
|
||||
## Vulkan acceleration
|
||||
|
||||
### Requirements
|
||||
|
||||
If using nvidia, follow the steps in the [CUDA](#cudanvidia-acceleration) section to configure your docker runtime to allow access to the GPU.
|
||||
|
||||
### Container images
|
||||
|
||||
To use Vulkan, use the images with the `vulkan` tag, for example `{{< version >}}-vulkan-ffmpeg-core`.
|
||||
|
||||
#### Example
|
||||
|
||||
To run LocalAI with Docker and Vulkan, you can use the following command as an example:
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -e DEBUG=true -v $PWD/models:/build/models localai/localai:latest-vulkan-ffmpeg-core
|
||||
```
|
||||
|
||||
### Notes
|
||||
|
||||
In addition to the commands to run LocalAI normally, you need to specify additional flags to pass the GPU hardware to the container.
|
||||
|
||||
These flags are the same as the sections above, depending on the hardware, for [nvidia](#cudanvidia-acceleration), [AMD](#rocmamd-acceleration) or [Intel](#intel-acceleration-sycl).
|
||||
|
||||
If you have mixed hardware, you can pass flags for multiple GPUs, for example:
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -e DEBUG=true -v $PWD/models:/build/models \
|
||||
--gpus=all \ # nvidia passthrough
|
||||
--device /dev/dri --device /dev/kfd \ # AMD/Intel passthrough
|
||||
localai/localai:latest-vulkan-ffmpeg-core
|
||||
```
|
|
@ -74,49 +74,9 @@ curl http://localhost:8080/v1/models
|
|||
|
||||
## Backends
|
||||
|
||||
### AutoGPTQ
|
||||
|
||||
[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is an easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.
|
||||
|
||||
#### Prerequisites
|
||||
|
||||
This is an extra backend - in the container images is already available and there is nothing to do for the setup.
|
||||
|
||||
If you are building LocalAI locally, you need to install [AutoGPTQ manually](https://github.com/PanQiWei/AutoGPTQ#quick-installation).
|
||||
|
||||
|
||||
#### Model setup
|
||||
|
||||
The models are automatically downloaded from `huggingface` if not present the first time. It is possible to define models via `YAML` config file, or just by querying the endpoint with the `huggingface` repository model name. For example, create a `YAML` config file in `models/`:
|
||||
|
||||
```
|
||||
name: orca
|
||||
backend: autogptq
|
||||
model_base_name: "orca_mini_v2_13b-GPTQ-4bit-128g.no-act.order"
|
||||
parameters:
|
||||
model: "TheBloke/orca_mini_v2_13b-GPTQ"
|
||||
# ...
|
||||
```
|
||||
|
||||
Test with:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "orca",
|
||||
"messages": [{"role": "user", "content": "How are you?"}],
|
||||
"temperature": 0.1
|
||||
}'
|
||||
```
|
||||
### RWKV
|
||||
|
||||
A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv).
|
||||
|
||||
Note: rwkv models needs to specify the backend `rwkv` in the YAML config files and have an associated tokenizer along that needs to be provided with it:
|
||||
|
||||
```
|
||||
36464540 -rw-r--r-- 1 mudler mudler 1.2G May 3 10:51 rwkv_small
|
||||
36464543 -rw-r--r-- 1 mudler mudler 2.4M May 3 10:51 rwkv_small.tokenizer.json
|
||||
```
|
||||
RWKV support is available through llama.cpp (see below)
|
||||
|
||||
### llama.cpp
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ ico = "rocket_launch"
|
|||
|
||||
### Build
|
||||
|
||||
LocalAI can be built as a container image or as a single, portable binary. Note that the some model architectures might require Python libraries, which are not included in the binary. The binary contains only the core backends written in Go and C++.
|
||||
LocalAI can be built as a container image or as a single, portable binary. Note that some model architectures might require Python libraries, which are not included in the binary. The binary contains only the core backends written in Go and C++.
|
||||
|
||||
LocalAI's extensible architecture allows you to add your own backends, which can be written in any language; as such, the container images also contain the Python dependencies needed to run all the available backends (for example, backends like __Diffusers__, which generate images and videos from text).
|
||||
|
||||
|
@ -189,7 +189,7 @@ sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
|
|||
|
||||
- If completions are slow, ensure that `gpu-layers` in your model yaml matches the number of layers from the model in use (or simply use a high number such as 256).
|
||||
|
||||
- If you a get a compile error: `error: only virtual member functions can be marked 'final'`, reinstall all the necessary brew packages, clean the build, and try again.
|
||||
- If you get a compile error: `error: only virtual member functions can be marked 'final'`, reinstall all the necessary brew packages, clean the build, and try again.
|
||||
|
||||
```
|
||||
# reinstall build dependencies
|
||||
|
|
|
@ -39,7 +39,7 @@ Before you begin, ensure you have a container engine installed if you are not us
|
|||
|
||||
## All-in-one images
|
||||
|
||||
All-In-One images are images that come pre-configured with a set of models and backends to fully leverage almost all the LocalAI featureset. These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and requires no configuration. Models configuration can be found [here](https://github.com/mudler/LocalAI/tree/master/aio) separated by size.
|
||||
All-In-One images are images that come pre-configured with a set of models and backends to fully leverage almost all the LocalAI featureset. These images are available for both CPU and GPU environments. The AIO images are designed to be easy to use and require no configuration. Models configuration can be found [here](https://github.com/mudler/LocalAI/tree/master/aio) separated by size.
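As a quick illustration (a sketch; the `latest-aio-cpu` tag is assumed to follow the AIO naming scheme), starting the CPU AIO image looks like:

```bash
docker run -p 8080:8080 --name local-ai -ti localai/localai:latest-aio-cpu
```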
|
||||
|
||||
In the AIO images there are models configured with the names of OpenAI models; however, they are actually backed by open-source models. You can find the mapping in the table below.
|
||||
|
||||
|
@ -150,7 +150,7 @@ The AIO Images are inheriting the same environment variables as the base images
|
|||
|
||||
Standard container images do not have pre-installed models.
|
||||
|
||||
Images are available with and without python dependencies. Note that images with python dependencies are bigger (in order of 17GB).
|
||||
Images are available with and without python dependencies (images with the `extras` suffix). Note that images with python dependencies are bigger (in order of 17GB).
|
||||
|
||||
Images with `core` in the tag are smaller and do not contain any python dependencies.
|
||||
|
||||
|
@ -160,10 +160,8 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| Description | Quay | Docker Hub |
|
||||
| --- | --- |-----------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master` | `localai/localai:master` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-cpu` | `localai/localai:latest-cpu` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest` | `localai/localai:latest` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}` | `localai/localai:{{< version >}}` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg` | `localai/localai:{{< version >}}-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-ffmpeg-core` | `localai/localai:{{< version >}}-ffmpeg-core` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
@ -172,10 +170,9 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| Description | Quay | Docker Hub |
|
||||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda11` | `localai/localai:master-cublas-cuda11` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-11` | `localai/localai:latest-gpu-nvidia-cuda-11` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-11` | `localai/localai:latest-gpu-nvidia-cuda-11` |
|
||||
| Latest tag with extras | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-11-extras` | `localai/localai:latest-gpu-nvidia-cuda-11-extras` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11` | `localai/localai:{{< version >}}-cublas-cuda11` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda11-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda11-ffmpeg-core` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
@ -185,9 +182,8 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-cublas-cuda12` | `localai/localai:master-cublas-cuda12` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12` | `localai/localai:latest-gpu-nvidia-cuda-12` |
|
||||
| Latest tag with extras | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12-extras` | `localai/localai:latest-gpu-nvidia-cuda-12-extras` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12` | `localai/localai:{{< version >}}-cublas-cuda12` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-cublas-cuda12-ffmpeg-core` | `localai/localai:{{< version >}}-cublas-cuda12-ffmpeg-core` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
@ -197,9 +193,8 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-sycl-f16` | `localai/localai:master-sycl-f16` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-intel-f16` | `localai/localai:latest-gpu-intel-f16` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-core` | `localai/localai:{{< version >}}-sycl-f16-core` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-ffmpeg` | `localai/localai:{{< version >}}-sycl-f16-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16-ffmpeg-core` | `localai/localai:{{< version >}}-sycl-f16-ffmpeg-core` |
|
||||
| Latest tag with extras | `quay.io/go-skynet/local-ai:latest-gpu-intel-f16-extras` | `localai/localai:latest-gpu-intel-f16-extras` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f16` | `localai/localai:{{< version >}}-sycl-f16` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
@ -209,9 +204,8 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-sycl-f32` | `localai/localai:master-sycl-f32` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-intel-f32` | `localai/localai:latest-gpu-intel-f32` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32-core` | `localai/localai:{{< version >}}-sycl-f32-core` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32-ffmpeg` | `localai/localai:{{< version >}}-sycl-f32-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32-ffmpeg-core` | `localai/localai:{{< version >}}-sycl-f32-ffmpeg-core` |
|
||||
| Latest tag with extras | `quay.io/go-skynet/local-ai:latest-gpu-intel-f32-extras` | `localai/localai:latest-gpu-intel-f32-extras` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-sycl-f32` | `localai/localai:{{< version >}}-sycl-f32` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
@ -220,20 +214,18 @@ Images with `core` in the tag are smaller and do not contain any python dependen
|
|||
| Description | Quay | Docker Hub |
|
||||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-hipblas` | `localai/localai:master-hipblas` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-hipblas` | `localai/localai:latest-gpu-hipblas` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-hipblas` | `localai/localai:latest-gpu-hipblas` |
|
||||
| Latest tag with extras | `quay.io/go-skynet/local-ai:latest-gpu-hipblas-extras` | `localai/localai:latest-gpu-hipblas-extras` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-hipblas` | `localai/localai:{{< version >}}-hipblas` |
|
||||
| Versioned image including FFMpeg| `quay.io/go-skynet/local-ai:{{< version >}}-hipblas-ffmpeg` | `localai/localai:{{< version >}}-hipblas-ffmpeg` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-hipblas-ffmpeg-core` | `localai/localai:{{< version >}}-hipblas-ffmpeg-core` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
||||
{{% tab tabName="Vulkan Images" %}}
|
||||
| Description | Quay | Docker Hub |
|
||||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai: master-vulkan-ffmpeg-core ` | `localai/localai: master-vulkan-ffmpeg-core ` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai: latest-vulkan-ffmpeg-core ` | `localai/localai: latest-vulkan-ffmpeg-core` |
|
||||
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` |
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-vulkan` | `localai/localai:master-vulkan` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-vulkan` | `localai/localai:latest-gpu-vulkan` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan` | `localai/localai:{{< version >}}-vulkan` |
|
||||
{{% /tab %}}
|
||||
|
||||
{{% tab tabName="Nvidia Linux for tegra" %}}
|
||||
|
@ -242,9 +234,9 @@ These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano,
|
|||
|
||||
| Description | Quay | Docker Hub |
|
||||
| --- | --- |-------------------------------------------------------------|
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core` | `localai/localai:master-nvidia-l4t-arm64-core` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-core` | `localai/localai:latest-nvidia-l4t-arm64-core` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64-core` | `localai/localai:{{< version >}}-nvidia-l4t-arm64-core` |
|
||||
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64` | `localai/localai:master-nvidia-l4t-arm64` |
|
||||
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64` | `localai/localai:latest-nvidia-l4t-arm64` |
|
||||
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64` | `localai/localai:{{< version >}}-nvidia-l4t-arm64` |
|
||||
|
||||
{{% /tab %}}
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ ico = "rocket_launch"
|
|||
+++
|
||||
|
||||
|
||||
For installing LocalAI in Kubernetes, the deployment file from the `examples` can be used and customized as prefered:
|
||||
For installing LocalAI in Kubernetes, the deployment file from the `examples` can be used and customized as preferred:
|
||||
|
||||
```
|
||||
kubectl apply -f https://raw.githubusercontent.com/mudler/LocalAI-examples/refs/heads/main/kubernetes/deployment.yaml
|
||||
|
@ -29,7 +29,7 @@ helm repo update
|
|||
# Get the values
|
||||
helm show values go-skynet/local-ai > values.yaml
|
||||
|
||||
# Edit the values value if needed
|
||||
# Edit the values if needed
|
||||
# vim values.yaml ...
|
||||
|
||||
# Install the helm chart
|
||||
|
|
|
@ -14,18 +14,19 @@ icon = "rocket_launch"
|
|||
|
||||
If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately with a mechanism that filters incoming traffic, or alternatively run LocalAI with `API_KEY` to gate access with an API key. The API key grants full access to all features (there is no role separation), so it should be treated like an admin credential.
|
||||
|
||||
To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI.
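As an illustration of gated access (a sketch; it assumes the usual bearer-token `Authorization` header for API keys):

```bash
curl http://localhost:8080/v1/models \
  -H "Authorization: Bearer $API_KEY"
```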
|
||||
|
||||
{{% /alert %}}
|
||||
|
||||
## Quickstart
|
||||
|
||||
|
||||
### Using the Bash Installer
|
||||
|
||||
```bash
|
||||
# Basic installation
|
||||
curl https://localai.io/install.sh | sh
|
||||
```
|
||||
|
||||
See [Installer]({{% relref "docs/advanced/installer" %}}) for all the supported options
|
||||
|
||||
### Run with docker:
|
||||
```bash
|
||||
# CPU only image:
|
||||
|
@ -100,6 +101,57 @@ The AIO images come pre-configured with the following features:
|
|||
|
||||
For instructions on using AIO images, see [Using container images]({{% relref "docs/getting-started/container-images#all-in-one-images" %}}).
|
||||
|
||||
## Using LocalAI and the full stack with LocalAGI
|
||||
|
||||
LocalAI is part of the Local family stack, along with LocalAGI and LocalRecall.
|
||||
|
||||
[LocalAGI](https://github.com/mudler/LocalAGI) is a powerful, self-hostable AI Agent platform designed for maximum privacy and flexibility, which encompasses and uses the whole software stack. It provides a complete drop-in replacement for OpenAI's Responses APIs with advanced agentic capabilities, running entirely locally on consumer-grade hardware (CPU and GPU).
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/mudler/LocalAGI
|
||||
cd LocalAGI
|
||||
|
||||
# CPU setup (default)
|
||||
docker compose up
|
||||
|
||||
# NVIDIA GPU setup
|
||||
docker compose -f docker-compose.nvidia.yaml up
|
||||
|
||||
# Intel GPU setup (for Intel Arc and integrated GPUs)
|
||||
docker compose -f docker-compose.intel.yaml up
|
||||
|
||||
# Start with a specific model (see available models in models.localai.io, or localai.io to use any model in huggingface)
|
||||
MODEL_NAME=gemma-3-12b-it docker compose up
|
||||
|
||||
# NVIDIA GPU setup with custom multimodal and image models
|
||||
MODEL_NAME=gemma-3-12b-it \
|
||||
MULTIMODAL_MODEL=minicpm-v-2_6 \
|
||||
IMAGE_MODEL=flux.1-dev-ggml \
|
||||
docker compose -f docker-compose.nvidia.yaml up
|
||||
```
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Privacy-Focused**: All processing happens locally, ensuring your data never leaves your machine
|
||||
- **Flexible Deployment**: Supports CPU, NVIDIA GPU, and Intel GPU configurations
|
||||
- **Multiple Model Support**: Compatible with various models from Hugging Face and other sources
|
||||
- **Web Interface**: User-friendly chat interface for interacting with AI agents
|
||||
- **Advanced Capabilities**: Supports multimodal models, image generation, and more
|
||||
- **Docker Integration**: Easy deployment using Docker Compose
|
||||
|
||||
### Environment Variables
|
||||
|
||||
You can customize your LocalAGI setup using the following environment variables:
|
||||
|
||||
- `MODEL_NAME`: Specify the model to use (e.g., `gemma-3-12b-it`)
|
||||
- `MULTIMODAL_MODEL`: Set a custom multimodal model
|
||||
- `IMAGE_MODEL`: Configure an image generation model
|
||||
|
||||
For more advanced configuration and API documentation, visit the [LocalAGI GitHub repository](https://github.com/mudler/LocalAGI).
|
||||
|
||||
## What's Next?
|
||||
|
||||
There is much more to explore with LocalAI! You can run any model from Hugging Face, perform video generation, and also voice cloning. For a comprehensive overview, check out the [features]({{% relref "docs/features" %}}) section.
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
{
|
||||
"version": "v2.27.0"
|
||||
"version": "v2.29.0"
|
||||
}
|
||||
|
|
docs/static/install.sh (vendored, 387 lines, Normal file → Executable file)
|
@ -1,30 +1,76 @@
|
|||
#!/bin/sh
|
||||
# This script installs LocalAI on Linux.
|
||||
# It detects the current operating system architecture and installs the appropriate version of LocalAI.
|
||||
# LocalAI Installer Script
|
||||
# This script installs LocalAI on Linux and macOS systems.
|
||||
# It automatically detects the system architecture and installs the appropriate version.
|
||||
|
||||
# Usage:
|
||||
# curl ... | ENV_VAR=... sh -
|
||||
# or
|
||||
# ENV_VAR=... ./install.sh
|
||||
# Basic installation:
|
||||
# curl https://localai.io/install.sh | sh
|
||||
#
|
||||
# With environment variables:
|
||||
# DOCKER_INSTALL=true USE_AIO=true API_KEY=your-key PORT=8080 THREADS=4 curl https://localai.io/install.sh | sh
|
||||
#
|
||||
# To uninstall:
|
||||
# curl https://localai.io/install.sh | sh -s -- --uninstall
|
||||
#
|
||||
# Environment Variables:
|
||||
# DOCKER_INSTALL - Set to "true" to install Docker images (default: auto-detected)
|
||||
# USE_AIO - Set to "true" to use the all-in-one LocalAI image (default: false)
|
||||
# USE_EXTRAS - Set to "true" to use images with extra Python dependencies (default: false)
|
||||
# USE_VULKAN - Set to "true" to use Vulkan GPU support (default: false)
|
||||
# API_KEY - API key for securing LocalAI access (default: none)
|
||||
# PORT - Port to run LocalAI on (default: 8080)
|
||||
# THREADS - Number of CPU threads to use (default: auto-detected)
|
||||
# MODELS_PATH - Path to store models (default: /usr/share/local-ai/models)
|
||||
# CORE_IMAGES - Set to "true" to download core LocalAI images (default: false)
|
||||
# P2P_TOKEN - Token for P2P federation/worker mode (default: none)
|
||||
# WORKER - Set to "true" to run as a worker node (default: false)
|
||||
# FEDERATED - Set to "true" to enable federation mode (default: false)
|
||||
# FEDERATED_SERVER - Set to "true" to run as a federation server (default: false)
|
||||
|
||||
set -e
|
||||
set -o noglob
|
||||
#set -x
|
||||
|
||||
# --- helper functions for logs ---
|
||||
# ANSI escape codes
|
||||
LIGHT_BLUE='\033[38;5;117m'
|
||||
ORANGE='\033[38;5;214m'
|
||||
RED='\033[38;5;196m'
|
||||
BOLD='\033[1m'
|
||||
RESET='\033[0m'
|
||||
|
||||
info()
|
||||
{
|
||||
echo ' ' "$@"
|
||||
echo -e "${BOLD}${LIGHT_BLUE}" '[INFO] ' "$@" "${RESET}"
|
||||
}
|
||||
|
||||
warn()
|
||||
{
|
||||
echo '[WARN] ' "$@" >&2
|
||||
echo -e "${BOLD}${ORANGE}" '[WARN] ' "$@" "${RESET}" >&2
|
||||
}
|
||||
|
||||
fatal()
|
||||
{
|
||||
echo '[ERROR] ' "$@" >&2
|
||||
echo -e "${BOLD}${RED}" '[ERROR] ' "$@" "${RESET}" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
# --- custom choice functions ---
|
||||
# like the logging functions, but with the -n flag to prevent the new line and keep the cursor in line for choices inputs like y/n
|
||||
choice_info()
|
||||
{
|
||||
echo -e -n "${BOLD}${LIGHT_BLUE}" '[INFO] ' "$@" "${RESET}"
|
||||
}
|
||||
|
||||
choice_warn()
|
||||
{
|
||||
echo -e -n "${BOLD}${ORANGE}" '[WARN] ' "$@" "${RESET}" >&2
|
||||
}
|
||||
|
||||
choice_fatal()
|
||||
{
|
||||
echo -e -n "${BOLD}${RED}" '[ERROR] ' "$@" "${RESET}" >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
|
@ -57,10 +103,65 @@ require() {
|
|||
echo $MISSING
|
||||
}
|
||||
|
||||
# Function to uninstall LocalAI
|
||||
uninstall_localai() {
|
||||
info "Starting LocalAI uninstallation..."
|
||||
|
||||
# Stop and remove Docker container if it exists
|
||||
if available docker && $SUDO docker ps -a --format '{{.Names}}' | grep -q local-ai; then
|
||||
info "Stopping and removing LocalAI Docker container..."
|
||||
$SUDO docker stop local-ai || true
|
||||
$SUDO docker rm local-ai || true
|
||||
$SUDO docker volume rm local-ai-data || true
|
||||
fi
|
||||
|
||||
# Remove systemd service if it exists
|
||||
if [ -f "/etc/systemd/system/local-ai.service" ]; then
|
||||
info "Removing systemd service..."
|
||||
$SUDO systemctl stop local-ai || true
|
||||
$SUDO systemctl disable local-ai || true
|
||||
$SUDO rm -f /etc/systemd/system/local-ai.service
|
||||
$SUDO systemctl daemon-reload
|
||||
fi
|
||||
|
||||
# Remove environment file
|
||||
if [ -f "/etc/localai.env" ]; then
|
||||
info "Removing environment file..."
|
||||
$SUDO rm -f /etc/localai.env
|
||||
fi
|
||||
|
||||
# Remove binary
|
||||
for BINDIR in /usr/local/bin /usr/bin /bin; do
|
||||
if [ -f "$BINDIR/local-ai" ]; then
|
||||
info "Removing binary from $BINDIR..."
|
||||
$SUDO rm -f "$BINDIR/local-ai"
|
||||
fi
|
||||
done
|
||||
|
||||
# Remove models directory
|
||||
if [ -d "/usr/share/local-ai" ]; then
|
||||
info "Removing LocalAI data directory..."
|
||||
$SUDO rm -rf /usr/share/local-ai
|
||||
fi
|
||||
|
||||
# Remove local-ai user if it exists
|
||||
if id local-ai >/dev/null 2>&1; then
|
||||
info "Removing local-ai user..."
|
||||
$SUDO userdel -r local-ai || true
|
||||
fi
|
||||
|
||||
info "LocalAI has been successfully uninstalled."
|
||||
exit 0
|
||||
}
|
||||
|
||||
|
||||
|
||||
## VARIABLES

# DOCKER_INSTALL - set to "true" to install Docker images
# USE_AIO - set to "true" to install the all-in-one LocalAI image
# USE_EXTRAS - set to "true" to use images with extra Python dependencies
# USE_VULKAN - set to "true" to use Vulkan GPU support
PORT=${PORT:-8080}

docker_found=false
@@ -74,6 +175,8 @@ fi

DOCKER_INSTALL=${DOCKER_INSTALL:-$docker_found}
USE_AIO=${USE_AIO:-false}
USE_EXTRAS=${USE_EXTRAS:-false}
USE_VULKAN=${USE_VULKAN:-false}
API_KEY=${API_KEY:-}
CORE_IMAGES=${CORE_IMAGES:-false}
P2P_TOKEN=${P2P_TOKEN:-}

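# Illustrative example (not executed here): to force a Docker-based install with the
# all-in-one CUDA image and the images with extra Python dependencies, these defaults
# can be overridden from the environment before running the script, e.g. (assuming
# the script was saved locally as install.sh):
#
#   DOCKER_INSTALL=true USE_AIO=true USE_EXTRAS=true API_KEY=my-secret-key sh install.sh
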
@@ -89,7 +192,7 @@ else
fi
THREADS=${THREADS:-$procs}
LATEST_VERSION=$(curl -s "https://api.github.com/repos/mudler/LocalAI/releases/latest" | grep '"tag_name":' | sed -E 's/.*"([^"]+)".*/\1/')
LOCALAI_VERSION="${LOCALAI_VERSION:-$LATEST_VERSION}" # LOCALAI_VERSION is used instead of VERSION because VERSION is already defined in Fedora 42 Cloud Edition
MODELS_PATH=${MODELS_PATH:-/usr/share/local-ai/models}

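# Illustrative example: LOCALAI_VERSION normally resolves to the latest GitHub
# release tag, but it can be pinned explicitly before running the script, e.g.
# (the version tag below is hypothetical):
#
#   LOCALAI_VERSION=v2.26.0 sh install.sh
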
@@ -156,7 +259,7 @@ WorkingDirectory=/usr/share/local-ai
[Install]
WantedBy=default.target
EOF

$SUDO touch /etc/localai.env
$SUDO echo "ADDRESS=0.0.0.0:$PORT" | $SUDO tee /etc/localai.env >/dev/null
$SUDO echo "API_KEY=$API_KEY" | $SUDO tee -a /etc/localai.env >/dev/null

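# For reference (illustrative): after this step /etc/localai.env typically contains
# something like
#
#   ADDRESS=0.0.0.0:8080
#   API_KEY=
#
# and the resulting service can be inspected with standard systemd tooling, e.g.
#   sudo systemctl status local-ai
#   sudo journalctl -u local-ai -f
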
@@ -189,23 +292,74 @@ EOF

# ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-yum-or-dnf
install_container_toolkit_yum() {
    info 'Installing NVIDIA container toolkit repository...'

    curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
        $SUDO tee /etc/yum.repos.d/nvidia-container-toolkit.repo

    if [ "$PACKAGE_MANAGER" = "dnf" ]; then
        DNF_VERSION=$($PACKAGE_MANAGER --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -n1 | cut -d. -f1)
        if [ "$DNF_VERSION" -ge 5 ]; then
            # DNF5: use 'setopt' to enable the repository
            $SUDO $PACKAGE_MANAGER config-manager setopt nvidia-container-toolkit-experimental.enabled=1
        else
            # DNF4: use '--enable' to enable the repository
            $SUDO $PACKAGE_MANAGER config-manager --enable nvidia-container-toolkit-experimental
        fi
    else
        $SUDO $PACKAGE_MANAGER -y install yum-utils
        $SUDO $PACKAGE_MANAGER-config-manager --enable nvidia-container-toolkit-experimental
    fi
    $SUDO $PACKAGE_MANAGER install -y nvidia-container-toolkit
}

# Fedora, RHEL and other distros ship tunable SELinux booleans in the container-selinux policy to control device access.
# In particular, enabling container_use_devices allows containers to use arbitrary host device labels (including GPU devices).
# ref: https://github.com/containers/ramalama/blob/main/docs/ramalama-cuda.7.md#expected-output
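# For reference (illustrative): the same change can be made manually instead of
# letting this script do it, e.g.
#
#   getsebool container_use_devices             # check the current value
#   sudo setsebool -P container_use_devices 1   # enable it persistently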
enable_selinux_container_booleans() {

    # Check SELinux mode
    SELINUX_MODE=$(getenforce)

    if [ "$SELINUX_MODE" == "Enforcing" ]; then
        # Check the status of container_use_devices
        CONTAINER_USE_DEVICES=$(getsebool container_use_devices | awk '{print $3}')

        if [ "$CONTAINER_USE_DEVICES" == "off" ]; then

            # We want to give the user the choice to enable the SELinux boolean, since it is a security-relevant change
            warn "+--------------------------------------------------------------------------------------------------------------+"
            warn "| WARNING:                                                                                                     |"
            warn "| Your distribution ships tunable SELinux booleans in the container-selinux policy to control device access.  |"
            warn "| In particular, enabling \"container_use_devices\" allows containers to use arbitrary host device labels       |"
            warn "| (including GPU devices).                                                                                     |"
            warn "| This script can try to enable it for you by setting the \"container_use_devices\" boolean.                    |"
            warn "|                                                                                                              |"
            warn "| Otherwise you can exit the install script and enable it yourself.                                            |"
            warn "+--------------------------------------------------------------------------------------------------------------+"

            while true; do
                choice_warn "I understand that this script is going to change my SELinux configs, which is a security risk: (yes/exit) ";
                read Answer

                if [ "$Answer" = "yes" ]; then
                    warn "Enabling \"container_use_devices\" persistently..."
                    $SUDO setsebool -P container_use_devices 1

                    break
                elif [ "$Answer" = "exit" ]; then
                    aborted
                else
                    warn "Invalid choice. Please enter 'yes' or 'exit'."
                fi
            done
        fi
    fi
}

# ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt
install_container_toolkit_apt() {
    info 'Installing NVIDIA container toolkit repository...'

    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | $SUDO gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
        && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
@@ -217,7 +371,7 @@ install_container_toolkit_apt() {

# ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-zypper
install_container_toolkit_zypper() {
    info 'Installing NVIDIA zypper repository...'
    $SUDO zypper ar https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
    $SUDO zypper modifyrepo --enable nvidia-container-toolkit-experimental
    $SUDO zypper --gpg-auto-import-keys install -y nvidia-container-toolkit
@@ -246,6 +400,29 @@ install_container_toolkit() {
        opensuse*|suse*) install_container_toolkit_zypper ;;
        *) echo "Could not install nvidia container toolkit - unknown OS" ;;
    esac

    # After installing the toolkit we need to add it to the Docker runtimes, otherwise even with --gpus all
    # the container would still run with runc and would not have access to nvidia-smi.
    # ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker
    info "Adding NVIDIA Container Runtime to Docker runtimes..."
    $SUDO nvidia-ctk runtime configure --runtime=docker

    info "Restarting Docker Daemon"
    $SUDO systemctl restart docker

    # The NVML error arises because SELinux blocked the container's attempts to open the GPU devices or related libraries.
    # Without relaxing SELinux for the container, GPU commands like nvidia-smi report "Insufficient Permissions".
    # This has been noted in NVIDIA's documentation:
    # ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html#id2
    # ref: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/troubleshooting.html#nvml-insufficient-permissions-and-selinux
    case $OS_NAME in
        fedora|rhel|centos|rocky)
            enable_selinux_container_booleans
            ;;
        opensuse-tumbleweed)
            enable_selinux_container_booleans
            ;;
    esac
}

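# Sanity check (illustrative, not run by this script): once the runtime is configured
# and Docker has restarted, GPU visibility inside containers can be verified with
# something like
#
#   sudo docker run --rm --gpus all ubuntu nvidia-smi
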
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
@@ -253,14 +430,21 @@ install_container_toolkit() {
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
install_cuda_driver_yum() {
    info 'Installing NVIDIA CUDA repository...'
    case $PACKAGE_MANAGER in
        yum)
            $SUDO $PACKAGE_MANAGER -y install yum-utils
            $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
            ;;
        dnf)
            DNF_VERSION=$($PACKAGE_MANAGER --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -n1 | cut -d. -f1)
            if [ "$DNF_VERSION" -ge 5 ]; then
                # DNF5: Use 'addrepo' to add the repository
                $SUDO $PACKAGE_MANAGER config-manager addrepo --id=nvidia-cuda --set=name="nvidia-cuda" --set=baseurl="https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo"
            else
                # DNF4: Use '--add-repo' to add the repository
                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
            fi
            ;;
    esac

@@ -281,10 +465,68 @@ install_cuda_driver_yum() {
    $SUDO $PACKAGE_MANAGER -y install cuda-drivers
}

install_fedora_nvidia_kernel_drivers(){

    # We want to give the user the choice to install the akmod kernel drivers or not, since it could break some setups
    warn "+--------------------------------------------------------------------------------------------------+"
    warn "| WARNING:                                                                                         |"
    warn "| Looks like the NVIDIA kernel modules are not installed.                                          |"
    warn "|                                                                                                  |"
    warn "| This script can try to install them using akmod-nvidia.                                          |"
    warn "| - The script needs the rpmfusion free and nonfree repos and will install them if not available.  |"
    warn "| - The akmod installation can sometimes inhibit the reboot command.                               |"
    warn "|                                                                                                  |"
    warn "| Otherwise you can exit the install script and install them yourself.                             |"
    warn "| NOTE: you will need to reboot after the installation.                                            |"
    warn "+--------------------------------------------------------------------------------------------------+"

    while true; do
        choice_warn "Do you wish for the script to try and install them? (akmod/exit) ";
        read Answer

        if [ "$Answer" = "akmod" ]; then

            DNF_VERSION=$($PACKAGE_MANAGER --version | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -n1 | cut -d. -f1)

            OS_NAME=$ID
            OS_VERSION=$VERSION_ID
            FREE_URL="https://mirrors.rpmfusion.org/free/fedora/rpmfusion-free-release-${OS_VERSION}.noarch.rpm"
            NONFREE_URL="https://mirrors.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-${OS_VERSION}.noarch.rpm"

            curl -LO "$FREE_URL"
            curl -LO "$NONFREE_URL"

            if [ "$DNF_VERSION" -ge 5 ]; then
                # DNF5:
                $SUDO $PACKAGE_MANAGER install -y "rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm" "rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm"
                $SUDO $PACKAGE_MANAGER install -y akmod-nvidia
            else
                # DNF4:
                $SUDO $PACKAGE_MANAGER install -y "rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm" "rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm"
                $SUDO $PACKAGE_MANAGER install -y akmod-nvidia
            fi

            $SUDO rm "rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm"
            $SUDO rm "rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm"

            install_cuda_driver_yum $OS_NAME '41'

            info "Nvidia driver installation complete, please reboot now and run the install script again to complete the setup."
            exit

        elif [ "$Answer" = "exit" ]; then

            aborted
        else
            warn "Invalid choice. Please enter 'akmod' or 'exit'."
        fi
    done
}

# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#ubuntu
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
install_cuda_driver_apt() {
    info 'Installing NVIDIA CUDA repository...'
    curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb

    case $1 in
@@ -323,7 +565,7 @@ install_cuda() {
    case $OS_NAME in
        centos|rhel) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -d '.' -f 1) ;;
        rocky) install_cuda_driver_yum 'rhel' $(echo $OS_VERSION | cut -c1) ;;
        fedora) [ $OS_VERSION -lt '41' ] && install_cuda_driver_yum $OS_NAME $OS_VERSION || install_cuda_driver_yum $OS_NAME '41';;
        amzn) install_cuda_driver_yum 'fedora' '37' ;;
        debian) install_cuda_driver_apt $OS_NAME $OS_VERSION ;;
        ubuntu) install_cuda_driver_apt $OS_NAME $(echo $OS_VERSION | sed 's/\.//') ;;
@@ -399,25 +641,16 @@ install_docker() {
        $SUDO systemctl start docker
    fi

    info "Creating LocalAI Docker volume..."
    # Create the volume if it doesn't exist already
    if ! $SUDO docker volume inspect local-ai-data > /dev/null 2>&1; then
        $SUDO docker volume create local-ai-data
    fi

    # Check if the container already exists and replace it
    if $SUDO docker ps -a --format '{{.Names}}' | grep -q local-ai; then
        info "LocalAI Docker container already exists, replacing it..."
        $SUDO docker rm -f local-ai
    fi

    envs=""
@@ -429,23 +662,41 @@ install_docker() {
    fi

    IMAGE_TAG=
    if [ "$USE_VULKAN" = true ]; then
        IMAGE_TAG=${LOCALAI_VERSION}-gpu-vulkan

        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/build/models \
            --device /dev/dri \
            --restart=always \
            -e API_KEY=$API_KEY \
            -e THREADS=$THREADS \
            $envs \
            -d -p $PORT:8080 --name local-ai localai/localai:$IMAGE_TAG $STARTCOMMAND
    elif [ "$HAS_CUDA" ]; then
        # Default to CUDA 12
        IMAGE_TAG=${LOCALAI_VERSION}-cublas-cuda12
        # EXTRAS
        if [ "$USE_EXTRAS" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-cublas-cuda12-extras
        fi
        # AIO
        if [ "$USE_AIO" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-aio-gpu-nvidia-cuda-12
        fi

        info "Checking Nvidia Kernel Drivers presence..."
        if ! available nvidia-smi; then
            OS_NAME=$ID
            OS_VERSION=$VERSION_ID

            case $OS_NAME in
                debian|ubuntu) $SUDO apt-get -y install nvidia-cuda-toolkit;;
                fedora) install_fedora_nvidia_kernel_drivers;;
            esac
        fi

        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/build/models \
            --gpus all \
            --restart=always \
@@ -454,35 +705,39 @@ install_docker() {
            $envs \
            -d -p $PORT:8080 --name local-ai localai/localai:$IMAGE_TAG $STARTCOMMAND
    elif [ "$HAS_AMD" ]; then
        IMAGE_TAG=${LOCALAI_VERSION}-hipblas
        # EXTRAS
        if [ "$USE_EXTRAS" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-hipblas-extras
        fi
        # AIO
        if [ "$USE_AIO" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-aio-gpu-hipblas
        fi

        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/build/models \
            --device /dev/dri \
            --device /dev/kfd \
            --group-add=video \
            --restart=always \
            -e API_KEY=$API_KEY \
            -e THREADS=$THREADS \
            $envs \
            -d -p $PORT:8080 --name local-ai localai/localai:$IMAGE_TAG $STARTCOMMAND
    elif [ "$HAS_INTEL" ]; then
        # Default to FP32 for better compatibility
        IMAGE_TAG=${LOCALAI_VERSION}-sycl-f32
        # EXTRAS
        if [ "$USE_EXTRAS" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-sycl-f32-extras
        fi
        # AIO
        if [ "$USE_AIO" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-aio-gpu-intel-f32
        fi

        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/build/models \
            --device /dev/dri \
            --restart=always \
@@ -490,16 +745,16 @@ install_docker() {
            -e THREADS=$THREADS \
            $envs \
            -d -p $PORT:8080 --name local-ai localai/localai:$IMAGE_TAG $STARTCOMMAND

    else
        IMAGE_TAG=${LOCALAI_VERSION}

        # AIO
        if [ "$USE_AIO" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-aio-cpu
        fi

        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/models \
            --restart=always \
            -e MODELS_PATH=/models \
@@ -516,10 +771,10 @@ install_docker() {
install_binary_darwin() {
    [ "$(uname -s)" = "Darwin" ] || fatal 'This script is intended to run on macOS only.'

    info "Downloading LocalAI ${LOCALAI_VERSION}..."
    curl --fail --show-error --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-Darwin-${ARCH}"

    info "Installing to /usr/local/bin/local-ai"
    install -o0 -g0 -m755 $TEMP_DIR/local-ai /usr/local/bin/local-ai

    install_success
@@ -548,14 +803,14 @@ install_binary() {
        exit 1
    fi

    info "Downloading LocalAI ${LOCALAI_VERSION}..."
    curl --fail --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-Linux-${ARCH}"

    for BINDIR in /usr/local/bin /usr/bin /bin; do
        echo $PATH | grep -q $BINDIR && break || continue
    done

    info "Installing LocalAI as local-ai to $BINDIR..."
    $SUDO install -o0 -g0 -m755 -d $BINDIR
    $SUDO install -o0 -g0 -m755 $TEMP_DIR/local-ai $BINDIR/local-ai

@@ -603,7 +858,7 @@ detect_start_command() {
    if [ "$WORKER" = true ]; then
        if [ -n "$P2P_TOKEN" ]; then
            STARTCOMMAND="worker p2p-llama-cpp-rpc"
        else
            STARTCOMMAND="worker llama-cpp-rpc"
        fi
    elif [ "$FEDERATED" = true ]; then
@@ -617,6 +872,10 @@ detect_start_command() {
    fi
}

# Check if uninstall flag is provided
if [ "$1" = "--uninstall" ]; then
    uninstall_localai
fi

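# Illustrative usage: when the script is piped from the web (assuming the documented
# LocalAI install URL), the flag can be forwarded through sh, e.g.
#
#   curl -sfL https://localai.io/install.sh | sh -s -- --uninstall
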
detect_start_command
@@ -664,10 +923,12 @@ for PACKAGE_MANAGER in dnf yum apt-get; do
done

if [ "$DOCKER_INSTALL" = "true" ]; then
    info "Installing LocalAI from container images"
    if [ "$HAS_CUDA" = true ]; then
        install_container_toolkit
    fi
    install_docker
else
    info "Installing LocalAI from binaries"
    install_binary
fi

@@ -10,3 +10,5 @@ config_file: |
    - "t5xxl_path:t5xxl_fp16.safetensors"
    - "vae_path:ae.safetensors"
    - "sampler:euler"

    cfg_scale: 1

@@ -12,4 +12,4 @@ config_file: |
    enable_parameters: num_inference_steps
    pipeline_type: FluxPipeline

    cfg_scale: 1