Compare commits


No commits in common. "master" and "v2.26.0" have entirely different histories.

240 changed files with 31406 additions and 11876 deletions

.env

@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
-# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
-# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
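The build types described here map straight onto the Makefile's `BUILD_TYPE` variable, so they can be exercised in a from-source build. A minimal sketch, assuming a host with the matching toolkit installed (the `build` target comes from the project Makefile shown further down in this diff):

```bash
# Build with GPU acceleration via cuBLAS; swap in openblas/clblas as needed.
BUILD_TYPE=cublas make build
```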
@@ -76,7 +73,7 @@
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 ### Enable to run parallel requests
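As context for `LLAMACPP_GRPC_SERVERS`: each entry points at a llama.cpp RPC worker, and the llama.cpp backend spreads the load across them. A sketch under the assumption that the `rpc-server` binary from the llama.cpp RPC example linked above is built on each worker; hosts, ports, and flags are illustrative (see that README for the exact invocation):

```bash
# On each worker node: expose a llama.cpp RPC endpoint.
./rpc-server --host 0.0.0.0 --port 50052

# On the LocalAI node: hand the worker list to the llama.cpp backend.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" local-ai
```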


@@ -29,6 +29,10 @@ updates:
     schedule:
       # Check for updates to GitHub Actions every weekday
       interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/autogptq"
+    schedule:
+      interval: "weekly"
   - package-ecosystem: "pip"
     directory: "/backend/python/bark"
     schedule:


@@ -9,10 +9,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - repository: "ggml-org/llama.cpp"
+          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
-          - repository: "ggml-org/whisper.cpp"
+          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
          - repository: "PABannier/bark.cpp"


@@ -14,7 +14,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.4.0
+        uses: dependabot/fetch-metadata@v2.3.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true


@@ -33,7 +33,7 @@ jobs:
         run: |
           CGO_ENABLED=0 make build-api
       - name: rm
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.2.0
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
        script: |
          sudo rm -rf local-ai/ || true
      - name: copy file via ssh
-       uses: appleboy/scp-action@v1.0.0
+       uses: appleboy/scp-action@v0.1.7
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
        rm: true
        target: ./local-ai
      - name: restarting
-       uses: appleboy/ssh-action@v1.2.2
+       uses: appleboy/ssh-action@v1.2.0
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}


@@ -15,7 +15,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}


@@ -33,7 +33,6 @@ jobs:
       # Pushing with all jobs in parallel
       # eats the bandwidth of all the nodes
       max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
-      fail-fast: false
       matrix:
         include:
           # This is basically covered by the AIO test
@@ -57,35 +56,26 @@
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          # - build-type: 'hipblas'
+          #   platforms: 'linux/amd64'
+          #   tag-latest: 'false'
+          #   tag-suffix: '-hipblas'
+          #   ffmpeg: 'false'
+          #   image-type: 'extras'
+          #   base-image: "rocm/dev-ubuntu-22.04:6.1"
+          #   grpc-base-image: "ubuntu:22.04"
+          #   runs-on: 'arc-runner-set'
+          #   makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: 'sycl-f16-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          # - build-type: 'sycl_f16'
+          #   platforms: 'linux/amd64'
+          #   tag-latest: 'false'
+          #   base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+          #   grpc-base-image: "ubuntu:22.04"
+          #   tag-suffix: 'sycl-f16-ffmpeg'
+          #   ffmpeg: 'true'
+          #   image-type: 'extras'
+          #   runs-on: 'arc-runner-set'
+          #   makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'vulkan'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-vulkan-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=4 --output-sync=target"
           # core-image-build:
           #   uses: ./.github/workflows/image_build.yml
           #   with:


@@ -45,13 +45,13 @@ jobs:
           - build-type: 'hipblas'
             platforms: 'linux/amd64'
             tag-latest: 'auto'
-            tag-suffix: '-hipblas-extras'
+            tag-suffix: '-hipblas-ffmpeg'
             ffmpeg: 'true'
             image-type: 'extras'
             aio: "-aio-gpu-hipblas"
             base-image: "rocm/dev-ubuntu-22.04:6.1"
             grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas-extras'
+            latest-image: 'latest-gpu-hipblas'
             latest-image-aio: 'latest-aio-gpu-hipblas'
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
@@ -59,13 +59,32 @@ jobs:
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
             ffmpeg: 'true'
             image-type: 'core'
             base-image: "rocm/dev-ubuntu-22.04:6.1"
             grpc-base-image: "ubuntu:22.04"
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-hipblas'
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
   self-hosted-jobs:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -95,58 +114,110 @@ jobs:
       max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
       matrix:
         include:
+          # Extra images
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "11"
             cuda-minor-version: "7"
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-extras'
-            ffmpeg: 'true'
+            tag-suffix: '-cublas-cuda11'
+            ffmpeg: ''
             image-type: 'extras'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
-            aio: "-aio-gpu-nvidia-cuda-11"
-            latest-image: 'latest-gpu-nvidia-cuda-11-extras'
-            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "12"
             cuda-minor-version: "0"
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-extras'
+            tag-suffix: '-cublas-cuda12'
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cublas-cuda11-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            aio: "-aio-gpu-nvidia-cuda-11"
+            latest-image: 'latest-gpu-nvidia-cuda-11'
+            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
             ffmpeg: 'true'
             image-type: 'extras'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             aio: "-aio-gpu-nvidia-cuda-12"
-            latest-image: 'latest-gpu-nvidia-cuda-12-extras'
+            latest-image: 'latest-gpu-nvidia-cuda-12'
             latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
             makeflags: "--jobs=3 --output-sync=target"
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+            image-type: 'extras'
+            base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
             base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16-extras'
+            tag-suffix: '-sycl-f16-ffmpeg'
             ffmpeg: 'true'
             image-type: 'extras'
             runs-on: 'arc-runner-set'
             aio: "-aio-gpu-intel-f16"
-            latest-image: 'latest-gpu-intel-f16-extras'
+            latest-image: 'latest-gpu-intel-f16'
             latest-image-aio: 'latest-aio-gpu-intel-f16'
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'sycl_f32'
             platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
             base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32-extras'
+            tag-suffix: '-sycl-f32-ffmpeg'
             ffmpeg: 'true'
             image-type: 'extras'
             runs-on: 'arc-runner-set'
             aio: "-aio-gpu-intel-f32"
-            latest-image: 'latest-gpu-intel-f32-extras'
+            latest-image: 'latest-gpu-intel-f32'
             latest-image-aio: 'latest-aio-gpu-intel-f32'
             makeflags: "--jobs=3 --output-sync=target"
           # Core images
@@ -155,23 +226,41 @@ jobs:
             tag-latest: 'false'
             base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16'
-            ffmpeg: 'true'
+            tag-suffix: '-sycl-f16-core'
+            ffmpeg: 'false'
             image-type: 'core'
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-intel-f16'
           - build-type: 'sycl_f32'
             platforms: 'linux/amd64'
             tag-latest: 'false'
             base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32'
+            tag-suffix: '-sycl-f32-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f16-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f32-ffmpeg-core'
             ffmpeg: 'true'
             image-type: 'core'
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-intel-f32'
   core-image-build:
     uses: ./.github/workflows/image_build.yml
@@ -204,7 +293,7 @@ jobs:
           - build-type: ''
             platforms: 'linux/amd64,linux/arm64'
             tag-latest: 'auto'
-            tag-suffix: ''
+            tag-suffix: '-ffmpeg-core'
             ffmpeg: 'true'
             image-type: 'core'
             base-image: "ubuntu:22.04"
@@ -219,38 +308,60 @@
             cuda-minor-version: "7"
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
-            ffmpeg: 'true'
+            tag-suffix: '-cublas-cuda11-core'
+            ffmpeg: ''
             image-type: 'core'
-            runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
             makeflags: "--jobs=4 --output-sync=target"
             skip-drivers: 'false'
-            latest-image: 'latest-gpu-nvidia-cuda-12'
           - build-type: 'cublas'
             cuda-major-version: "12"
             cuda-minor-version: "0"
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
+            tag-suffix: '-cublas-cuda12-core'
+            ffmpeg: ''
+            image-type: 'core'
+            base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=4 --output-sync=target"
+            skip-drivers: 'false'
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+            skip-drivers: 'false'
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "0"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
             ffmpeg: 'true'
             image-type: 'core'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             skip-drivers: 'false'
             makeflags: "--jobs=4 --output-sync=target"
-            latest-image: 'latest-gpu-nvidia-cuda-12'
           - build-type: 'vulkan'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            tag-suffix: '-vulkan'
+            tag-suffix: '-vulkan-ffmpeg-core'
+            latest-image: 'latest-vulkan-ffmpeg-core'
             ffmpeg: 'true'
             image-type: 'core'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             skip-drivers: 'false'
             makeflags: "--jobs=4 --output-sync=target"
-            latest-image: 'latest-gpu-vulkan'
   gh-runner:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -283,8 +394,8 @@
             cuda-minor-version: "0"
             platforms: 'linux/arm64'
             tag-latest: 'false'
-            tag-suffix: '-nvidia-l4t-arm64'
-            latest-image: 'latest-nvidia-l4t-arm64'
+            tag-suffix: '-nvidia-l4t-arm64-core'
+            latest-image: 'latest-nvidia-l4t-arm64-core'
             ffmpeg: 'true'
             image-type: 'core'
             base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"


@@ -310,11 +310,6 @@
         tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
         labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
-      - name: Cleanup
-        run: |
-          docker builder prune -f
-          docker system prune --force --volumes --all
       - name: Latest tag
         # run this on branches, when it is a tag and there is a latest-image defined
         if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'


@@ -8,7 +8,7 @@ jobs:
   notify-discord:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: gemma-3-12b-it
+      MODEL_NAME: hermes-2-theta-llama-3-8b
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
         fetch-depth: 0 # needed to checkout all branches for this Action to work
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       # Check the PR diff using the current branch and the base branch of the PR
       - uses: GrantBirki/git-diff-action@v2.8.0
         id: git-diff-action
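The `model:` input accepts either a gallery name from models.localai.io or the `huggingface://` form spelled out in the comment above. A sketch of both forms, assuming the `local-ai run` CLI entrypoint; the Hugging Face repository and file names are hypothetical placeholders:

```bash
# Gallery model by name (the same identifier this workflow uses).
local-ai run hermes-2-theta-llama-3-8b

# Hypothetical direct reference: huggingface://<repository>/<file>
local-ai run huggingface://example-org/example-model-GGUF/example-model.Q4_K_M.gguf
```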
@@ -79,7 +79,7 @@ jobs:
           args: ${{ steps.summarize.outputs.message }}
       - name: Setup tmate session if fails
         if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
   notify-twitter:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: gemma-3-12b-it
+      MODEL_NAME: hermes-2-theta-llama-3-8b
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -161,7 +161,7 @@ jobs:
          TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
      - name: Setup tmate session if fails
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180


@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       - name: Summarize
         id: summarize
         run: |
@@ -60,4 +60,4 @@ jobs:
           DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
         uses: Ilshidur/action-discord@master
         with:
           args: ${{ steps.summarize.outputs.message }}


@@ -36,7 +36,6 @@ jobs:
           sudo apt-get update
           sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
           sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
-          make install-go-tools
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -124,7 +123,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -152,7 +151,6 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
-         make install-go-tools
      - name: Intel Dependencies
        run: |
          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -234,7 +232,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -255,7 +253,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc
-         make install-go-tools
+         go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+         go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
      - name: Build
        id: build
        run: |
@@ -276,7 +275,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -296,7 +295,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc libomp llvm
-         make install-go-tools
+         go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+         go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
        id: build
        run: |
@@ -317,7 +317,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180


@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.4
+        uses: securego/gosec@v2.22.0
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'


@@ -78,26 +78,6 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/diffusers
           make --jobs=5 --output-sync=target -C backend/python/diffusers test
-  #tests-vllm:
-  #  runs-on: ubuntu-latest
-  #  steps:
-  #    - name: Clone
-  #      uses: actions/checkout@v4
-  #      with:
-  #        submodules: true
-  #    - name: Dependencies
-  #      run: |
-  #        sudo apt-get update
-  #        sudo apt-get install -y build-essential ffmpeg
-  #        sudo apt-get install -y ca-certificates cmake curl patch python3-pip
-  #        sudo apt-get install -y libopencv-dev
-  #        # Install UV
-  #        curl -LsSf https://astral.sh/uv/install.sh | sh
-  #        pip install --user --no-cache-dir grpcio-tools==1.64.1
-  #    - name: Test vllm backend
-  #      run: |
-  #        make --jobs=5 --output-sync=target -C backend/python/vllm
-  #        make --jobs=5 --output-sync=target -C backend/python/vllm test
   # tests-transformers-musicgen:
   #   runs-on: ubuntu-latest
   #   steps:


@@ -71,7 +71,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
-          sudo apt-get install -y libgmock-dev clang
+          sudo apt-get install -y libgmock-dev
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -96,7 +96,6 @@ jobs:
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-         go install github.com/GeertJohan/go.rice/rice@latest
          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools
@@ -131,7 +130,7 @@ jobs:
          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -184,7 +183,6 @@ jobs:
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-         go install github.com/GeertJohan/go.rice/rice@latest
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
@@ -196,7 +194,7 @@ jobs:
          make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -224,7 +222,6 @@ jobs:
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
          pip install --user --no-cache-dir grpcio-tools
-         go install github.com/GeertJohan/go.rice/rice@latest
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -235,7 +232,7 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-       uses: mxschmitt/action-tmate@v3.22
+       uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180


@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
     apt-get install -y --no-install-recommends \
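`EXTERNAL_GRPC_BACKENDS` is a comma-separated list of `name:launcher` pairs, each registering an out-of-tree gRPC backend under that name. A sketch of overriding the list at container start, with a hypothetical extra backend (the backend name and script path are illustrative; the env var is the one set above):

```bash
# Hypothetical: register a single custom Python backend at runtime.
docker run -ti -p 8080:8080 \
  -e EXTERNAL_GRPC_BACKENDS="my-backend:/build/backend/python/my-backend/run.sh" \
  localai/localai:latest
```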
@@ -24,7 +24,6 @@ RUN apt-get update && \
     ca-certificates \
     curl libssl-dev \
     git \
-    git-lfs \
     unzip upx-ucl && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
@@ -46,10 +45,9 @@ EOT
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
-# Install grpc compilers and rice
+# Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
-    go install github.com/GeertJohan/go.rice/rice@latest
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -301,9 +299,10 @@ COPY .git .
 RUN make prepare
 ## Build the binary
-## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
-## Otherwise just run the normal build
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
+## (both will use CUDA or hipblas for the actual computation)
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
         SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
     else \
         make build; \
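Outside the container, the RUN step above amounts to invoking the Makefile with the CPU-tuned llama.cpp variants skipped; a sketch, assuming the master-side condition (arm64 or hipblas) holds:

```bash
# Skip the CPU-optimized llama.cpp builds; the GPU backend does the heavy lifting.
BUILD_TYPE=hipblas \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" \
make build
```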
@@ -431,6 +430,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/vllm \
     ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/autogptq \
+    ; fi && \
     if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
         make -C backend/python/bark \
     ; fi && \

Makefile

@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c
+CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=d1f114da61b1ae1e70b03104fad42c9dd666feeb
+WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
+WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
@@ -21,11 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
+STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
-# ONEAPI variables for SYCL
-export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -33,12 +30,8 @@ ONNX_OS?=linux
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
-export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export CMAKE_ARGS?=
 export BACKEND_LIBS?=
-export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
-export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
-export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
@@ -88,7 +81,6 @@ endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
     CMAKE_ARGS+=-DGGML_NATIVE=OFF
-    WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # Detect if we are running on arm64
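For context on the `NATIVE` switch: leaving native codegen off yields binaries that are portable across CPUs, at some performance cost. A minimal sketch, assuming defaults otherwise:

```bash
# Portable build: no host-specific SIMD instructions baked in.
NATIVE=false make build
```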
@@ -116,31 +108,13 @@ ifeq ($(OS),Darwin)
 # disable metal if on Darwin and any other value is explicitly passed.
 else ifneq ($(BUILD_TYPE),metal)
     CMAKE_ARGS+=-DGGML_METAL=OFF
-    WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
     export GGML_NO_ACCELERATE=1
     export GGML_NO_METAL=1
-    GO_LDFLAGS_WHISPER+=-lggml-blas
-    export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif
 ifeq ($(BUILD_TYPE),metal)
+    # -lcblas removed: it seems to always be listed as a duplicate flag.
     CGO_LDFLAGS += -framework Accelerate
-    CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
-    CMAKE_ARGS+=-DGGML_METAL=ON
-    CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
-    CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-    CMAKE_ARGS+=-DGGML_OPENMP=OFF
-    WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
-    WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
-    WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-    WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
-    WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
-    WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
-    WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
-    export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
-else
-    CGO_LDFLAGS_WHISPER+=-lggml-blas
-    export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif
 else
 CGO_LDFLAGS_WHISPER+=-lgomp
@@ -152,29 +126,21 @@ ifeq ($(BUILD_TYPE),openblas)
 endif
 ifeq ($(BUILD_TYPE),cublas)
-    CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
+    CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
     export GGML_CUDA=1
-    CMAKE_ARGS+=-DGGML_CUDA=ON
-    WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
-    CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
-    export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
+    CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif
 ifeq ($(BUILD_TYPE),vulkan)
     CMAKE_ARGS+=-DGGML_VULKAN=1
-    WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
-    CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
-    export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
 endif
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
     export GGML_SYCL=1
-    CMAKE_ARGS+=-DGGML_SYCL=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
     export GGML_SYCL_F16=1
-    CMAKE_ARGS+=-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),hipblas)
@@ -185,7 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
     export CC=$(ROCM_HOME)/llvm/bin/clang
     export STABLE_BUILD_TYPE=
     export GGML_HIP=1
-    GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
+    GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
     AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
     CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
     CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
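Since `GPU_TARGETS` is declared with `?=`, it can be narrowed to the single architecture actually present, which cuts hipblas build time considerably. A sketch, assuming an RDNA3 card (gfx1100):

```bash
# Compile ROCm kernels only for the installed GPU.
BUILD_TYPE=hipblas GPU_TARGETS=gfx1100 make build
```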
@@ -294,7 +260,11 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
     $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-    $(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
+    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
+    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
+ifneq ($(UPX),)
+    $(UPX) backend-assets/grpc/stablediffusion-ggml
+endif
 sources/onnxruntime:
     mkdir -p sources/onnxruntime
@@ -320,9 +290,8 @@ sources/whisper.cpp:
     git checkout $(WHISPER_CPP_VERSION) && \
     git submodule update --init --recursive --depth 1 --single-branch
-sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
-    cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
-    cd sources/whisper.cpp/build && cmake --build . --config Release
+sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
+    cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
@@ -372,14 +341,8 @@ clean-tests:
 clean-dc: clean
     cp -r /build/backend-assets /workspace/backend-assets
-## Install Go tools
-install-go-tools:
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-    go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-    go install github.com/GeertJohan/go.rice/rice@latest
 ## Build:
-build: prepare backend-assets grpcs install-go-tools ## Build the project
+build: prepare backend-assets grpcs ## Build the project
     $(info ${GREEN}I local-ai build info:${RESET})
     $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
     $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -389,9 +352,7 @@ ifneq ($(BACKEND_LIBS),)
     $(MAKE) backend-assets/lib
     cp -f $(BACKEND_LIBS) backend-assets/lib/
 endif
-    rm -rf $(BINARY_NAME) || true
     CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
-    rice append --exec $(BINARY_NAME)
 build-minimal:
     BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -463,7 +424,6 @@ prepare-test: grpcs
     cp -rf backend-assets core/http
     cp tests/models_fixtures/* test-models
-## Test targets
 test: prepare test-models/testmodel.ggml grpcs
     @echo 'Running tests'
     export GO_TAGS="tts debug"
@@ -538,7 +498,7 @@ protogen: protogen-go protogen-python
 protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
-protogen-go: install-go-tools
+protogen-go:
     mkdir -p pkg/grpc/proto
     protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
     backend/backend.proto
@@ -549,10 +509,18 @@ protogen-go-clean:
     $(RM) bin/*
 .PHONY: protogen-python
-protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+.PHONY: autogptq-protogen
+autogptq-protogen:
+    $(MAKE) -C backend/python/autogptq protogen
+.PHONY: autogptq-protogen-clean
+autogptq-protogen-clean:
+    $(MAKE) -C backend/python/autogptq protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
@@ -629,6 +597,7 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
+    $(MAKE) -C backend/python/autogptq
     $(MAKE) -C backend/python/bark
     $(MAKE) -C backend/python/coqui
     $(MAKE) -C backend/python/diffusers
@@ -642,12 +611,10 @@ prepare-extra-conda-environments: protogen-python
 prepare-test-extra: protogen-python
     $(MAKE) -C backend/python/transformers
     $(MAKE) -C backend/python/diffusers
-    $(MAKE) -C backend/python/vllm
 test-extra: prepare-test-extra
     $(MAKE) -C backend/python/transformers test
     $(MAKE) -C backend/python/diffusers test
-    $(MAKE) -C backend/python/vllm test
 backend-assets:
     mkdir -p backend-assets
@@ -789,8 +756,8 @@ ifneq ($(UPX),)
     $(UPX) backend-assets/grpc/silero-vad
 endif
-backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
-    CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
+backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
+    CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
     $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
     $(UPX) backend-assets/grpc/whisper
@@ -842,8 +809,7 @@ docker-aio-all:
 docker-image-intel:
     docker build \
-        --progress plain \
-        --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
+        --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
        --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
        --build-arg GO_TAGS="none" \
        --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -851,7 +817,7 @@ docker-image-intel:
 docker-image-intel-xpu:
     docker build \
-        --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
+        --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
        --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
        --build-arg GO_TAGS="none" \
        --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \

README.md

@ -1,6 +1,7 @@
<h1 align="center"> <h1 align="center">
<br> <br>
<img height="300" src="./core/http/static/logo.png"> <br> <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br> <br>
</h1> </h1>
@ -30,7 +31,7 @@
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
</a> </a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank"> <a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/> <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
@ -43,154 +44,35 @@
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/) > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
> >
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on > [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai) [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler). **LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart
Run the installer script:

```bash
# Basic installation
curl https://localai.io/install.sh | sh
```

For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).

Or run with docker:

### CPU only image:

```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```

### NVIDIA GPU Images:

```bash
# CUDA 12.0 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

# CUDA 12.0 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12-extras

# CUDA 11.7 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11
# CUDA 11.7 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11-extras
# NVIDIA Jetson (L4T) ARM64
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
```
### AMD GPU Images (ROCm):
```bash
# ROCm with core features
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas
# ROCm with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas-extras
```
### Intel GPU Images (oneAPI):
```bash
# Intel GPU with FP16 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16
# Intel GPU with FP16 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16-extras
# Intel GPU with FP32 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32
# Intel GPU with FP32 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32-extras
```
### Vulkan GPU Images:
```bash
# Vulkan with core features
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```
### AIO Images (pre-downloaded models):
```bash
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# NVIDIA CUDA 12 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
# NVIDIA CUDA 11 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11
# Intel GPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16
# AMD GPU version
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
```
For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).
To load models:

```bash
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest
```

For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
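With the server running, the OpenAI-compatible API can be exercised directly. A minimal sketch, assuming the default port 8080 and the `phi-2` model loaded above:

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "phi-2",
    "messages": [{"role": "user", "content": "How are you doing?"}],
    "temperature": 0.7
  }'
```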
## 📰 Latest project news
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )

Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations

A huge thank you to our generous sponsors who support this project covering CI expenses:

<p align="center">
  <a href="https://www.spectrocloud.com/" target="blank">
    <img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
  </a>
  <a href="https://www.premai.io/" target="blank">
    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
embeddings: true
name: text-embedding-ada-002
parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
  You can test this model with curl like this:
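The `usage` text is cut off at the hunk boundary; a request against the OpenAI-compatible embeddings endpoint would look roughly like this sketch (assuming the server's default port 8080):

```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-ada-002",
    "input": "Your text string goes here"
  }'
```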
context_size: 8192
f16: true
function:
  grammar:
    no_mixed_free_string: true
    schema_type: llama3.1 # or JSON is supported too (json)
  response_regex:
  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4
parameters:
  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- <|eot_id|>
- <|end_of_text|>
template:
  chat: |
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
    {{.Input }}
    <|start_header_id|>assistant<|end_header_id|>
  chat_message: |
    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
    {{ if .FunctionCall -}}
    {{ else if eq .RoleName "tool" -}}
    The Function was executed and the response was:
    {{ end -}}
    {{ if .Content -}}
    {{.Content -}}
    {{ else if .FunctionCall -}}
    {{ range .FunctionCall }}
    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
    {{ end }}
    {{ end -}}
    <|eot_id|>
  completion: |
    {{.Input}}
  function: |
    <|start_header_id|>system<|end_header_id|>
    You are an expert in composing functions. You are given a question and a set of possible functions.
    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
    You SHOULD NOT include any other text in the response.
    Here is a list of functions in JSON format that you can invoke.
    {{toJson .Functions}}
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    {{.Input}}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
download_files:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
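The grammar and regex settings above constrain the model to emit parseable tool calls. For orientation, a request that exercises this configuration could look like the following sketch; it uses the OpenAI-style `tools` field, and `get_current_weather` is a made-up example function, not part of the config:

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "What is the weather like in Boston?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
          "type": "object",
          "properties": {"location": {"type": "string", "description": "City name"}},
          "required": ["location"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```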
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
parameters:
  model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
embeddings: true
name: text-embedding-ada-002
parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
  You can test this model with curl like this:
context_size: 4096
f16: true
function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
mmap: true
name: gpt-4
parameters:
  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
  function: |
    <|im_start|>system
    You are an AI assistant that executes function calls, and these are the tools at your disposal:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
download_files:
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
  sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
  uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
parameters:
  model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
embeddings: true
name: text-embedding-ada-002
parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
usage: |
  You can test this model with curl like this:
context_size: 4096
f16: true
function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
mmap: true
name: gpt-4
parameters:
  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
  function: |
    <|im_start|>system
    You are an AI assistant that executes function calls, and these are the tools at your disposal:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
backend: llama-cpp
context_size: 4096
f16: true
mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o
parameters:
  model: minicpm-v-2_6-Q4_K_M.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
  function: |
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
package main

import (
	rice "github.com/GeertJohan/go.rice"
)

var backendAssets *rice.Box

func init() {
	var err error
	backendAssets, err = rice.FindBox("backend-assets")
	if err != nil {
		panic(err)
	}
}
service Backend {
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  // ...
}

message GrammarTrigger {
  string word = 1;
}

message ModelOptions {
  // ...
  int32 NGQA = 20;
  string ModelFile = 21;

  // Diffusers
  string PipelineType = 26;
  // ...
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;
  string LoadFormat = 58;
  bool DisableLogStatus = 66;
  string DType = 67;
  int32 LimitImagePerPrompt = 68;
  int32 LimitVideoPerPrompt = 69;
  int32 LimitAudioPerPrompt = 70;

  string MMProj = 41;
  // ...
}

message GenerateImageRequest {
  // ...
  int32 CLIPSkip = 11;
}

message GenerateVideoRequest {
  string prompt = 1;
  string start_image = 2; // Path or base64 encoded image for the start frame
  string end_image = 3; // Path or base64 encoded image for the end frame
  int32 width = 4;
  int32 height = 5;
  int32 num_frames = 6; // Number of frames to generate
  int32 fps = 7; // Frames per second
  int32 seed = 8;
  float cfg_scale = 9; // Classifier-free guidance scale
  string dst = 10; // Output path for the generated video
}

message TTSRequest {
  string text = 1;
  string model = 2;
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
# set(TARGET myclip)
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
# install(TARGETS ${TARGET} LIBRARY)
# target_include_directories(myclip PUBLIC .)
# target_include_directories(myclip PUBLIC ../..)
# target_include_directories(myclip PUBLIC ../../common)
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
# if (NOT MSVC)
#     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
# endif()
# END CLIP hack

# ...

add_library(hw_grpc_proto
  ${hw_proto_srcs}
  ${hw_proto_hdrs} )

add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server

# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
# ...
else ifeq ($(OS),Darwin)
# ...
endif

ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx \
  -DCMAKE_CXX_FLAGS="-fsycl" \
  -DGGML_SYCL_F16=ON
endif

ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx \
  -DCMAKE_CXX_FLAGS="-fsycl"
endif

llama.cpp:
# ...
	git checkout -b build $(LLAMA_VERSION) && \
	git submodule update --init --recursive --depth 1 --single-branch

llama.cpp/tools/grpc-server: llama.cpp
	mkdir -p llama.cpp/tools/grpc-server
	bash prepare.sh

rebuild:
# ...

purge:
	rm -rf llama.cpp/build
	rm -rf llama.cpp/tools/grpc-server
	rm -rf grpc-server

clean: purge
	rm -rf llama.cpp

grpc-server: llama.cpp llama.cpp/tools/grpc-server
	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
	+bash -c "source $(ONEAPI_VARS); \
# ...
else
	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
	cp llama.cpp/build/bin/grpc-server .
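For orientation, building this backend follows the usual pattern of selecting a `BUILD_TYPE` before invoking make; a sketch (assuming the llama.cpp sources and gRPC toolchain are available as the Makefile expects):

```bash
# CPU-only build
make grpc-server

# CUDA build
BUILD_TYPE=cublas make grpc-server
```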
(File diff suppressed because it is too large.)

backend/cpp/llama/json.hpp (vendored file, 24596 lines; diff suppressed because it is too large.)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3cd0d2fa..6c5e811a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
     int* patches_data = (int*)malloc(ggml_nbytes(patches));
for patch in $(ls patches); do
  patch -d llama.cpp/ -p1 < patches/$patch
done

set -e

cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/

set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
    echo "grpc-server already added"
else
    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
fi
set -e

# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
# and remove the main function
# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
awk '
/int[ \t]+main[ \t]*\(/ {      # If the line starts the main function
    in_main=1;                 # Set a flag
    open_braces=0;             # Track number of open braces
}
in_main {
    open_braces += gsub(/\{/, "{");  # Count opening braces
    open_braces -= gsub(/\}/, "}");  # Count closing braces
    if (open_braces == 0) {          # If all braces are closed
        in_main=0;                   # End skipping
    }
    next;                            # Skip lines inside main
}
!in_main                             # Print lines not inside main
' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp

# remove index.html.gz.hpp and loading.html.hpp includes
if [[ "$OSTYPE" == "darwin"* ]]; then
    # macOS
    sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
else
    # Linux and others
    sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
fi
backend/cpp/llama/utils.hpp (vendored new file, 483 lines):
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we can't find use of more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
// Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
// all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
// update the remaining subtasks, while appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
int tts(char *text,int threads, char *dst ) {
    // ...
    // generate audio
    if (!bark_generate_audio(c, text, threads)) {
        fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
        return 1;
    }
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh

# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

GOCMD?=go
CGO_LDFLAGS?=
# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
CGO_LDFLAGS_SYCL=
GO_TAGS?=
LD_FLAGS?=

# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DSD_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DSD_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DSD_METAL=OFF
else
CMAKE_ARGS+=-DSD_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif

ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx \
  -DSD_SYCL=ON \
  -DGGML_SYCL_F16=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif

ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx \
  -DSD_SYCL=ON
CC=icx
CXX=icpx
CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
endif

# warnings
# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

# Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
# ...
	$(MAKE) $(COMBINED_LIB)

gosd.o:
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
	+bash -c "source $(ONEAPI_VARS); \
	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
else
	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
endif

libsd.a: gosd.o
	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o $(AR) rcs libsd.a gosd.o
stablediffusion-ggml:
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
ifneq ($(UPX),)
$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
endif
clean: clean:
rm -rf gosd.o libsd.a build $(COMBINED_LIB) rm -rf gosd.o libsd.a build $(COMBINED_LIB)

View file

@ -35,8 +35,6 @@ const char* sample_method_str[] = {
"ipndm", "ipndm",
"ipndm_v", "ipndm_v",
"lcm", "lcm",
"ddim_trailing",
"tcd",
}; };
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@ -175,7 +173,6 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
-1, //clip_skip -1, //clip_skip
cfg_scale, // cfg_scale cfg_scale, // cfg_scale
3.5f, 3.5f,
0, // eta
width, width,
height, height,
sample_method, sample_method,

View file

@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
context.SetTranslate(true) context.SetTranslate(true)
} }
if err := context.Process(data, nil, nil, nil); err != nil { if err := context.Process(data, nil, nil); err != nil {
return pb.TranscriptResult{}, err return pb.TranscriptResult{}, err
} }

View file

@ -0,0 +1,17 @@
.PHONY: autogptq
autogptq: protogen
bash install.sh
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
.PHONY: clean
clean: protogen-clean
rm -rf venv __pycache__

View file

@ -0,0 +1,5 @@
# Creating a separate environment for the autogptq project
```
make autogptq
```

View file

@ -0,0 +1,153 @@
#!/usr/bin/env python3
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
try:
device = "cuda:0"
if request.Device != "":
device = request.Device
# support loading local model files
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
# support model `Qwen/Qwen-VL-Chat-Int4`
if "qwen-vl" in request.Model.lower():
self.model_name = "Qwen-VL-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=request.TrustRemoteCode,
device_map="auto").eval()
else:
model = AutoGPTQForCausalLM.from_quantized(model_path,
model_basename=request.ModelBaseName,
use_safetensors=True,
trust_remote_code=request.TrustRemoteCode,
device=device,
use_triton=request.UseTriton,
quantize_config=None)
self.model = model
self.tokenizer = tokenizer
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
penalty = 1.0
if request.Penalty != 0.0:
penalty = request.Penalty
tokens = 512
if request.Tokens != 0:
tokens = request.Tokens
top_p = 0.95
if request.TopP != 0.0:
top_p = request.TopP
prompt_images = self.recompile_vl_prompt(request)
compiled_prompt = prompt_images[0]
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
# Implement Predict RPC
pipeline = TextGenerationPipeline(
model=self.model,
tokenizer=self.tokenizer,
max_new_tokens=tokens,
temperature=request.Temperature,
top_p=top_p,
repetition_penalty=penalty,
)
t = pipeline(compiled_prompt)[0]["generated_text"]
print(f"generated_text: {t}", file=sys.stderr)
if compiled_prompt in t:
t = t.replace(compiled_prompt, "")
# Housekeeping: remove the image files from the /tmp folder
for img_path in prompt_images[1]:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
def PredictStream(self, request, context):
# Implement PredictStream RPC
# for reply in some_data_generator():
#     yield reply
# Streaming is not implemented yet; fall back to a single Predict response.
return self.Predict(request, context)
def recompile_vl_prompt(self, request):
prompt = request.Prompt
image_paths = []
if "qwen-vl" in self.model_name.lower():
# request.Images is an array of base64-encoded images. Iterate over it, decode each image, and save it to the /tmp folder with a unique, timestamp-based filename.
# Then collect the image file paths in the "image_paths" array.
# Read request.Prompt and replace each "[img-%d]" placeholder with the corresponding file path, in the order they appear in "image_paths". Save the new prompt to "prompt".
for i, img in enumerate(request.Images):
timestamp = str(int(time.time() * 1000)) # Generate timestamp
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
with open(img_path, "wb") as f:
f.write(base64.b64decode(img))
image_paths.append(img_path)
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
else:
prompt = request.Prompt
return (prompt, image_paths)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)
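A quick way to exercise this backend is a bare gRPC client built on the stubs generated by `make protogen`. This is a sketch under those assumptions; the model name is a placeholder, and the field names mirror the servicer above and the vLLM test later in this diff:

```
import grpc
import backend_pb2
import backend_pb2_grpc

with grpc.insecure_channel("localhost:50051") as channel:
    stub = backend_pb2_grpc.BackendStub(channel)
    # Load a quantized model first; "my-gptq-model" is a placeholder name.
    load = stub.LoadModel(backend_pb2.ModelOptions(Model="my-gptq-model"))
    if not load.success:
        raise RuntimeError(load.message)
    # Predict returns Result.message as bytes, as in the servicer above.
    reply = stub.Predict(backend_pb2.PredictOptions(Prompt="Hello,", Tokens=32))
    print(reply.message.decode("utf-8"))
```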

View file

@ -0,0 +1,14 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
# This is here because the Intel pip index is broken and returns 200 status codes for every package name; it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the default PyPI index to find optimum[openvino].
# The --upgrade flag actually allows us to *downgrade* torch to the version provided in the Intel pip index.
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
installRequirements

View file

@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118

View file

@ -0,0 +1 @@
torch==2.4.1

View file

@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0

View file

@ -0,0 +1,6 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools

View file

@ -0,0 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.70.0
protobuf
certifi
transformers

4
backend/python/autogptq/run.sh Executable file
View file

@ -0,0 +1,4 @@
#!/bin/bash
source $(dirname $0)/../common/libbackend.sh
startBackend $@

View file

@ -0,0 +1,6 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
runUnittests

View file

@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()
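This hunk is one instance of a pattern repeated across the Python backends in this diff: master passes explicit message-size options to `grpc.server`, raising gRPC's default 4 MB cap to 50 MB so large payloads (base64-encoded images or audio) fit in a single message, while v2.26.0 keeps the defaults. Stripped to its essentials, the master-side setup looks like this (a sketch; the diff also sets the legacy `grpc.max_message_length` key, omitted here):

```
from concurrent import futures
import grpc

MAX_MSG_BYTES = 50 * 1024 * 1024  # 50MB, matching the options in the diff

server = grpc.server(
    futures.ThreadPoolExecutor(max_workers=1),
    options=[
        ("grpc.max_send_message_length", MAX_MSG_BYTES),
        ("grpc.max_receive_message_length", MAX_MSG_BYTES),
    ],
)
```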

View file

@ -1,4 +1,4 @@
bark==0.1.5 bark==0.1.5
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
certifi certifi

View file

@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
grpcio-tools grpcio-tools

View file

@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,4 +1,4 @@
transformers==4.48.3 transformers
accelerate accelerate
torch==2.4.1 torch==2.4.1
coqui-tts coqui-tts

View file

@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118 torch==2.4.1+cu118
torchaudio==2.4.1+cu118 torchaudio==2.4.1+cu118
transformers==4.48.3 transformers
accelerate accelerate
coqui-tts coqui-tts

View file

@ -1,5 +1,5 @@
torch==2.4.1 torch==2.4.1
torchaudio==2.4.1 torchaudio==2.4.1
transformers==4.48.3 transformers
accelerate accelerate
coqui-tts coqui-tts

View file

@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0 --extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0 torch==2.4.1+rocm6.0
torchaudio==2.4.1+rocm6.0 torchaudio==2.4.1+rocm6.0
transformers==4.48.3 transformers
accelerate accelerate
coqui-tts coqui-tts

View file

@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu oneccl_bind_pt==2.3.100+xpu
optimum[openvino] optimum[openvino]
setuptools setuptools
transformers==4.48.3 transformers
accelerate accelerate
coqui-tts coqui-tts

View file

@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
certifi certifi
packaging==24.1 packaging==24.1

View file

@ -19,7 +19,7 @@ import grpc
from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \ from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker from diffusers.pipelines.stable_diffusion import safety_checker
from diffusers.utils import load_image, export_to_video from diffusers.utils import load_image, export_to_video
from compel import Compel, ReturnedEmbeddingsType from compel import Compel, ReturnedEmbeddingsType
@ -168,13 +168,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# We are storing all the options in a dict so we can use it later when # We are storing all the options in a dict so we can use it later when
# generating the images # generating the images
for opt in options: for opt in options:
if ":" not in opt:
continue
key, value = opt.split(":") key, value = opt.split(":")
self.options[key] = value self.options[key] = value
print(f"Options: {self.options}", file=sys.stderr)
local = False local = False
modelFile = request.Model modelFile = request.Model
@ -291,12 +287,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.LowVRAM: if request.LowVRAM:
self.pipe.enable_model_cpu_offload() self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "Lumina2Text2ImgPipeline":
self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
request.Model,
torch_dtype=torch.bfloat16)
if request.LowVRAM:
self.pipe.enable_model_cpu_offload()
elif request.PipelineType == "SanaPipeline": elif request.PipelineType == "SanaPipeline":
self.pipe = SanaPipeline.from_pretrained( self.pipe = SanaPipeline.from_pretrained(
request.Model, request.Model,
@ -526,12 +516,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,5 +1,5 @@
setuptools setuptools
grpcio==1.72.0 grpcio==1.70.0
pillow pillow
protobuf protobuf
certifi certifi

View file

@ -105,12 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
certifi certifi
wheel wheel

View file

@ -62,12 +62,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.TranscriptResult(segments=resultSegments, text=text) return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
grpcio-tools grpcio-tools

View file

@ -99,12 +99,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True) return backend_pb2.Result(success=True)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
phonemizer phonemizer
scipy scipy

View file

@ -91,12 +91,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.RerankResult(usage=usage, results=results) return backend_pb2.RerankResult(usage=usage, results=results)
def serve(address): def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address) server.add_insecure_port(address)
server.start() server.start()

View file

@ -1,3 +1,3 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
certifi certifi

View file

@ -559,12 +559,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address): async def serve(address):
# Start asyncio gRPC server # Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server # Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address # Bind the server to the address

View file

@ -5,5 +5,4 @@ accelerate
transformers transformers
bitsandbytes bitsandbytes
outetts outetts
sentence-transformers==3.4.1 sentence-transformers==3.4.1
protobuf==6.31.0

View file

@ -7,4 +7,3 @@ transformers
bitsandbytes bitsandbytes
outetts outetts
sentence-transformers==3.4.1 sentence-transformers==3.4.1
protobuf==6.31.0

View file

@ -6,4 +6,3 @@ transformers
bitsandbytes bitsandbytes
outetts outetts
sentence-transformers==3.4.1 sentence-transformers==3.4.1
protobuf==6.31.0

View file

@ -8,4 +8,3 @@ bitsandbytes
outetts outetts
bitsandbytes bitsandbytes
sentence-transformers==3.4.1 sentence-transformers==3.4.1
protobuf==6.31.0

View file

@ -9,4 +9,3 @@ intel-extension-for-transformers
bitsandbytes bitsandbytes
outetts outetts
sentence-transformers==3.4.1 sentence-transformers==3.4.1
protobuf==6.31.0

View file

@ -1,5 +1,5 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf==6.31.0 protobuf
certifi certifi
setuptools setuptools
scipy==1.15.1 scipy==1.15.1

View file

@ -109,17 +109,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
engine_args.swap_space = request.SwapSpace engine_args.swap_space = request.SwapSpace
if request.MaxModelLen != 0: if request.MaxModelLen != 0:
engine_args.max_model_len = request.MaxModelLen engine_args.max_model_len = request.MaxModelLen
if request.DisableLogStatus:
engine_args.disable_log_status = request.DisableLogStatus
if request.DType != "":
engine_args.dtype = request.DType
if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
# limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
engine_args.limit_mm_per_prompt = {
"image": max(request.LimitImagePerPrompt, 1),
"video": max(request.LimitVideoPerPrompt, 1),
"audio": max(request.LimitAudioPerPrompt, 1)
}
try: try:
self.llm = AsyncLLMEngine.from_engine_args(engine_args) self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@ -194,40 +183,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
await iterations.aclose() await iterations.aclose()
async def _predict(self, request, context, streaming=False): async def _predict(self, request, context, streaming=False):
# Build the sampling parameters
# NOTE: this must stay in sync with the vllm backend
request_to_sampling_params = {
"N": "n",
"PresencePenalty": "presence_penalty",
"FrequencyPenalty": "frequency_penalty",
"RepetitionPenalty": "repetition_penalty",
"Temperature": "temperature",
"TopP": "top_p",
"TopK": "top_k",
"MinP": "min_p",
"Seed": "seed",
"StopPrompts": "stop",
"StopTokenIds": "stop_token_ids",
"BadWords": "bad_words",
"IncludeStopStrInOutput": "include_stop_str_in_output",
"IgnoreEOS": "ignore_eos",
"Tokens": "max_tokens",
"MinTokens": "min_tokens",
"Logprobs": "logprobs",
"PromptLogprobs": "prompt_logprobs",
"SkipSpecialTokens": "skip_special_tokens",
"SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
"TruncatePromptTokens": "truncate_prompt_tokens",
"GuidedDecoding": "guided_decoding",
}
# Build sampling parameters
sampling_params = SamplingParams(top_p=0.9, max_tokens=200) sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
if request.TopP != 0:
for request_field, param_field in request_to_sampling_params.items(): sampling_params.top_p = request.TopP
if hasattr(request, request_field): if request.Tokens > 0:
value = getattr(request, request_field) sampling_params.max_tokens = request.Tokens
if value not in (None, 0, [], False): if request.Temperature != 0:
setattr(sampling_params, param_field, value) sampling_params.temperature = request.Temperature
if request.TopK != 0:
sampling_params.top_k = request.TopK
if request.PresencePenalty != 0:
sampling_params.presence_penalty = request.PresencePenalty
if request.FrequencyPenalty != 0:
sampling_params.frequency_penalty = request.FrequencyPenalty
if request.StopPrompts:
sampling_params.stop = request.StopPrompts
if request.IgnoreEOS:
sampling_params.ignore_eos = request.IgnoreEOS
if request.Seed != 0:
sampling_params.seed = request.Seed
# Extract image paths and process images # Extract image paths and process images
prompt = request.Prompt prompt = request.Prompt
@ -293,7 +269,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def load_image(self, image_path: str): def load_image(self, image_path: str):
""" """
Load an image from the given file path or base64 encoded data. Load an image from the given file path or base64 encoded data.
Args: Args:
image_path (str): The path to the image file or base64 encoded data. image_path (str): The path to the image file or base64 encoded data.
@ -312,7 +288,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def load_video(self, video_path: str): def load_video(self, video_path: str):
""" """
Load a video from the given file path. Load a video from the given file path.
Args: Args:
video_path (str): The path to the image file. video_path (str): The path to the image file.
@ -333,12 +309,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address): async def serve(address):
# Start asyncio gRPC server # Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server # Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address # Bind the server to the address
@ -364,4 +335,4 @@ if __name__ == "__main__":
) )
args = parser.parse_args() args = parser.parse_args()
asyncio.run(serve(args.addr)) asyncio.run(serve(args.addr))
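The sampling-parameter hunk above is the heart of this file: master replaces v2.26.0's chain of per-field `if` checks with a declarative mapping from request fields to `SamplingParams` attributes, applied via `getattr`/`setattr`. A standalone sketch of that approach, with a dataclass standing in for the protobuf request (vLLM is assumed to be installed, as in the backend itself):

```
from dataclasses import dataclass, field

from vllm import SamplingParams  # assumed installed, as in the backend above

# Subset of the mapping from the diff; same naming convention.
REQUEST_TO_SAMPLING_PARAMS = {
    "Temperature": "temperature",
    "TopP": "top_p",
    "TopK": "top_k",
    "Tokens": "max_tokens",
    "StopPrompts": "stop",
}

@dataclass
class FakeRequest:  # stand-in for the real gRPC PredictOptions message
    Temperature: float = 0.7
    TopP: float = 0.9
    TopK: int = 40
    Tokens: int = 64
    StopPrompts: list = field(default_factory=lambda: ["\n"])

def build_sampling_params(request) -> SamplingParams:
    params = SamplingParams(top_p=0.9, max_tokens=200)  # defaults from the diff
    for request_field, param_field in REQUEST_TO_SAMPLING_PARAMS.items():
        value = getattr(request, request_field, None)
        # Skip unset fields: protobuf scalars default to 0/empty.
        if value not in (None, 0, [], False):
            setattr(params, param_field, value)
    return params

print(build_sampling_params(FakeRequest()))
```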

View file

@ -1,4 +1,4 @@
grpcio==1.72.0 grpcio==1.70.0
protobuf protobuf
certifi certifi
setuptools setuptools

View file

@ -75,53 +75,6 @@ class TestBackendServicer(unittest.TestCase):
finally: finally:
self.tearDown() self.tearDown()
def test_sampling_params(self):
"""
This method tests if all sampling parameters are correctly processed
NOTE: this does NOT test for correctness, just that we received a compatible response
"""
try:
self.setUp()
with grpc.insecure_channel("localhost:50051") as channel:
stub = backend_pb2_grpc.BackendStub(channel)
response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
self.assertTrue(response.success)
req = backend_pb2.PredictOptions(
Prompt="The capital of France is",
TopP=0.8,
Tokens=50,
Temperature=0.7,
TopK=40,
PresencePenalty=0.1,
FrequencyPenalty=0.2,
RepetitionPenalty=1.1,
MinP=0.05,
Seed=42,
StopPrompts=["\n"],
StopTokenIds=[50256],
BadWords=["badword"],
IncludeStopStrInOutput=True,
IgnoreEOS=True,
MinTokens=5,
Logprobs=5,
PromptLogprobs=5,
SkipSpecialTokens=True,
SpacesBetweenSpecialTokens=True,
TruncatePromptTokens=10,
GuidedDecoding=True,
N=2,
)
resp = stub.Predict(req)
self.assertIsNotNone(resp.message)
self.assertIsNotNone(resp.logprobs)
except Exception as err:
print(err)
self.fail("sampling params service failed")
finally:
self.tearDown()
def test_embedding(self): def test_embedding(self):
""" """
This method tests if the embeddings are generated successfully This method tests if the embeddings are generated successfully

View file

@ -16,7 +16,7 @@ type Application struct {
func newApplication(appConfig *config.ApplicationConfig) *Application { func newApplication(appConfig *config.ApplicationConfig) *Application {
return &Application{ return &Application{
backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath), backendLoader: config.NewBackendConfigLoader(appConfig.ModelPath),
modelLoader: model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend), modelLoader: model.NewModelLoader(appConfig.ModelPath),
applicationConfig: appConfig, applicationConfig: appConfig,
templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath), templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
} }

View file

@ -43,12 +43,18 @@ func New(opts ...config.AppOption) (*Application, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to create ModelPath: %q", err) return nil, fmt.Errorf("unable to create ModelPath: %q", err)
} }
if options.GeneratedContentDir != "" { if options.ImageDir != "" {
err := os.MkdirAll(options.GeneratedContentDir, 0750) err := os.MkdirAll(options.ImageDir, 0750)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to create ImageDir: %q", err) return nil, fmt.Errorf("unable to create ImageDir: %q", err)
} }
} }
if options.AudioDir != "" {
err := os.MkdirAll(options.AudioDir, 0750)
if err != nil {
return nil, fmt.Errorf("unable to create AudioDir: %q", err)
}
}
if options.UploadDir != "" { if options.UploadDir != "" {
err := os.MkdirAll(options.UploadDir, 0750) err := os.MkdirAll(options.UploadDir, 0750)
if err != nil { if err != nil {
@ -137,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}() }()
} }
if options.LoadToMemory != nil && !options.SingleBackend { if options.LoadToMemory != nil {
for _, m := range options.LoadToMemory { for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options) cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
if err != nil { if err != nil {

View file

@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
var fn func() ([]float32, error) var fn func() ([]float32, error)
switch model := inferenceModel.(type) { switch model := inferenceModel.(type) {

View file

@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
fn := func() error { fn := func() error {
_, err := inferenceModel.GenerateImage( _, err := inferenceModel.GenerateImage(

View file

@ -53,7 +53,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
var protoMessages []*proto.Message var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages // if we are using the tokenizer template, we need to convert the messages to proto messages
@ -117,11 +116,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
} }
if tokenCallback != nil { if tokenCallback != nil {
if c.TemplateConfig.ReplyPrefix != "" {
tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
}
ss := "" ss := ""
var partialRune []byte var partialRune []byte
@ -171,13 +165,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
response := string(reply.Message)
if c.TemplateConfig.ReplyPrefix != "" {
response = c.TemplateConfig.ReplyPrefix + response
}
return LLMResponse{ return LLMResponse{
Response: response, Response: string(reply.Message),
Usage: tokenUsage, Usage: tokenUsage,
}, err }, err
} }

View file

@ -40,6 +40,10 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
grpcOpts := grpcModelOpts(c) grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts)) defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.SingleBackend {
defOpts = append(defOpts, model.WithSingleActiveBackend())
}
if so.ParallelBackendRequests { if so.ParallelBackendRequests {
defOpts = append(defOpts, model.EnableParallelRequests) defOpts = append(defOpts, model.EnableParallelRequests)
} }
@ -99,7 +103,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
mmap = *c.MMap mmap = *c.MMap
} }
ctxSize := 4096 ctxSize := 1024
if c.ContextSize != nil { if c.ContextSize != nil {
ctxSize = *c.ContextSize ctxSize = *c.ContextSize
} }
@ -117,7 +121,8 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
triggers := make([]*pb.GrammarTrigger, 0) triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers { for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{ triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word, Word: t.Word,
AtStart: t.AtStart,
}) })
} }
@ -154,36 +159,35 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
SwapSpace: int32(c.SwapSpace), SwapSpace: int32(c.SwapSpace),
MaxModelLen: int32(c.MaxModelLen), MaxModelLen: int32(c.MaxModelLen),
TensorParallelSize: int32(c.TensorParallelSize), TensorParallelSize: int32(c.TensorParallelSize),
DisableLogStatus: c.DisableLogStatus, MMProj: c.MMProj,
DType: c.DType, FlashAttention: c.FlashAttention,
// LimitMMPerPrompt vLLM CacheTypeKey: c.CacheTypeK,
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt), CacheTypeValue: c.CacheTypeV,
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt), NoKVOffload: c.NoKVOffloading,
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt), YarnExtFactor: c.YarnExtFactor,
MMProj: c.MMProj, YarnAttnFactor: c.YarnAttnFactor,
FlashAttention: c.FlashAttention, YarnBetaFast: c.YarnBetaFast,
CacheTypeKey: c.CacheTypeK, YarnBetaSlow: c.YarnBetaSlow,
CacheTypeValue: c.CacheTypeV, NGQA: c.NGQA,
NoKVOffload: c.NoKVOffloading, RMSNormEps: c.RMSNormEps,
YarnExtFactor: c.YarnExtFactor, MLock: mmlock,
YarnAttnFactor: c.YarnAttnFactor, RopeFreqBase: c.RopeFreqBase,
YarnBetaFast: c.YarnBetaFast, RopeScaling: c.RopeScaling,
YarnBetaSlow: c.YarnBetaSlow, Type: c.ModelType,
NGQA: c.NGQA, RopeFreqScale: c.RopeFreqScale,
RMSNormEps: c.RMSNormEps, NUMA: c.NUMA,
MLock: mmlock, Embeddings: embeddings,
RopeFreqBase: c.RopeFreqBase, LowVRAM: lowVRAM,
RopeScaling: c.RopeScaling, NGPULayers: int32(nGPULayers),
Type: c.ModelType, MMap: mmap,
RopeFreqScale: c.RopeFreqScale, MainGPU: c.MainGPU,
NUMA: c.NUMA, Threads: int32(*c.Threads),
Embeddings: embeddings, TensorSplit: c.TensorSplit,
LowVRAM: lowVRAM, // AutoGPTQ
NGPULayers: int32(nGPULayers), ModelBaseName: c.AutoGPTQ.ModelBaseName,
MMap: mmap, Device: c.AutoGPTQ.Device,
MainGPU: c.MainGPU, UseTriton: c.AutoGPTQ.Triton,
Threads: int32(*c.Threads), UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
TensorSplit: c.TensorSplit,
// RWKV // RWKV
Tokenizer: c.Tokenizer, Tokenizer: c.Tokenizer,
} }

View file

@ -12,10 +12,10 @@ import (
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) { func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...) rerankModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
if rerankModel == nil { if rerankModel == nil {
return nil, fmt.Errorf("could not load rerank model") return nil, fmt.Errorf("could not load rerank model")

View file

@ -26,26 +26,21 @@ func SoundGeneration(
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
soundGenModel, err := loader.Load(opts...) soundGenModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return "", nil, err return "", nil, err
} }
defer loader.Close()
if soundGenModel == nil { if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model") return "", nil, fmt.Errorf("could not load sound generation model")
} }
if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil { if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err) return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
} }
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio") fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
if err := os.MkdirAll(audioDir, 0750); err != nil { filePath := filepath.Join(appConfig.AudioDir, fileName)
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
filePath := filepath.Join(audioDir, fileName)
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{ res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
Text: text, Text: text,

View file

@ -20,7 +20,6 @@ func TokenMetrics(
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer loader.Close()
if model == nil { if model == nil {
return nil, fmt.Errorf("could not loadmodel model") return nil, fmt.Errorf("could not loadmodel model")

View file

@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig) opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...) inferenceModel, err = loader.Load(opts...)
if err != nil { if err != nil {
return schema.TokenizeResponse{}, err return schema.TokenizeResponse{}, err
} }
defer loader.Close()
predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath) predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s predictOptions.Prompt = s

View file

@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer ml.Close()
if transcriptionModel == nil { if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model") return nil, fmt.Errorf("could not load transcription model")

View file

@ -23,22 +23,21 @@ func ModelTTS(
) (string, *proto.Result, error) { ) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend)) opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...) ttsModel, err := loader.Load(opts...)
if err != nil { if err != nil {
return "", nil, err return "", nil, err
} }
defer loader.Close()
if ttsModel == nil { if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model) return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
} }
audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio") if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
if err := os.MkdirAll(audioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err) return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
} }
fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav") fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(audioDir, fileName) filePath := filepath.Join(appConfig.AudioDir, fileName)
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect. // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
// This should be addressed in a follow up PR soon. // This should be addressed in a follow up PR soon.

View file

@ -19,8 +19,6 @@ func VAD(request *schema.VADRequest,
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer ml.Close()
req := proto.VADRequest{ req := proto.VADRequest{
Audio: request.Audio, Audio: request.Audio,
} }

View file

@ -1,36 +0,0 @@
package backend
import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
)
func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err := loader.Load(
opts...,
)
if err != nil {
return nil, err
}
defer loader.Close()
fn := func() error {
_, err := inferenceModel.GenerateVideo(
appConfig.Context,
&proto.GenerateVideoRequest{
Height: height,
Width: width,
Prompt: prompt,
StartImage: startImage,
EndImage: endImage,
Dst: dst,
})
return err
}
return fn, nil
}

View file

@ -1,13 +1,11 @@
package cliContext package cliContext
import ( import "embed"
rice "github.com/GeertJohan/go.rice"
)
type Context struct { type Context struct {
Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"` Debug bool `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"` LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI // This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
BackendAssets *rice.Box `kong:"-"` BackendAssets embed.FS `kong:"-"`
} }

View file

@ -21,7 +21,8 @@ type RunCMD struct {
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"` ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
GeneratedContentPath string `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"` ImagePath string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
AudioPath string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"` UploadPath string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"` ConfigPath string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"` LocalaiConfigDir string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
@ -37,7 +38,7 @@ type RunCMD struct {
F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"` F16 bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"` Threads int `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"` ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"` Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"` CORS bool `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
@ -46,7 +47,7 @@ type RunCMD struct {
CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"` CSRF bool `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"` UploadLimit int `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"` APIKeys []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"` DisableWebUI bool `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"` DisablePredownloadScan bool `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"` OpaqueErrors bool `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"` UseSubtleKeyComparison bool `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
@ -80,7 +81,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
config.WithModelPath(r.ModelsPath), config.WithModelPath(r.ModelsPath),
config.WithContextSize(r.ContextSize), config.WithContextSize(r.ContextSize),
config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel), config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
config.WithGeneratedContentDir(r.GeneratedContentPath), config.WithImageDir(r.ImagePath),
config.WithAudioDir(r.AudioPath),
config.WithUploadDir(r.UploadPath), config.WithUploadDir(r.UploadPath),
config.WithConfigsDir(r.ConfigPath), config.WithConfigsDir(r.ConfigPath),
config.WithDynamicConfigDir(r.LocalaiConfigDir), config.WithDynamicConfigDir(r.LocalaiConfigDir),

View file

@ -70,11 +70,11 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
opts := &config.ApplicationConfig{ opts := &config.ApplicationConfig{
ModelPath: t.ModelsPath, ModelPath: t.ModelsPath,
Context: context.Background(), Context: context.Background(),
GeneratedContentDir: outputDir, AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath, AssetsDestination: t.BackendAssetsPath,
ExternalGRPCBackends: externalBackends, ExternalGRPCBackends: externalBackends,
} }
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
defer func() { defer func() {
err := ml.StopAllGRPC() err := ml.StopAllGRPC()

View file

@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
} }
cl := config.NewBackendConfigLoader(t.ModelsPath) cl := config.NewBackendConfigLoader(t.ModelsPath)
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil { if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
return err return err
} }

View file

@ -36,12 +36,12 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
text := strings.Join(t.Text, " ") text := strings.Join(t.Text, " ")
opts := &config.ApplicationConfig{ opts := &config.ApplicationConfig{
ModelPath: t.ModelsPath, ModelPath: t.ModelsPath,
Context: context.Background(), Context: context.Background(),
GeneratedContentDir: outputDir, AudioDir: outputDir,
AssetsDestination: t.BackendAssetsPath, AssetsDestination: t.BackendAssetsPath,
} }
ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend) ml := model.NewModelLoader(opts.ModelPath)
defer func() { defer func() {
err := ml.StopAllGRPC() err := ml.StopAllGRPC()

Some files were not shown because too many files have changed in this diff.