Compare commits

1 commit

Author:  Ettore Di Giacinto
Commit:  184fbc26bf
Message: Revert "feat: include tokens usage for streamed output (#4282)"
         This reverts commit 0d6c3a7d57.
Date:    2024-12-08 16:31:48 +01:00
476 changed files with 37336 additions and 260603 deletions


@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model_id": "facebook/musicgen-small",
"text": "Exciting 80s Newscast Interstitial",
"duration_seconds": 8
}
}
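For reference, a rough curl equivalent of this request, a sketch assuming the collection's localhost vars resolve to http://localhost:8080; the fields are taken verbatim from the body above:

curl http://localhost:8080/v1/sound-generation \
  -H "Content-Type: application/json" \
  -d '{"model_id": "facebook/musicgen-small", "text": "Exciting 80s Newscast Interstitial", "duration_seconds": 8}'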


@@ -0,0 +1,17 @@
meta {
name: backend monitor
type: http
seq: 4
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
body: json
auth: none
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}


@@ -0,0 +1,21 @@
meta {
name: backend-shutdown
type: http
seq: 3
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}
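The monitor and shutdown requests above map roughly to the following curl calls, assuming http://localhost:8080 and gpt-3.5-turbo for DEFAULT_MODEL as in the localhost vars; note the monitor call sends a JSON body on a GET, mirroring the collection:

# query the status of the backend serving the default model
curl -X GET http://localhost:8080/backend/monitor -H "Content-Type: application/json" -d '{"model": "gpt-3.5-turbo"}'
# ask LocalAI to shut that backend down
curl -X POST http://localhost:8080/backend/shutdown -H "Content-Type: application/json" -d '{"model": "gpt-3.5-turbo"}'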


@@ -0,0 +1,5 @@
{
"version": "1",
"name": "LocalAI Test Requests",
"type": "collection"
}


@@ -0,0 +1,6 @@
vars {
HOST: localhost
PORT: 8080
DEFAULT_MODEL: gpt-3.5-turbo
PROTOCOL: http://
}


@@ -0,0 +1,11 @@
meta {
name: get models list
type: http
seq: 2
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
body: none
auth: none
}
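Equivalent curl call, assuming the localhost vars resolve to http://localhost:8080:

curl http://localhost:8080/models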


@@ -0,0 +1,25 @@
meta {
name: Generate image
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"prompt": "<positive prompt>|<negative prompt>",
"model": "model-name",
"step": 51,
"size": "1024x1024",
"image": ""
}
}
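A rough curl equivalent, assuming http://localhost:8080 from the localhost vars; the prompt placeholders and the literal model-name are kept from the body and would need real values:

curl http://localhost:8080/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{"prompt": "<positive prompt>|<negative prompt>", "model": "model-name", "step": 51, "size": "1024x1024", "image": ""}'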


@@ -0,0 +1,24 @@
meta {
name: -completions
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"prompt": "function downloadFile(string url, string outputPath) {",
"max_tokens": 256,
"temperature": 0.5
}
}
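Approximately the same request with curl, assuming http://localhost:8080 and gpt-3.5-turbo for DEFAULT_MODEL:

curl http://localhost:8080/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "prompt": "function downloadFile(string url, string outputPath) {", "max_tokens": 256, "temperature": 0.5}'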


@@ -0,0 +1,23 @@
meta {
name: -edits
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "What day of the wek is it?",
"instruction": "Fix the spelling mistakes"
}
}
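A curl sketch of the same edit request, with the same localhost and DEFAULT_MODEL assumptions as above (the misspelling "wek" is intentional; it is what the instruction asks the model to fix):

curl http://localhost:8080/edits \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "input": "What day of the wek is it?", "instruction": "Fix the spelling mistakes"}'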


@@ -0,0 +1,22 @@
meta {
name: -embeddings
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}
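Curl equivalent, with the same localhost and DEFAULT_MODEL assumptions:

curl http://localhost:8080/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"}'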


@@ -0,0 +1,30 @@
meta {
name: chat completion -simple- 1 message-
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [
{
"role": "user",
"content": "How could one use friction to cook an egg?"
}
],
"max_tokens": 256,
"temperature": 0.2,
"grammar": ""
}
}
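As a reference, the same chat request with curl, assuming http://localhost:8080 and gpt-3.5-turbo:

curl http://localhost:8080/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "How could one use friction to cook an egg?"}], "max_tokens": 256, "temperature": 0.2}'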


@@ -0,0 +1,29 @@
meta {
name: chat-completions -long-
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
{"role": "user", "content": "How could one use electricity to cook an egg?"},
{"role": "assistant",
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
},
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
"max_tokens": 1024,
"temperature": 0.5
}
}


@@ -0,0 +1,25 @@
meta {
name: chat-completions -stream-
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
"max_tokens": 256,
"temperature": 0.9,
"stream": true
}
}
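Streaming variant with curl; -N disables output buffering so the streamed chunks are printed as they arrive (same localhost and model assumptions):

curl -N http://localhost:8080/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}], "max_tokens": 256, "temperature": 0.9, "stream": true}'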


@@ -0,0 +1,22 @@
meta {
name: add model gallery
type: http
seq: 10
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
"name": "test"
}
}


@@ -0,0 +1,21 @@
meta {
name: delete model gallery
type: http
seq: 11
}
delete {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"name": "test"
}
}


@@ -0,0 +1,11 @@
meta {
name: list MODELS in galleries
type: http
seq: 7
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
body: none
auth: none
}


@@ -0,0 +1,11 @@
meta {
name: list model GALLERIES
type: http
seq: 8
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}


@@ -0,0 +1,11 @@
meta {
name: model delete
type: http
seq: 7
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}


@@ -0,0 +1,21 @@
meta {
name: model gallery apply -gist-
type: http
seq: 12
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
}
}


@@ -0,0 +1,22 @@
meta {
name: model gallery apply
type: http
seq: 9
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
"name": "codellama7b"
}
}
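A curl sketch of the apply call, assuming http://localhost:8080; the id is kept verbatim and refers to an entry of the locally added gallery from the earlier "add model gallery" request:

curl http://localhost:8080/models/apply \
  -H "Content-Type: application/json" \
  -d '{"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin", "name": "codellama7b"}'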

Binary file not shown.


@@ -0,0 +1,16 @@
meta {
name: transcribe
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
body: multipartForm
auth: none
}
body:multipart-form {
file: @file(transcription/gb1.ogg)
model: whisper-1
}
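Curl equivalent of the multipart transcription request, assuming http://localhost:8080; the audio path is relative to the collection:

curl http://localhost:8080/v1/audio/transcriptions \
  -F file=@transcription/gb1.ogg \
  -F model=whisper-1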


@@ -0,0 +1,22 @@
meta {
name: -tts
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}
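Curl equivalent of the TTS request, with the same localhost and DEFAULT_MODEL assumptions:

curl http://localhost:8080/tts \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"}'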


@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"backend": "transformers-musicgen",
"model": "facebook/musicgen-small",
"input": "80s Synths playing Jazz"
}
}
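Curl equivalent of the musicgen-over-/tts request, assuming http://localhost:8080:

curl http://localhost:8080/tts \
  -H "Content-Type: application/json" \
  -d '{"backend": "transformers-musicgen", "model": "facebook/musicgen-small", "input": "80s Synths playing Jazz"}'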


@@ -7,7 +7,7 @@ services:
       args:
         - FFMPEG=true
         - IMAGE_TYPE=extras
-        - GO_TAGS=p2p tts
+        - GO_TAGS=stablediffusion p2p tts
     env_file:
       - ../.env
     ports:

.env

@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
-# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
-# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -41,12 +38,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true
-## Enable go tags, available: p2p, tts
-## p2p: enable distributed inferencing
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
 ## tts: enables text-to-speech with go-piper
 ## (requires REBUILD=true)
 #
-# GO_TAGS=p2p
+# GO_TAGS=stablediffusion
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -76,7 +73,7 @@
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 ### Enable to run parallel requests
@@ -85,15 +82,6 @@
 # Enable to allow p2p mode
 # LOCALAI_P2P=true
-# Enable to use federated mode
-# LOCALAI_FEDERATED=true
-# Enable to start federation server
-# FEDERATED_SERVER=true
-# Define to use federation token
-# TOKEN=""
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time


@@ -29,6 +29,10 @@ updates:
     schedule:
       # Check for updates to GitHub Actions every weekday
       interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/autogptq"
+    schedule:
+      interval: "weekly"
   - package-ecosystem: "pip"
     directory: "/backend/python/bark"
     schedule:
@@ -77,6 +81,14 @@ updates:
     directory: "/backend/python/transformers"
     schedule:
       interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/transformers-musicgen"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/vall-e-x"
+    schedule:
+      interval: "weekly"
   - package-ecosystem: "pip"
     directory: "/backend/python/vllm"
     schedule:

.github/labeler.yml

@@ -1,14 +1,10 @@
-enhancement:
+enhancements:
   - head-branch: ['^feature', 'feature']
 dependencies:
   - any:
     - changed-files:
       - any-glob-to-any-file: 'Makefile'
-    - changed-files:
-      - any-glob-to-any-file: '*.mod'
-    - changed-files:
-      - any-glob-to-any-file: '*.sum'
 kind/documentation:
   - any:


@@ -9,10 +9,10 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - repository: "ggml-org/llama.cpp"
+          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
-          - repository: "ggml-org/whisper.cpp"
+          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
          - repository: "PABannier/bark.cpp"


@@ -14,7 +14,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.4.0
+        uses: dependabot/fetch-metadata@v2.2.0
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           skip-commit-verification: true


@@ -33,7 +33,7 @@ jobs:
         run: |
           CGO_ENABLED=0 make build-api
       - name: rm
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.2.0
         with:
           host: ${{ secrets.EXPLORER_SSH_HOST }}
           username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
         script: |
           sudo rm -rf local-ai/ || true
       - name: copy file via ssh
-        uses: appleboy/scp-action@v1.0.0
+        uses: appleboy/scp-action@v0.1.7
         with:
           host: ${{ secrets.EXPLORER_SSH_HOST }}
           username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
           rm: true
           target: ./local-ai
       - name: restarting
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.2.0
         with:
           host: ${{ secrets.EXPLORER_SSH_HOST }}
           username: ${{ secrets.EXPLORER_SSH_USERNAME }}


@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
 on:
   workflow_dispatch:
-  schedule:
-    # daily at midnight
-    - cron: '0 0 * * *'
+  push:
+    branches:
+      - master
 concurrency:
   group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
       matrix:
         include:
           - grpc-base-image: ubuntu:22.04
-            runs-on: 'arc-runner-set'
+            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:


@@ -15,7 +15,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}


@ -33,7 +33,6 @@ jobs:
# Pushing with all jobs in parallel # Pushing with all jobs in parallel
# eats the bandwidth of all the nodes # eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
fail-fast: false
matrix: matrix:
include: include:
# This is basically covered by the AIO test # This is basically covered by the AIO test
@ -57,35 +56,26 @@ jobs:
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas' # - build-type: 'hipblas'
platforms: 'linux/amd64' # platforms: 'linux/amd64'
tag-latest: 'false' # tag-latest: 'false'
tag-suffix: '-hipblas' # tag-suffix: '-hipblas'
ffmpeg: 'false' # ffmpeg: 'false'
image-type: 'extras' # image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1" # base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" # grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set' # runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" # makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' # - build-type: 'sycl_f16'
platforms: 'linux/amd64' # platforms: 'linux/amd64'
tag-latest: 'false' # tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" # base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" # grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg' # tag-suffix: 'sycl-f16-ffmpeg'
ffmpeg: 'true' # ffmpeg: 'true'
image-type: 'extras' # image-type: 'extras'
runs-on: 'arc-runner-set' # runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" # makeflags: "--jobs=3 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
# core-image-build: # core-image-build:
# uses: ./.github/workflows/image_build.yml # uses: ./.github/workflows/image_build.yml
# with: # with:


@ -45,13 +45,13 @@ jobs:
- build-type: 'hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '-hipblas-extras' tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
aio: "-aio-gpu-hipblas" aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
latest-image: 'latest-gpu-hipblas-extras' latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas' latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
@ -59,13 +59,32 @@ jobs:
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-hipblas' tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
self-hosted-jobs: self-hosted-jobs:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
@ -95,58 +114,110 @@ jobs:
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
matrix: matrix:
include: include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "11" cuda-major-version: "11"
cuda-minor-version: "7" cuda-minor-version: "7"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda11-extras' tag-suffix: '-cublas-cuda11'
ffmpeg: 'true' ffmpeg: ''
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "12" cuda-major-version: "12"
cuda-minor-version: "0" cuda-minor-version: "0"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda12-extras' tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12" aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12-extras' latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12' latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' - build-type: 'sycl_f16'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-extras' tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16" aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16-extras' latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16' latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32' - build-type: 'sycl_f32'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-extras' tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'extras' image-type: 'extras'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32" aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32-extras' latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32' latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
# Core images # Core images
@ -155,23 +226,41 @@ jobs:
tag-latest: 'false' tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16' tag-suffix: '-sycl-f16-core'
ffmpeg: 'true' ffmpeg: 'false'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f16'
- build-type: 'sycl_f32' - build-type: 'sycl_f32'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32' tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f32'
core-image-build: core-image-build:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
@ -191,7 +280,6 @@ jobs:
makeflags: ${{ matrix.makeflags }} makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }} latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }} latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets: secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@ -204,7 +292,7 @@ jobs:
- build-type: '' - build-type: ''
platforms: 'linux/amd64,linux/arm64' platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '' tag-suffix: '-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
@ -213,81 +301,57 @@ jobs:
latest-image: 'latest-cpu' latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu' latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "11" cuda-major-version: "11"
cuda-minor-version: "7" cuda-minor-version: "7"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda11' tag-suffix: '-cublas-cuda11-core'
ffmpeg: 'true' ffmpeg: ''
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "12" cuda-major-version: "12"
cuda-minor-version: "0" cuda-minor-version: "0"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-cublas-cuda12' tag-suffix: '-cublas-cuda12-core'
ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "7"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-nvidia-cuda-12'
- build-type: 'vulkan' - build-type: 'vulkan'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-vulkan' tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
ffmpeg: 'true' ffmpeg: 'true'
image-type: 'core' image-type: 'core'
runs-on: 'arc-runner-set' runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-vulkan'
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'false'
tag-suffix: '-nvidia-l4t-arm64'
latest-image: 'latest-nvidia-l4t-arm64'
ffmpeg: 'true'
image-type: 'core'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'true'


@@ -49,10 +49,6 @@ on:
       description: 'FFMPEG'
       default: ''
       type: string
-    skip-drivers:
-      description: 'Skip drivers by default'
-      default: 'false'
-      type: string
     image-type:
       description: 'Image type'
       default: ''
@@ -238,7 +234,6 @@ jobs:
             GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
             GRPC_VERSION=v1.65.0
             MAKEFLAGS=${{ inputs.makeflags }}
-            SKIP_DRIVERS=${{ inputs.skip-drivers }}
           context: .
           file: ./Dockerfile
           cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
             GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
             GRPC_VERSION=v1.65.0
             MAKEFLAGS=${{ inputs.makeflags }}
-            SKIP_DRIVERS=${{ inputs.skip-drivers }}
           context: .
           file: ./Dockerfile
           cache-from: type=gha
@@ -310,11 +304,6 @@ jobs:
           tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
           labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
-      - name: Cleanup
-        run: |
-          docker builder prune -f
-          docker system prune --force --volumes --all
       - name: Latest tag
         # run this on branches, when it is a tag and there is a latest-image defined
         if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'


@@ -8,7 +8,7 @@ jobs:
   notify-discord:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: gemma-3-12b-it
+      MODEL_NAME: hermes-2-theta-llama-3-8b
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -16,9 +16,9 @@ jobs:
           fetch-depth: 0 # needed to checkout all branches for this Action to work
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.8.0
+      - uses: GrantBirki/git-diff-action@v2.7.0
         id: git-diff-action
         with:
           json_diff_file_output: diff.json
@@ -79,7 +79,7 @@ jobs:
           args: ${{ steps.summarize.outputs.message }}
       - name: Setup tmate session if fails
         if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
         with:
           detached: true
           connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
   notify-twitter:
     if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
     env:
-      MODEL_NAME: gemma-3-12b-it
+      MODEL_NAME: hermes-2-theta-llama-3-8b
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -99,7 +99,7 @@ jobs:
           docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
           until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
       # Check the PR diff using the current branch and the base branch of the PR
-      - uses: GrantBirki/git-diff-action@v2.8.0
+      - uses: GrantBirki/git-diff-action@v2.7.0
         id: git-diff-action
         with:
           json_diff_file_output: diff.json
@@ -161,7 +161,7 @@ jobs:
           TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
       - name: Setup tmate session if fails
         if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
         with:
           detached: true
           connect-timeout-seconds: 180


@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: mudler/localai-github-action@v1
         with:
-          model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
       - name: Summarize
         id: summarize
         run: |
@@ -60,4 +60,4 @@ jobs:
           DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
         uses: Ilshidur/action-discord@master
         with:
-          args: ${{ steps.summarize.outputs.message }}
+          args: ${{ steps.summarize.outputs.message }}


@ -36,7 +36,6 @@ jobs:
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
make install-go-tools
- name: Install CUDA Dependencies - name: Install CUDA Dependencies
run: | run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@ -124,7 +123,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@ -152,7 +151,6 @@ jobs:
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
make install-go-tools
- name: Intel Dependencies - name: Intel Dependencies
run: | run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@ -234,12 +232,45 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
limit-access-to-actor: true limit-access-to-actor: true
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build stablediffusion
run: |
export PATH=$PATH:$GOPATH/bin
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
env:
GO_TAGS: stablediffusion
- uses: actions/upload-artifact@v4
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-x86_64: build-macOS-x86_64:
runs-on: macos-13 runs-on: macos-13
@ -255,7 +286,8 @@ jobs:
- name: Dependencies - name: Dependencies
run: | run: |
brew install protobuf grpc brew install protobuf grpc
make install-go-tools go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
- name: Build - name: Build
id: build id: build
run: | run: |
@ -276,7 +308,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@ -296,7 +328,8 @@ jobs:
- name: Dependencies - name: Dependencies
run: | run: |
brew install protobuf grpc libomp llvm brew install protobuf grpc libomp llvm
make install-go-tools go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Build - name: Build
id: build id: build
run: | run: |
@ -317,7 +350,7 @@ jobs:
release/* release/*
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180


@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.4
+        uses: securego/gosec@v2.21.4
         with:
           # we let the report trigger content trigger a failure using the GitHub Security features.
           args: '-no-fail -fmt sarif -out results.sarif ./...'


@ -35,6 +35,30 @@ jobs:
run: | run: |
make --jobs=5 --output-sync=target -C backend/python/transformers make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test sentencetransformers
run: |
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers: tests-rerankers:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -78,47 +102,78 @@ jobs:
make --jobs=5 --output-sync=target -C backend/python/diffusers make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test make --jobs=5 --output-sync=target -C backend/python/diffusers test
#tests-vllm: tests-parler-tts:
# runs-on: ubuntu-latest runs-on: ubuntu-latest
# steps: steps:
# - name: Clone - name: Clone
# uses: actions/checkout@v4 uses: actions/checkout@v4
# with: with:
# submodules: true submodules: true
# - name: Dependencies - name: Dependencies
# run: | run: |
# sudo apt-get update sudo apt-get update
# sudo apt-get install -y build-essential ffmpeg sudo apt-get install build-essential ffmpeg
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip # Install UV
# sudo apt-get install -y libopencv-dev curl -LsSf https://astral.sh/uv/install.sh | sh
# # Install UV sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# curl -LsSf https://astral.sh/uv/install.sh | sh sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test vllm backend
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
# tests-transformers-musicgen:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test transformers-musicgen - name: Test parler-tts
# run: | run: |
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen make --jobs=5 --output-sync=target -C backend/python/parler-tts
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test make --jobs=5 --output-sync=target -C backend/python/parler-tts test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.19
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test transformers-musicgen
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# tests-bark: # tests-bark:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
@ -205,6 +260,26 @@ jobs:
# run: | # run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm # make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test # make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1
- name: Test vall-e-x
run: |
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui: tests-coqui:
runs-on: ubuntu-latest runs-on: ubuntu-latest


@ -71,7 +71,7 @@ jobs:
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
sudo apt-get install -y libgmock-dev clang sudo apt-get install -y libgmock-dev
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \ curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \ sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \ gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@ -96,17 +96,19 @@ jobs:
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
# The python3-grpc-tools package in 22.04 is too old # The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools pip install --user grpcio-tools
make -C backend/python/transformers sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
# Pre-build piper before we start tests in order to have shared libraries in place # Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \ make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \ GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
env: env:
CUDA_VERSION: 12-4 CUDA_VERSION: 12-4
- name: Cache grpc - name: Cache grpc
@ -128,10 +130,10 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 install cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test - name: Test
run: | run: |
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@ -184,7 +186,6 @@ jobs:
rm protoc.zip rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
PATH="$PATH:$HOME/go/bin" make protogen-go PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images - name: Build images
run: | run: |
@ -196,7 +197,7 @@ jobs:
make run-e2e-aio make run-e2e-aio
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@ -224,7 +225,6 @@ jobs:
run: | run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
pip install --user --no-cache-dir grpcio-tools pip install --user --no-cache-dir grpcio-tools
go install github.com/GeertJohan/go.rice/rice@latest
- name: Test - name: Test
run: | run: |
export C_INCLUDE_PATH=/usr/local/include export C_INCLUDE_PATH=/usr/local/include
@ -235,7 +235,7 @@ jobs:
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.19
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180

.vscode/launch.json

@@ -26,7 +26,7 @@
                 "LOCALAI_P2P": "true",
                 "LOCALAI_FEDERATED": "true"
             },
-            "buildFlags": ["-tags", "p2p tts", "-v"],
+            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
             "envFile": "${workspaceFolder}/.env",
             "cwd": "${workspaceRoot}"
         }


@ -15,7 +15,8 @@ ARG TARGETARCH
ARG TARGETVARIANT ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh" ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
@ -24,7 +25,6 @@ RUN apt-get update && \
ca-certificates \ ca-certificates \
curl libssl-dev \ curl libssl-dev \
git \ git \
git-lfs \
unzip upx-ucl && \ unzip upx-ucl && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
@ -46,10 +46,9 @@ EOT
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers and rice # Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/ COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates RUN update-ca-certificates
@ -70,10 +69,14 @@ ENV PATH=/opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion # OpenBLAS requirements and stable diffusion
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
libopenblas-dev && \ libopenblas-dev \
libopencv-dev && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build WORKDIR /build
################################### ###################################
@ -112,13 +115,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
ARG BUILD_TYPE ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12 ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0 ARG CUDA_MINOR_VERSION=0
ARG SKIP_DRIVERS=false
ENV BUILD_TYPE=${BUILD_TYPE} ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements # Vulkan requirements
RUN <<EOT bash RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then if [ "${BUILD_TYPE}" = "vulkan" ]; then
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \ software-properties-common pciutils wget gpg-agent && \
@ -134,7 +136,7 @@ EOT
# CuBLAS requirements # CuBLAS requirements
RUN <<EOT bash RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then if [ "${BUILD_TYPE}" = "cublas" ]; then
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
software-properties-common pciutils software-properties-common pciutils
@ -160,7 +162,7 @@ RUN <<EOT bash
EOT EOT
# If we are building with clblas support, we need the libraries for the builds # If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
libclblast-dev && \ libclblast-dev && \
@ -168,7 +170,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
rm -rf /var/lib/apt/lists/* \ rm -rf /var/lib/apt/lists/* \
; fi ; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
hipblas-dev \ hipblas-dev \
@ -248,7 +250,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
FROM requirements-drivers AS builder-base FROM requirements-drivers AS builder-base
ARG GO_TAGS="tts p2p" ARG GO_TAGS="stablediffusion tts p2p"
ARG GRPC_BACKENDS ARG GRPC_BACKENDS
ARG MAKEFLAGS ARG MAKEFLAGS
ARG LD_FLAGS="-s -w" ARG LD_FLAGS="-s -w"
@ -282,12 +284,35 @@ RUN <<EOT bash
fi fi
EOT EOT
###################################
###################################
# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
FROM builder-base AS builder-sd
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
COPY Makefile .
COPY go.mod .
COPY go.sum .
COPY backend/backend.proto ./backend/backend.proto
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
COPY pkg/grpc ./pkg/grpc
COPY pkg/stablediffusion ./pkg/stablediffusion
RUN git init
RUN make sources/go-stable-diffusion
RUN touch prepare-sources
# Actually build the backend
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
################################### ###################################
################################### ###################################
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry. # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here. # Adjustments to the build process should likely be made here.
FROM builder-base AS builder FROM builder-sd AS builder
# Install the pre-built GRPC # Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local COPY --from=grpc /opt/grpc /usr/local
@ -301,10 +326,11 @@ COPY .git .
RUN make prepare RUN make prepare
## Build the binary ## Build the binary
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space ## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
## Otherwise just run the normal build ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \ ## (both will use CUDA or hipblas for the actual computation)
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \ RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
else \ else \
make build; \ make build; \
fi fi
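Usage note: the hunk above only changes which llama.cpp variants are skipped inside the image build; the same effect can be reproduced in a local build. A minimal sketch, assuming the Makefile honours SKIP_GRPC_BACKEND the same way this Dockerfile step does:

```bash
# Skip the plain AVX llama.cpp variants and build everything else.
SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build
```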
@ -326,6 +352,8 @@ ARG FFMPEG
COPY --from=grpc /opt/grpc /usr/local COPY --from=grpc /opt/grpc /usr/local
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
COPY .devcontainer-scripts /.devcontainer-scripts COPY .devcontainer-scripts /.devcontainer-scripts
# Add FFmpeg # Add FFmpeg
@ -398,28 +426,36 @@ COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper # Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/ COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below # Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size # We try to strike a balance between individual layer size (as that affects total push time) and total image size
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer # Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer # Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
apt-get -qq -y install espeak-ng \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/coqui \ make -C backend/python/coqui \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/faster-whisper \ make -C backend/python/parler-tts \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/diffusers \ make -C backend/python/diffusers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi ; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/kokoro \ make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/sentencetransformers \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \ make -C backend/python/exllama2 \
@ -431,11 +467,17 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \ make -C backend/python/vllm \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \ make -C backend/python/bark \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \ make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi ; fi
# Make sure the models directory exists # Make sure the models directory exists
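Usage note: the EXTRA_BACKENDS checks above decide which Python backends get baked into the extras image. A hypothetical build invocation selecting just two of them (build-arg names taken from this Dockerfile; the image tag and space-separated list format are assumptions):

```bash
# Build an extras image containing only the coqui and diffusers Python backends.
docker build \
  --build-arg IMAGE_TYPE=extras \
  --build-arg EXTRA_BACKENDS="coqui diffusers" \
  -t localai:extras-custom .
```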
LICENSE
@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io) Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
Makefile
@ -6,26 +6,33 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true DETECT_LIBS?=true
# llama.cpp versions # llama.cpp versions
CPPLLAMA_VERSION?=6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=cc98896db858df7aa40d0e16a505883ef196a482
# whisper.cpp version # whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=d1f114da61b1ae1e70b03104fad42c9dd666feeb WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
# go-piper version # go-piper version
PIPER_REPO?=https://github.com/mudler/go-piper PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0 PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
# stablediffusion version
STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
# tinydream version
TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
# bark.cpp # bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0 BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml) # stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae STABLEDIFFUSION_GGML_VERSION?=4570715727f35e5a07a76796d823824c8f42206c
# ONEAPI variables for SYCL
export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
ONNX_VERSION?=1.20.0 ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64 ONNX_ARCH?=x64
@ -33,12 +40,8 @@ ONNX_OS?=linux
export BUILD_TYPE?= export BUILD_TYPE?=
export STABLE_BUILD_TYPE?=$(BUILD_TYPE) export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF export CMAKE_ARGS?=
export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
export BACKEND_LIBS?= export BACKEND_LIBS?=
export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
CGO_LDFLAGS?= CGO_LDFLAGS?=
CGO_LDFLAGS_WHISPER?= CGO_LDFLAGS_WHISPER?=
@ -88,7 +91,6 @@ endif
# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
ifeq ($(NATIVE),false) ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF CMAKE_ARGS+=-DGGML_NATIVE=OFF
WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif endif
# Detect if we are running on arm64 # Detect if we are running on arm64
@ -116,31 +118,13 @@ ifeq ($(OS),Darwin)
# disable metal if on Darwin and any other value is explicitly passed. # disable metal if on Darwin and any other value is explicitly passed.
else ifneq ($(BUILD_TYPE),metal) else ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF CMAKE_ARGS+=-DGGML_METAL=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
export GGML_NO_ACCELERATE=1 export GGML_NO_ACCELERATE=1
export GGML_NO_METAL=1 export GGML_NO_METAL=1
GO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif endif
ifeq ($(BUILD_TYPE),metal) ifeq ($(BUILD_TYPE),metal)
# -lcblas removed: it seems to always be listed as a duplicate flag.
CGO_LDFLAGS += -framework Accelerate CGO_LDFLAGS += -framework Accelerate
CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
else
CGO_LDFLAGS_WHISPER+=-lggml-blas
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
endif endif
else else
CGO_LDFLAGS_WHISPER+=-lgomp CGO_LDFLAGS_WHISPER+=-lgomp
@ -152,29 +136,21 @@ ifeq ($(BUILD_TYPE),openblas)
endif endif
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export GGML_CUDA=1 export GGML_CUDA=1
CMAKE_ARGS+=-DGGML_CUDA=ON CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
endif endif
ifeq ($(BUILD_TYPE),vulkan) ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=1 CMAKE_ARGS+=-DGGML_VULKAN=1
WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
endif endif
ifneq (,$(findstring sycl,$(BUILD_TYPE))) ifneq (,$(findstring sycl,$(BUILD_TYPE)))
export GGML_SYCL=1 export GGML_SYCL=1
CMAKE_ARGS+=-DGGML_SYCL=ON
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)
export GGML_SYCL_F16=1 export GGML_SYCL_F16=1
CMAKE_ARGS+=-DGGML_SYCL_F16=ON
endif endif
ifeq ($(BUILD_TYPE),hipblas) ifeq ($(BUILD_TYPE),hipblas)
@ -183,9 +159,10 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++ export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE= export STABLE_BUILD_TYPE=
export GGML_HIP=1 export GGML_HIP=1
GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
AMDGPU_TARGETS ?= "$(GPU_TARGETS)" AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)" CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
@ -206,6 +183,16 @@ ifeq ($(STATIC),true)
LD_FLAGS+=-linkmode external -extldflags -static LD_FLAGS+=-linkmode external -extldflags -static
endif endif
ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
# OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
endif
ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
# OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
OPTIONAL_GRPC+=backend-assets/grpc/tinydream
endif
ifeq ($(findstring tts,$(GO_TAGS)),tts) ifeq ($(findstring tts,$(GO_TAGS)),tts)
# OPTIONAL_TARGETS+=go-piper/libpiper_binding.a # OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
# OPTIONAL_TARGETS+=backend-assets/espeak-ng-data # OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
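For context, these GO_TAGS blocks gate which optional gRPC backends are appended to OPTIONAL_GRPC. A sketch of how a tagged build is invoked, using the two tag sets that appear on either side of this comparison:

```bash
# Tag set from one side of the diff: piper TTS and P2P support only.
make build GO_TAGS="tts p2p"

# Tag set from the other side, which additionally enables the Go stablediffusion backend.
make build GO_TAGS="stablediffusion tts p2p"
```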
@ -217,8 +204,8 @@ endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
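The list above is the default set of gRPC backends compiled by `make build`. A smaller build can override it, mirroring what the build-minimal target further down in this Makefile does; a minimal sketch:

```bash
# Build only the AVX2 llama.cpp variant plus whisper (asset paths as listed above).
make build GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2 backend-assets/grpc/whisper"
```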
@ -252,6 +239,19 @@ endif
all: help all: help
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp ## bark.cpp
sources/bark.cpp: sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \ git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@ -282,6 +282,19 @@ sources/go-piper:
sources/go-piper/libpiper_binding.a: sources/go-piper sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o $(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## stable diffusion (onnx)
sources/go-stable-diffusion:
mkdir -p sources/go-stable-diffusion
cd sources/go-stable-diffusion && \
git init && \
git remote add origin $(STABLEDIFFUSION_REPO) && \
git fetch origin && \
git checkout $(STABLEDIFFUSION_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## stablediffusion (ggml) ## stablediffusion (ggml)
sources/stablediffusion-ggml.cpp: sources/stablediffusion-ggml.cpp:
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \ git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
@ -289,12 +302,22 @@ sources/stablediffusion-ggml.cpp:
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \ git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch git submodule update --init --recursive --depth 1 --single-branch
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp
$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a cd sources/stablediffusion-ggml.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion-ggml
endif
sources/onnxruntime: sources/onnxruntime:
mkdir -p sources/onnxruntime mkdir -p sources/onnxruntime
@ -310,6 +333,19 @@ else
mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1 mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
endif endif
## tiny-dream
sources/go-tiny-dream:
mkdir -p sources/go-tiny-dream
cd sources/go-tiny-dream && \
git init && \
git remote add origin $(TINYDREAM_REPO) && \
git fetch origin && \
git checkout $(TINYDREAM_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
$(MAKE) -C sources/go-tiny-dream libtinydream.a
## whisper ## whisper
sources/whisper.cpp: sources/whisper.cpp:
mkdir -p sources/whisper.cpp mkdir -p sources/whisper.cpp
@ -320,21 +356,26 @@ sources/whisper.cpp:
git checkout $(WHISPER_CPP_VERSION) && \ git checkout $(WHISPER_CPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch git submodule update --init --recursive --depth 1 --single-branch
sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
cd sources/whisper.cpp/build && cmake --build . --config Release
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
replace: replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace: dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace prepare-sources: get-sources replace
$(GOCMD) mod download $(GOCMD) mod download
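The replace/dropreplace targets above rewire go.mod to the vendored sources, while the new whisper.cpp rule builds the library with CMake instead of the in-tree Makefile. Reproduced standalone as a sketch (assuming the checkout already exists under sources/whisper.cpp):

```bash
cd sources/whisper.cpp
cmake -DBUILD_SHARED_LIBS=OFF . -B ./build
cmake --build ./build --config Release
# The static library lands in build/src/libwhisper.a, which the whisper gRPC backend links against.
```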
@ -342,8 +383,11 @@ prepare-sources: get-sources replace
## GENERIC ## GENERIC
rebuild: ## Rebuilds the project rebuild: ## Rebuilds the project
$(GOCMD) clean -cache $(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean $(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-piper clean $(MAKE) -C sources/go-piper clean
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build $(MAKE) build
prepare: prepare-sources $(OPTIONAL_TARGETS) prepare: prepare-sources $(OPTIONAL_TARGETS)
@ -372,14 +416,8 @@ clean-tests:
clean-dc: clean clean-dc: clean
cp -r /build/backend-assets /workspace/backend-assets cp -r /build/backend-assets /workspace/backend-assets
## Install Go tools
install-go-tools:
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install github.com/GeertJohan/go.rice/rice@latest
## Build: ## Build:
build: prepare backend-assets grpcs install-go-tools ## Build the project build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I local-ai build info:${RESET}) $(info ${GREEN}I local-ai build info:${RESET})
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET}) $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET}) $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@ -389,9 +427,7 @@ ifneq ($(BACKEND_LIBS),)
$(MAKE) backend-assets/lib $(MAKE) backend-assets/lib
cp -f $(BACKEND_LIBS) backend-assets/lib/ cp -f $(BACKEND_LIBS) backend-assets/lib/
endif endif
rm -rf $(BINARY_NAME) || true
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./ CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
rice append --exec $(BINARY_NAME)
build-minimal: build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@ -453,7 +489,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml: test-models/testmodel.ggml:
mkdir test-models mkdir test-models
mkdir test-dir mkdir test-dir
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@ -463,13 +499,13 @@ prepare-test: grpcs
cp -rf backend-assets core/http cp -rf backend-assets core/http
cp tests/models_fixtures/* test-models cp tests/models_fixtures/* test-models
## Test targets
test: prepare test-models/testmodel.ggml grpcs test: prepare test-models/testmodel.ggml grpcs
@echo 'Running tests' @echo 'Running tests'
export GO_TAGS="tts debug" export GO_TAGS="tts stablediffusion debug"
$(MAKE) prepare-test $(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
$(MAKE) test-llama-gguf $(MAKE) test-llama-gguf
$(MAKE) test-tts $(MAKE) test-tts
$(MAKE) test-stablediffusion $(MAKE) test-stablediffusion
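The Ginkgo label filters above split the suite into groups that can also be run individually through their dedicated targets, for example:

```bash
# Run just the GGUF llama tests, or just the TTS tests, without the full suite.
make test-llama-gguf
make test-tts
```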
@ -498,6 +534,10 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests) docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \ TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS) $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@ -538,7 +578,7 @@ protogen: protogen-go protogen-python
protogen-clean: protogen-go-clean protogen-python-clean protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go .PHONY: protogen-go
protogen-go: install-go-tools protogen-go:
mkdir -p pkg/grpc/proto mkdir -p pkg/grpc/proto
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto backend/backend.proto
@ -549,10 +589,18 @@ protogen-go-clean:
$(RM) bin/* $(RM) bin/*
.PHONY: protogen-python .PHONY: protogen-python
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
.PHONY: protogen-python-clean .PHONY: protogen-python-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen
.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
.PHONY: bark-protogen .PHONY: bark-protogen
bark-protogen: bark-protogen:
@ -578,14 +626,6 @@ diffusers-protogen:
diffusers-protogen-clean: diffusers-protogen-clean:
$(MAKE) -C backend/python/diffusers protogen-clean $(MAKE) -C backend/python/diffusers protogen-clean
.PHONY: faster-whisper-protogen
faster-whisper-protogen:
$(MAKE) -C backend/python/faster-whisper protogen
.PHONY: faster-whisper-protogen-clean
faster-whisper-protogen-clean:
$(MAKE) -C backend/python/faster-whisper protogen-clean
.PHONY: exllama2-protogen .PHONY: exllama2-protogen
exllama2-protogen: exllama2-protogen:
$(MAKE) -C backend/python/exllama2 protogen $(MAKE) -C backend/python/exllama2 protogen
@ -594,6 +634,14 @@ exllama2-protogen:
exllama2-protogen-clean: exllama2-protogen-clean:
$(MAKE) -C backend/python/exllama2 protogen-clean $(MAKE) -C backend/python/exllama2 protogen-clean
.PHONY: mamba-protogen
mamba-protogen:
$(MAKE) -C backend/python/mamba protogen
.PHONY: mamba-protogen-clean
mamba-protogen-clean:
$(MAKE) -C backend/python/mamba protogen-clean
.PHONY: rerankers-protogen .PHONY: rerankers-protogen
rerankers-protogen: rerankers-protogen:
$(MAKE) -C backend/python/rerankers protogen $(MAKE) -C backend/python/rerankers protogen
@ -602,6 +650,14 @@ rerankers-protogen:
rerankers-protogen-clean: rerankers-protogen-clean:
$(MAKE) -C backend/python/rerankers protogen-clean $(MAKE) -C backend/python/rerankers protogen-clean
.PHONY: sentencetransformers-protogen
sentencetransformers-protogen:
$(MAKE) -C backend/python/sentencetransformers protogen
.PHONY: sentencetransformers-protogen-clean
sentencetransformers-protogen-clean:
$(MAKE) -C backend/python/sentencetransformers protogen-clean
.PHONY: transformers-protogen .PHONY: transformers-protogen
transformers-protogen: transformers-protogen:
$(MAKE) -C backend/python/transformers protogen $(MAKE) -C backend/python/transformers protogen
@ -610,13 +666,37 @@ transformers-protogen:
transformers-protogen-clean: transformers-protogen-clean:
$(MAKE) -C backend/python/transformers protogen-clean $(MAKE) -C backend/python/transformers protogen-clean
.PHONY: kokoro-protogen .PHONY: parler-tts-protogen
kokoro-protogen: parler-tts-protogen:
$(MAKE) -C backend/python/kokoro protogen $(MAKE) -C backend/python/parler-tts protogen
.PHONY: kokoro-protogen-clean .PHONY: parler-tts-protogen-clean
kokoro-protogen-clean: parler-tts-protogen-clean:
$(MAKE) -C backend/python/kokoro protogen-clean $(MAKE) -C backend/python/parler-tts protogen-clean
.PHONY: transformers-musicgen-protogen
transformers-musicgen-protogen:
$(MAKE) -C backend/python/transformers-musicgen protogen
.PHONY: transformers-musicgen-protogen-clean
transformers-musicgen-protogen-clean:
$(MAKE) -C backend/python/transformers-musicgen protogen-clean
.PHONY: vall-e-x-protogen
vall-e-x-protogen:
$(MAKE) -C backend/python/vall-e-x protogen
.PHONY: vall-e-x-protogen-clean
vall-e-x-protogen-clean:
$(MAKE) -C backend/python/vall-e-x protogen-clean
.PHONY: openvoice-protogen
openvoice-protogen:
$(MAKE) -C backend/python/openvoice protogen
.PHONY: openvoice-protogen-clean
openvoice-protogen-clean:
$(MAKE) -C backend/python/openvoice protogen-clean
.PHONY: vllm-protogen .PHONY: vllm-protogen
vllm-protogen: vllm-protogen:
@ -629,25 +709,28 @@ vllm-protogen-clean:
## GRPC ## GRPC
# Note: it is duplicated in the Dockerfile # Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark $(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui $(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers $(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/faster-whisper
$(MAKE) -C backend/python/vllm $(MAKE) -C backend/python/vllm
$(MAKE) -C backend/python/mamba
$(MAKE) -C backend/python/sentencetransformers
$(MAKE) -C backend/python/rerankers $(MAKE) -C backend/python/rerankers
$(MAKE) -C backend/python/transformers $(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/kokoro $(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/exllama2 $(MAKE) -C backend/python/exllama2
prepare-test-extra: protogen-python prepare-test-extra: protogen-python
$(MAKE) -C backend/python/transformers $(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/diffusers $(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/vllm
test-extra: prepare-test-extra test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test $(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/diffusers test $(MAKE) -C backend/python/diffusers test
$(MAKE) -C backend/python/vllm test
backend-assets: backend-assets:
mkdir -p backend-assets mkdir -p backend-assets
@ -708,13 +791,6 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2 cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx512
$(MAKE) -C backend/cpp/llama-avx512 purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-avx cp -rf backend/cpp/llama backend/cpp/llama-avx
$(MAKE) -C backend/cpp/llama-avx purge $(MAKE) -C backend/cpp/llama-avx purge
@ -768,6 +844,13 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/ mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@ -782,6 +865,13 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper $(UPX) backend-assets/grpc/piper
endif endif
backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1 backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \ CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
@ -789,8 +879,15 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad $(UPX) backend-assets/grpc/silero-vad
endif endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \ CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/tinydream
endif
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
ifneq ($(UPX),) ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper $(UPX) backend-assets/grpc/whisper
@ -842,8 +939,7 @@ docker-aio-all:
docker-image-intel: docker-image-intel:
docker build \ docker build \
--progress plain \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \ --build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@ -851,7 +947,7 @@ docker-image-intel:
docker-image-intel-xpu: docker-image-intel-xpu:
docker build \ docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \ --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \ --build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \ --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@ -863,7 +959,7 @@ swagger:
.PHONY: gen-assets .PHONY: gen-assets
gen-assets: gen-assets:
$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets $(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
## Documentation ## Documentation
docs/layouts/_default: docs/layouts/_default:
README.md
@ -1,6 +1,7 @@
<h1 align="center"> <h1 align="center">
<br> <br>
<img height="300" src="./core/http/static/logo.png"> <br> <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br> <br>
</h1> </h1>
@ -30,7 +31,7 @@
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
</a> </a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank"> <a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/> <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
@ -38,159 +39,40 @@
</p> </p>
<p align="center"> <p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/1484" target="_blank"><img src="https://trendshift.io/api/badge/repositories/1484" alt="go-skynet%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p> </p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/) > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
> >
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on > [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai) [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler). **LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart

Run the installer script:

```bash
# Basic installation
curl https://localai.io/install.sh | sh
```

For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).

Or run with docker:

### CPU only image:

```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```

### NVIDIA GPU Images:

```bash
# CUDA 12.0 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

# CUDA 12.0 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12-extras

# CUDA 11.7 with core features
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11

# CUDA 11.7 with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11-extras

# NVIDIA Jetson (L4T) ARM64
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
```

### AMD GPU Images (ROCm):

```bash
# ROCm with core features
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas

# ROCm with extra Python dependencies
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas-extras
```

### Intel GPU Images (oneAPI):

```bash
# Intel GPU with FP16 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16

# Intel GPU with FP16 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16-extras

# Intel GPU with FP32 support
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32

# Intel GPU with FP32 support and extra dependencies
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32-extras
```

### Vulkan GPU Images:

```bash
# Vulkan with core features
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```

### AIO Images (pre-downloaded models):

```bash
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

# NVIDIA CUDA 12 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12

# NVIDIA CUDA 11 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11

# Intel GPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16

# AMD GPU version
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
```

For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).

To load models:

```bash
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
local-ai run oci://localai/phi-2:latest
```

For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html). A quick smoke test against the API is sketched below.
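Once the server is running, a quick way to confirm that the API responds is to list the available models and send a single chat request. The snippet below is only a minimal sketch: the model name `phi-2` is an assumption taken from the loading example above and should be replaced with whatever model you actually loaded.

```bash
# List the models the server currently exposes
curl http://localhost:8080/v1/models

# Minimal chat completion against a loaded model (replace "phi-2" as needed)
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "phi-2",
    "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
    "temperature": 0.2
  }'
```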
## 📰 Latest project news

- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, breaking changes, new backends (kokoro, OuteTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 🔥🔥 Hot topics (looking for help):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations

Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision

A huge thank you to our generous sponsors who support this project covering CI expenses:

<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>

View file

@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
embeddings: true
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:
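The curl command for this embeddings model is cut off in this hunk. As a minimal sketch of what such a test could look like against the OpenAI-compatible embeddings endpoint (the input string is just an illustrative placeholder):

```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-ada-002",
    "input": "LocalAI can serve embeddings for vector databases"
  }'
```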

View file

@ -1,17 +1,56 @@
name: stablediffusion name: stablediffusion
backend: stablediffusion-ggml backend: stablediffusion
cfg_scale: 4.5
options:
- sampler:euler
parameters: parameters:
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf model: stablediffusion_assets
step: 25
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files: download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" - filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f" sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: | usage: |
curl http://localhost:8080/v1/images/generations \ curl http://localhost:8080/v1/images/generations \
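The usage example above is truncated in this view. A complete request against the image generation endpoint might look like the following sketch; the prompt and size are placeholder values, and the exact set of accepted fields depends on the backend in use:

```bash
curl http://localhost:8080/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "stablediffusion",
    "prompt": "a cute baby sea otter",
    "size": "512x512"
  }'
```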

View file

@ -1,57 +1,101 @@
context_size: 8192
f16: true
function:
grammar:
no_mixed_free_string: true
schema_type: llama3.1 # or JSON is supported too (json)
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- <|eot_id|> - "</tool_call>"
- <|end_of_text|> - "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|> {{.Input -}}
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|> <|im_start|>assistant
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: | chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
{{ else if eq .RoleName "tool" -}} <tool_call>
The Function was executed and the response was: {{- else if eq .RoleName "tool" }}
{{ end -}} <tool_response>
{{ if .Content -}} {{- end }}
{{.Content -}} {{- if .Content}}
{{ else if .FunctionCall -}} {{.Content }}
{{ range .FunctionCall }} {{- end }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})] {{- if .FunctionCall}}
{{ end }} {{toJson .FunctionCall}}
{{ end -}} {{- end }}
<|eot_id|> {{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|start_header_id|>system<|end_header_id|> <|im_start|>system
You are an expert in composing functions. You are given a question and a set of possible functions. You are a function calling AI model.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. Here are the available tools:
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections. <tools>
If you decide to invoke any of the function(s), you MUST put it in the format as follows: {{range .Functions}}
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)] {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
You SHOULD NOT include any other text in the response. {{end}}
Here is a list of functions in JSON format that you can invoke. </tools>
{{toJson .Functions}} You should call the tools provided to you sequentially
<|eot_id|><|start_header_id|>user<|end_header_id|> Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
{{.Input}} <scratchpad>
<|eot_id|><|start_header_id|>assistant<|end_header_id|> {step-by-step reasoning and plan in bullet points}
</scratchpad>
download_files: For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf <tool_call>
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 {"arguments": <args-dict>, "name": <function-name>}
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf </tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
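This model configuration wires the `gpt-4` alias up for OpenAI-style function/tool calling. A request that exercises it could look like the sketch below; the `get_weather` tool is purely illustrative and not part of the configuration above:

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "What is the weather like in Rome?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```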

View file

@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View file

@ -1,49 +1,31 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: bakllava.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: bakllava.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: bakllava-mmproj.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View file

@ -129,7 +129,7 @@ detect_gpu
detect_gpu_size detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}" export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars check_vars
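The entrypoint above assembles the AIO model list from the detected profile. One way to trim or extend that list is to override `MODELS` when starting the container; a hedged sketch (the two YAML paths are examples of the per-profile files referenced in the script):

```bash
# Start the AIO CPU image with a reduced model set
docker run -ti --name local-ai -p 8080:8080 \
  -e MODELS="/aio/cpu/text-to-text.yaml,/aio/cpu/embeddings.yaml" \
  localai/localai:latest-aio-cpu
```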

View file

@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
backend: sentencetransformers
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: all-MiniLM-L6-v2
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:

View file

@ -1,53 +1,101 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf

View file

@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View file

@ -1,49 +1,35 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View file

@ -1,7 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
backend: sentencetransformers
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: all-MiniLM-L6-v2
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:

View file

@ -1,53 +1,103 @@
context_size: 4096
f16: true
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: false
context_size: 8192
f16: false
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf

View file

@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View file

@ -1,50 +1,35 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true mmap: false
mmap: true f16: false
mmproj: minicpm-v-2_6-mmproj-f16.gguf
name: gpt-4o name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View file

@ -1,15 +1,6 @@
package main package main
import ( import "embed"
rice "github.com/GeertJohan/go.rice"
)
var backendAssets *rice.Box //go:embed backend-assets/*
var backendAssets embed.FS
func init() {
var err error
backendAssets, err = rice.FindBox("backend-assets")
if err != nil {
panic(err)
}
}

View file

@ -14,7 +14,6 @@ service Backend {
rpc PredictStream(PredictOptions) returns (stream Reply) {} rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {} rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {} rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {} rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {} rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {} rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@ -160,12 +159,6 @@ message Reply {
bytes message = 1; bytes message = 1;
int32 tokens = 2; int32 tokens = 2;
int32 prompt_tokens = 3; int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
}
message GrammarTrigger {
string word = 1;
} }
message ModelOptions { message ModelOptions {
@ -191,7 +184,11 @@ message ModelOptions {
int32 NGQA = 20; int32 NGQA = 20;
string ModelFile = 21; string ModelFile = 21;
// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;
// Diffusers // Diffusers
string PipelineType = 26; string PipelineType = 26;
@ -225,11 +222,6 @@ message ModelOptions {
int32 MaxModelLen = 54; int32 MaxModelLen = 54;
int32 TensorParallelSize = 55; int32 TensorParallelSize = 55;
string LoadFormat = 58; string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41; string MMProj = 41;
@ -250,11 +242,6 @@ message ModelOptions {
repeated float LoraScales = 61; repeated float LoraScales = 61;
repeated string Options = 62; repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
} }
message Result { message Result {
@ -302,19 +289,6 @@ message GenerateImageRequest {
int32 CLIPSkip = 11; int32 CLIPSkip = 11;
} }
message GenerateVideoRequest {
string prompt = 1;
string start_image = 2; // Path or base64 encoded image for the start frame
string end_image = 3; // Path or base64 encoded image for the end frame
int32 width = 4;
int32 height = 5;
int32 num_frames = 6; // Number of frames to generate
int32 fps = 7; // Frames per second
int32 seed = 8;
float cfg_scale = 9; // Classifier-free guidance scale
string dst = 10; // Output path for the generated video
}
message TTSRequest { message TTSRequest {
string text = 1; string text = 1;
string model = 2; string model = 2;
@ -371,4 +345,4 @@ message StatusResponse {
message Message { message Message {
string role = 1; string role = 1;
string content = 2; string content = 2;
} }

View file

@ -1,17 +1,17 @@
## XXX: In some versions of CMake clip wasn't being built before llama. ## XXX: In some versions of CMake clip wasn't being built before llama.
## This is an hack for now, but it should be fixed in the future. ## This is an hack for now, but it should be fixed in the future.
# set(TARGET myclip) set(TARGET myclip)
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h) add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
# install(TARGETS ${TARGET} LIBRARY) install(TARGETS ${TARGET} LIBRARY)
# target_include_directories(myclip PUBLIC .) target_include_directories(myclip PUBLIC .)
# target_include_directories(myclip PUBLIC ../..) target_include_directories(myclip PUBLIC ../..)
# target_include_directories(myclip PUBLIC ../../common) target_include_directories(myclip PUBLIC ../../common)
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
# if (NOT MSVC) if (NOT MSVC)
# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
# endif() endif()
# END CLIP hack # END CLIP hack
@ -74,12 +74,8 @@ add_library(hw_grpc_proto
${hw_proto_srcs} ${hw_proto_srcs}
${hw_proto_hdrs} ) ${hw_proto_hdrs} )
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h) add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse absl::flags_parse
gRPC::${_REFLECTION} gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP} gRPC::${_GRPC_GRPCPP}

View file

@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server TARGET?=--target grpc-server
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas) ifeq ($(BUILD_TYPE),cublas)
@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
endif endif
ifeq ($(BUILD_TYPE),sycl_f16) ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif endif
ifeq ($(BUILD_TYPE),sycl_f32) ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \ CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif endif
llama.cpp: llama.cpp:
@ -59,8 +52,8 @@ llama.cpp:
git checkout -b build $(LLAMA_VERSION) && \ git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server mkdir -p llama.cpp/examples/grpc-server
bash prepare.sh bash prepare.sh
rebuild: rebuild:
@ -70,13 +63,13 @@ rebuild:
purge: purge:
rm -rf llama.cpp/build rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server rm -rf llama.cpp/examples/grpc-server
rm -rf grpc-server rm -rf grpc-server
clean: purge clean: purge
rm -rf llama.cpp rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)" @echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE))) ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \ +bash -c "source $(ONEAPI_VARS); \
@ -84,4 +77,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
else else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) +cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif endif
cp llama.cpp/build/bin/grpc-server . cp llama.cpp/build/bin/grpc-server .

File diff suppressed because it is too large.

backend/cpp/llama/json.hpp vendored Normal file

File diff suppressed because it is too large.

View file

@ -1,13 +1,13 @@
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..6c5e811a 100644 index 342042ff..224db9b5 100644
--- a/tools/mtmd/clip.cpp --- a/examples/llava/clip.cpp
+++ b/tools/mtmd/clip.cpp +++ b/examples/llava/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima @@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches)); int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) { for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1; - patches_data[i] = i + 1;
+ patches_data[i] = i; + patches_data[i] = i;
} }
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data); free(patches_data);

View file

@ -7,46 +7,21 @@ for patch in $(ls patches); do
patch -d llama.cpp/ -p1 < patches/$patch patch -d llama.cpp/ -p1 < patches/$patch
done done
set -e cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/ cp -rfv json.hpp llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ cp -rfv utils.hpp llama.cpp/examples/grpc-server/
cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/ if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
echo "grpc-server already added" echo "grpc-server already added"
else else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
fi fi
set -e
# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes ## XXX: In some versions of CMake clip wasn't being built before llama.
# and remove the main function ## This is an hack for now, but it should be fixed in the future.
# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
awk ' cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
/int[ \t]+main[ \t]*\(/ { # If the line starts the main function echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
in_main=1; # Set a flag cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
open_braces=0; # Track number of open braces cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
}
in_main {
open_braces += gsub(/\{/, "{"); # Count opening braces
open_braces -= gsub(/\}/, "}"); # Count closing braces
if (open_braces == 0) { # If all braces are closed
in_main=0; # End skipping
}
next; # Skip lines inside main
}
!in_main # Print lines not inside main
' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
# remove index.html.gz.hpp and loading.html.hpp includes
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
else
# Linux and others
sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
fi

backend/cpp/llama/utils.hpp vendored Normal file
View file

@ -0,0 +1,483 @@
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we can't find use of more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
    // Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
            // all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
// add a multitask by specifying the id of all subtask (subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
    // update the remaining subtasks, while appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
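// Editor's note: the helper below is an illustrative sketch added to this page (it is not
// part of the diff). It shows how llama_server_queue and llama_server_response are meant
// to cooperate, assuming the task_server / task_result / task_multi structs defined
// earlier in this file and that start_loop() is already running on a dedicated thread so
// the registered callbacks fire.
static task_result example_round_trip(llama_server_queue & queue_tasks, llama_server_response & queue_results) {
    // worker side (normally wired up once at startup): consume each task and publish a result
    queue_tasks.on_new_task([&](task_server & task) {
        task_result res;
        res.id           = task.id;
        res.multitask_id = -1;   // not attached to a multitask
        queue_results.send(res);
    });
    queue_tasks.on_finish_multitask([](task_multi &) { /* aggregate subtask results here */ });
    queue_tasks.on_all_tasks_finished([]() { /* per-iteration bookkeeping */ });
    // caller side: post a task, then block until its result is delivered
    task_server task;
    task.id = queue_tasks.get_new_id();
    queue_results.add_waiting_task_id(task.id);
    queue_tasks.post(task);
    task_result out = queue_results.recv(task.id);   // blocks until send() matches this id
    queue_results.remove_waiting_task_id(task.id);
    return out;
}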
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
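// Editor's note: a short usage example for the decoder above, added for illustration only
// (it is not part of the original diff).
static std::string base64_decode_example() {
    // "aGVsbG8=" is the standard base64 encoding of "hello"
    const std::vector<uint8_t> bytes = base64_decode("aGVsbG8=");
    return std::string(bytes.begin(), bytes.end());   // yields "hello"
}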

View file

@ -48,7 +48,7 @@ int tts(char *text,int threads, char *dst ) {
 // generate audio
 if (!bark_generate_audio(c, text, threads)) {
-fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
+fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
 return 1;
 }

View file

@ -2,134 +2,20 @@ INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
-CMAKE_ARGS?=
 BUILD_TYPE?=
-ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
-GOCMD?=go
-CGO_LDFLAGS?=
-# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
-CGO_LDFLAGS_SYCL=
-GO_TAGS?=
-LD_FLAGS?=
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
-# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
-ifeq ($(BUILD_TYPE),cublas)
-CMAKE_ARGS+=-DSD_CUDA=ON
-# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# to CMAKE_ARGS automatically
-else ifeq ($(BUILD_TYPE),openblas)
-CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-else ifeq ($(BUILD_TYPE),clblas)
-CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
-else ifeq ($(BUILD_TYPE),hipblas)
-CMAKE_ARGS+=-DSD_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
-# But if it's OSX without metal, disable it here
-else ifeq ($(OS),Darwin)
-ifneq ($(BUILD_TYPE),metal)
-CMAKE_ARGS+=-DSD_METAL=OFF
-else
-CMAKE_ARGS+=-DSD_METAL=ON
-CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-TARGET+=--target ggml-metal
-endif
-endif
-ifeq ($(BUILD_TYPE),sycl_f16)
-CMAKE_ARGS+=-DGGML_SYCL=ON \
--DCMAKE_C_COMPILER=icx \
--DCMAKE_CXX_COMPILER=icpx \
--DSD_SYCL=ON \
--DGGML_SYCL_F16=ON
-CC=icx
-CXX=icpx
-CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
-ifeq ($(BUILD_TYPE),sycl_f32)
-CMAKE_ARGS+=-DGGML_SYCL=ON \
--DCMAKE_C_COMPILER=icx \
--DCMAKE_CXX_COMPILER=icpx \
--DSD_SYCL=ON
-CC=icx
-CXX=icpx
-CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
-CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
-CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
-CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
-endif
 # warnings
-# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-# Find all .a archives in ARCHIVE_DIR
-# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
-GGML_ARCHIVE_DIR := build/ggml/src/
-ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
-# Name of the single merged library
-COMBINED_LIB := libggmlall.a
-# Rule to merge all the .a files into one
-$(COMBINED_LIB): $(ALL_ARCHIVES)
-@echo "Merging all .a into $(COMBINED_LIB)"
-rm -f $@
-mkdir -p merge-tmp
-for a in $(ALL_ARCHIVES); do \
-( cd merge-tmp && ar x ../$$a ); \
-done
-( cd merge-tmp && ar rcs ../$@ *.o )
-# Ensure we have a proper index
-ranlib $@
-# Clean up
-rm -rf merge-tmp
-build/libstable-diffusion.a:
-@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-+bash -c "source $(ONEAPI_VARS); \
-mkdir -p build && \
-cd build && \
-cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
-cmake --build . --config Release"
-else
-mkdir -p build && \
-cd build && \
-cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
-cmake --build . --config Release
-endif
-$(MAKE) $(COMBINED_LIB)
 gosd.o:
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-+bash -c "source $(ONEAPI_VARS); \
-$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
-else
 $(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
-endif
 libsd.a: gosd.o
-cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
+cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
 $(AR) rcs libsd.a gosd.o
-stablediffusion-ggml:
-CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
-CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
-$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
-ifneq ($(UPX),)
-$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
-endif
 clean:
-rm -rf gosd.o libsd.a build $(COMBINED_LIB)
+rm -f gosd.o libsd.a

View file

@ -35,8 +35,6 @@ const char* sample_method_str[] = {
 "ipndm",
 "ipndm_v",
 "lcm",
-"ddim_trailing",
-"tcd",
 };
 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@ -175,7 +173,6 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
 -1, //clip_skip
 cfg_scale, // sfg_scale
 3.5f,
-0, // eta
 width,
 height,
 sample_method,

View file

@ -1,7 +1,7 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
-// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
+// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
 // #include <gosd.h>
 // #include <stdlib.h>
 import "C"

View file

@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View file

@ -0,0 +1,33 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/stablediffusion"
)
type Image struct {
base.SingleThread
stablediffusion *stablediffusion.StableDiffusion
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.stablediffusion.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Mode),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View file

@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View file

@ -0,0 +1,32 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/tinydream"
)
type Image struct {
base.SingleThread
tinydream *tinydream.TinyDream
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.tinydream, err = tinydream.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.tinydream.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}

View file

@ -0,0 +1,204 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View file

@ -0,0 +1,19 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View file

@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
 }
 func isNormalized(k []float32) bool {
-var sum float64
+var sum float32
 for _, v := range k {
-v64 := float64(v)
-sum += v64*v64
+sum += v
 }
-s := math.Sqrt(sum)
-return s >= 0.99 && s <= 1.01
+return sum == 1.0
 }
 // TODO: This we could replace with handwritten SIMD code
@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
 dot += k1[i] * k2[i]
 }
-assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
+assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
 // 2.0 * (1.0 - dot) would be the Euclidean distance
 return dot
@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
 sim := float32(dot / (mag1 * math.Sqrt(mag2)))
-assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
+assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
 return sim
 }
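Editor's note on the isNormalized hunk above: one side accumulates squared components in float64 and accepts a small tolerance around 1.0, while the other sums the float32 components and compares the result exactly to 1.0. The short standalone C++ sketch below (not taken from the repository) illustrates why exact floating-point equality on an accumulated value is fragile even for a correctly normalized vector:

#include <cmath>
#include <cstdio>

int main() {
    // Normalize a small vector in single precision, then re-accumulate its squared norm.
    float v[3] = {1.0f, 2.0f, 3.0f};
    const float norm = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    float sum = 0.0f;
    for (float & x : v) {
        x /= norm;
        sum += x * x;
    }
    // Rounding error can leave this slightly above or below 1.0f, so an exact comparison
    // may fail even though the vector is normalized; a tolerance band is the robust check.
    printf("sum of squares = %.9f, equals 1.0f exactly? %s\n", sum, sum == 1.0f ? "yes" : "no");
    return 0;
}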

View file

@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
 context.SetTranslate(true)
 }
-if err := context.Process(data, nil, nil, nil); err != nil {
+if err := context.Process(data, nil, nil); err != nil {
 return pb.TranscriptResult{}, err
 }

View file

@ -1,9 +1,6 @@
-.DEFAULT_GOAL := install
-.PHONY: install
-install:
+.PHONY: autogptq
+autogptq: protogen
 bash install.sh
-$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@ -13,7 +10,7 @@ protogen-clean:
 $(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-bash protogen.sh
+python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean

View file

@ -0,0 +1,5 @@
# Creating a separate environment for the autogptq project
```
make autogptq
```

View file

@ -0,0 +1,153 @@
#!/usr/bin/env python3
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS is specified in the environment, use it; otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
try:
device = "cuda:0"
if request.Device != "":
device = request.Device
# support loading local model files
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
# support model `Qwen/Qwen-VL-Chat-Int4`
if "qwen-vl" in request.Model.lower():
self.model_name = "Qwen-VL-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=request.TrustRemoteCode,
device_map="auto").eval()
else:
model = AutoGPTQForCausalLM.from_quantized(model_path,
model_basename=request.ModelBaseName,
use_safetensors=True,
trust_remote_code=request.TrustRemoteCode,
device=device,
use_triton=request.UseTriton,
quantize_config=None)
self.model = model
self.tokenizer = tokenizer
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
penalty = 1.0
if request.Penalty != 0.0:
penalty = request.Penalty
tokens = 512
if request.Tokens != 0:
tokens = request.Tokens
top_p = 0.95
if request.TopP != 0.0:
top_p = request.TopP
prompt_images = self.recompile_vl_prompt(request)
compiled_prompt = prompt_images[0]
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
# Implement Predict RPC
pipeline = TextGenerationPipeline(
model=self.model,
tokenizer=self.tokenizer,
max_new_tokens=tokens,
temperature=request.Temperature,
top_p=top_p,
repetition_penalty=penalty,
)
t = pipeline(compiled_prompt)[0]["generated_text"]
print(f"generated_text: {t}", file=sys.stderr)
if compiled_prompt in t:
t = t.replace(compiled_prompt, "")
        # housekeeping: remove the image files from the /tmp folder
for img_path in prompt_images[1]:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
def PredictStream(self, request, context):
# Implement PredictStream RPC
#for reply in some_data_generator():
# yield reply
# Not implemented yet
return self.Predict(request, context)
def recompile_vl_prompt(self, request):
prompt = request.Prompt
image_paths = []
if "qwen-vl" in self.model_name.lower():
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
# Then, save the image file paths to an array "image_paths".
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
for i, img in enumerate(request.Images):
timestamp = str(int(time.time() * 1000)) # Generate timestamp
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
with open(img_path, "wb") as f:
f.write(base64.b64decode(img))
image_paths.append(img_path)
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
else:
prompt = request.Prompt
return (prompt, image_paths)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View file

@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118

View file

@ -0,0 +1 @@
torch==2.4.1

View file

@ -1,3 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
-faster-whisper
+torch==2.4.1+rocm6.0

View file

@ -0,0 +1,5 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406

View file

@ -0,0 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.68.1
protobuf
certifi
transformers

View file

@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 return backend_pb2.Result(success=True)
 def serve(address):
-server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-options=[
-('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
-('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
-('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
-])
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
 backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
 server.add_insecure_port(address)
 server.start()

View file

@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-torchaudio==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
+torchaudio
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate

View file

@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.72.0
+grpcio==1.68.1
 protobuf
 certifi

View file

@ -17,9 +17,6 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
-PYTHON_VERSION="3.10"
 function init() {
 # Name of the backend (directory name)
 BACKEND_NAME=${PWD##*/}
@ -91,7 +88,7 @@ function getBuildProfile() {
 # always result in an activated virtual environment
 function ensureVenv() {
 if [ ! -d "${EDIR}/venv" ]; then
-uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+uv venv ${EDIR}/venv
 echo "virtualenv created"
 fi

View file

@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
 optimum[openvino]

View file

@ -1,3 +1,3 @@
-grpcio==1.72.0
+grpcio==1.68.1
 protobuf
 grpcio-tools

View file

@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 return backend_pb2.Result(success=True)
 def serve(address):
-server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
-options=[
-('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
-('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
-('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
-])
+server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
 backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
 server.add_insecure_port(address)
 server.start()

Some files were not shown because too many files have changed in this diff.