Mirror of https://github.com/mudler/LocalAI.git, synced 2025-05-20 10:35:01 +00:00
Compare commits
473 commits
Commit table (Author | SHA1 | Date): 473 rows, from 04a3d8e5ac to 20119fc580. Only the abbreviated commit SHAs were preserved in this view; the avatar, author, and date columns did not survive the export.
235 changed files with 11873 additions and 31408 deletions
.env (5 changes)

@@ -29,6 +29,9 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
 
+# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
+# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
+
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -73,7 +76,7 @@
 
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 
 ### Enable to run parallel requests
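The two settings added above pair with the existing single-backend switch, and LLAMACPP_GRPC_SERVERS (unchanged here apart from the updated README link) takes a comma-separated list of host:port pairs pointing at llama.cpp RPC workers. A minimal sketch of a resulting .env, with illustrative addresses and assuming the worker processes are started separately with the rpc-server tool documented in the linked tools/rpc directory:

```bash
# Sketch only: addresses and ports are illustrative, not part of this diff.
# Keep a single backend loaded and let LocalAI stop a busy backend when switching models.
LOCALAI_SINGLE_ACTIVE_BACKEND=true
LOCALAI_FORCE_BACKEND_SHUTDOWN=true

# Distribute llama-cpp work across two RPC workers (host:port, comma-separated);
# the workers come from the llama.cpp rpc-server described in tools/rpc/README.md.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
```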
.github/dependabot.yml (4 changes, vendored)

@@ -29,10 +29,6 @@ updates:
 schedule:
 # Check for updates to GitHub Actions every weekday
 interval: "weekly"
-- package-ecosystem: "pip"
-directory: "/backend/python/autogptq"
-schedule:
-interval: "weekly"
 - package-ecosystem: "pip"
 directory: "/backend/python/bark"
 schedule:
.github/workflows/bump_deps.yaml (4 changes, vendored)

@@ -9,10 +9,10 @@ jobs:
 fail-fast: false
 matrix:
 include:
-- repository: "ggerganov/llama.cpp"
+- repository: "ggml-org/llama.cpp"
 variable: "CPPLLAMA_VERSION"
 branch: "master"
-- repository: "ggerganov/whisper.cpp"
+- repository: "ggml-org/whisper.cpp"
 variable: "WHISPER_CPP_VERSION"
 branch: "master"
 - repository: "PABannier/bark.cpp"
.github/workflows/dependabot_auto.yml (2 changes, vendored)

@@ -14,7 +14,7 @@ jobs:
 steps:
 - name: Dependabot metadata
 id: metadata
-uses: dependabot/fetch-metadata@v2.3.0
+uses: dependabot/fetch-metadata@v2.4.0
 with:
 github-token: "${{ secrets.GITHUB_TOKEN }}"
 skip-commit-verification: true
.github/workflows/deploy-explorer.yaml (6 changes, vendored)

@@ -33,7 +33,7 @@ jobs:
 run: |
 CGO_ENABLED=0 make build-api
 - name: rm
-uses: appleboy/ssh-action@v1.2.0
+uses: appleboy/ssh-action@v1.2.2
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
 script: |
 sudo rm -rf local-ai/ || true
 - name: copy file via ssh
-uses: appleboy/scp-action@v0.1.7
+uses: appleboy/scp-action@v1.0.0
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
 rm: true
 target: ./local-ai
 - name: restarting
-uses: appleboy/ssh-action@v1.2.0
+uses: appleboy/ssh-action@v1.2.2
 with:
 host: ${{ secrets.EXPLORER_SSH_HOST }}
 username: ${{ secrets.EXPLORER_SSH_USERNAME }}
.github/workflows/generate_intel_image.yaml (2 changes, vendored)

@@ -15,7 +15,7 @@ jobs:
 strategy:
 matrix:
 include:
-- base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+- base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
 runs-on: 'ubuntu-latest'
 platforms: 'linux/amd64'
 runs-on: ${{matrix.runs-on}}
.github/workflows/image-pr.yml (50 changes, vendored)
@ -33,6 +33,7 @@ jobs:
|
||||||
# Pushing with all jobs in parallel
|
# Pushing with all jobs in parallel
|
||||||
# eats the bandwidth of all the nodes
|
# eats the bandwidth of all the nodes
|
||||||
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
|
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
|
||||||
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
# This is basically covered by the AIO test
|
# This is basically covered by the AIO test
|
||||||
|
@ -56,26 +57,35 @@ jobs:
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# - build-type: 'hipblas'
|
- build-type: 'hipblas'
|
||||||
# platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
# tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
# tag-suffix: '-hipblas'
|
tag-suffix: '-hipblas'
|
||||||
# ffmpeg: 'false'
|
ffmpeg: 'false'
|
||||||
# image-type: 'extras'
|
image-type: 'extras'
|
||||||
# base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
# grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
# runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
# makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# - build-type: 'sycl_f16'
|
- build-type: 'sycl_f16'
|
||||||
# platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
# tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
# base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
# grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
# tag-suffix: 'sycl-f16-ffmpeg'
|
tag-suffix: 'sycl-f16-ffmpeg'
|
||||||
# ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
# image-type: 'extras'
|
image-type: 'extras'
|
||||||
# runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
# makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'false'
|
||||||
|
tag-suffix: '-vulkan-ffmpeg-core'
|
||||||
|
ffmpeg: 'true'
|
||||||
|
image-type: 'core'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:22.04"
|
||||||
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
# core-image-build:
|
# core-image-build:
|
||||||
# uses: ./.github/workflows/image_build.yml
|
# uses: ./.github/workflows/image_build.yml
|
||||||
# with:
|
# with:
|
||||||
|
|
.github/workflows/image.yml (167 changes, vendored)
@ -45,13 +45,13 @@ jobs:
|
||||||
- build-type: 'hipblas'
|
- build-type: 'hipblas'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'auto'
|
||||||
tag-suffix: '-hipblas-ffmpeg'
|
tag-suffix: '-hipblas-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
aio: "-aio-gpu-hipblas"
|
aio: "-aio-gpu-hipblas"
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
latest-image: 'latest-gpu-hipblas'
|
latest-image: 'latest-gpu-hipblas-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-hipblas'
|
latest-image-aio: 'latest-aio-gpu-hipblas'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
@ -59,32 +59,13 @@ jobs:
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-hipblas'
|
tag-suffix: '-hipblas'
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'extras'
|
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'hipblas'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-hipblas-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'hipblas'
|
latest-image: 'latest-gpu-hipblas'
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-hipblas-core'
|
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "rocm/dev-ubuntu-22.04:6.1"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
self-hosted-jobs:
|
self-hosted-jobs:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
with:
|
with:
|
||||||
|
@ -114,110 +95,58 @@ jobs:
|
||||||
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
# Extra images
|
|
||||||
- build-type: ''
|
|
||||||
#platforms: 'linux/amd64,linux/arm64'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: ''
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: ''
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: '-ffmpeg'
|
|
||||||
ffmpeg: 'true'
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "11"
|
cuda-major-version: "11"
|
||||||
cuda-minor-version: "7"
|
cuda-minor-version: "7"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda11'
|
tag-suffix: '-cublas-cuda11-extras'
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "12"
|
|
||||||
cuda-minor-version: "0"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda12'
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "11"
|
|
||||||
cuda-minor-version: "7"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: '-cublas-cuda11-ffmpeg'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
aio: "-aio-gpu-nvidia-cuda-11"
|
aio: "-aio-gpu-nvidia-cuda-11"
|
||||||
latest-image: 'latest-gpu-nvidia-cuda-11'
|
latest-image: 'latest-gpu-nvidia-cuda-11-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
|
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "12"
|
cuda-major-version: "12"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda12-ffmpeg'
|
tag-suffix: '-cublas-cuda12-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
aio: "-aio-gpu-nvidia-cuda-12"
|
aio: "-aio-gpu-nvidia-cuda-12"
|
||||||
latest-image: 'latest-gpu-nvidia-cuda-12'
|
latest-image: 'latest-gpu-nvidia-cuda-12-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
|
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: ''
|
|
||||||
#platforms: 'linux/amd64,linux/arm64'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'auto'
|
|
||||||
tag-suffix: ''
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'extras'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f16'
|
- build-type: 'sycl_f16'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f16-ffmpeg'
|
tag-suffix: '-sycl-f16-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
aio: "-aio-gpu-intel-f16"
|
aio: "-aio-gpu-intel-f16"
|
||||||
latest-image: 'latest-gpu-intel-f16'
|
latest-image: 'latest-gpu-intel-f16-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-intel-f16'
|
latest-image-aio: 'latest-aio-gpu-intel-f16'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
- build-type: 'sycl_f32'
|
- build-type: 'sycl_f32'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f32-ffmpeg'
|
tag-suffix: '-sycl-f32-extras'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'extras'
|
image-type: 'extras'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
aio: "-aio-gpu-intel-f32"
|
aio: "-aio-gpu-intel-f32"
|
||||||
latest-image: 'latest-gpu-intel-f32'
|
latest-image: 'latest-gpu-intel-f32-extras'
|
||||||
latest-image-aio: 'latest-aio-gpu-intel-f32'
|
latest-image-aio: 'latest-aio-gpu-intel-f32'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
# Core images
|
# Core images
|
||||||
|
@ -226,41 +155,23 @@ jobs:
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f16-core'
|
tag-suffix: '-sycl-f16'
|
||||||
ffmpeg: 'false'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-intel-f16'
|
||||||
- build-type: 'sycl_f32'
|
- build-type: 'sycl_f32'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
||||||
grpc-base-image: "ubuntu:22.04"
|
grpc-base-image: "ubuntu:22.04"
|
||||||
tag-suffix: '-sycl-f32-core'
|
tag-suffix: '-sycl-f32'
|
||||||
ffmpeg: 'false'
|
|
||||||
image-type: 'core'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f16'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
tag-suffix: '-sycl-f16-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
|
||||||
image-type: 'core'
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
|
||||||
- build-type: 'sycl_f32'
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
|
|
||||||
grpc-base-image: "ubuntu:22.04"
|
|
||||||
tag-suffix: '-sycl-f32-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
makeflags: "--jobs=3 --output-sync=target"
|
makeflags: "--jobs=3 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-intel-f32'
|
||||||
|
|
||||||
core-image-build:
|
core-image-build:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
|
@ -293,7 +204,7 @@ jobs:
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
platforms: 'linux/amd64,linux/arm64'
|
platforms: 'linux/amd64,linux/arm64'
|
||||||
tag-latest: 'auto'
|
tag-latest: 'auto'
|
||||||
tag-suffix: '-ffmpeg-core'
|
tag-suffix: ''
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
|
@ -308,60 +219,38 @@ jobs:
|
||||||
cuda-minor-version: "7"
|
cuda-minor-version: "7"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda11-core'
|
tag-suffix: '-cublas-cuda11'
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
|
||||||
skip-drivers: 'false'
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "12"
|
|
||||||
cuda-minor-version: "0"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda12-core'
|
|
||||||
ffmpeg: ''
|
|
||||||
image-type: 'core'
|
|
||||||
base-image: "ubuntu:22.04"
|
|
||||||
runs-on: 'arc-runner-set'
|
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
|
||||||
skip-drivers: 'false'
|
|
||||||
- build-type: 'cublas'
|
|
||||||
cuda-major-version: "11"
|
|
||||||
cuda-minor-version: "7"
|
|
||||||
platforms: 'linux/amd64'
|
|
||||||
tag-latest: 'false'
|
|
||||||
tag-suffix: '-cublas-cuda11-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
|
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "12"
|
cuda-major-version: "12"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-cublas-cuda12-ffmpeg-core'
|
tag-suffix: '-cublas-cuda12'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-nvidia-cuda-12'
|
||||||
- build-type: 'vulkan'
|
- build-type: 'vulkan'
|
||||||
platforms: 'linux/amd64'
|
platforms: 'linux/amd64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-vulkan-ffmpeg-core'
|
tag-suffix: '-vulkan'
|
||||||
latest-image: 'latest-vulkan-ffmpeg-core'
|
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
runs-on: 'arc-runner-set'
|
runs-on: 'arc-runner-set'
|
||||||
base-image: "ubuntu:22.04"
|
base-image: "ubuntu:22.04"
|
||||||
skip-drivers: 'false'
|
skip-drivers: 'false'
|
||||||
makeflags: "--jobs=4 --output-sync=target"
|
makeflags: "--jobs=4 --output-sync=target"
|
||||||
|
latest-image: 'latest-gpu-vulkan'
|
||||||
gh-runner:
|
gh-runner:
|
||||||
uses: ./.github/workflows/image_build.yml
|
uses: ./.github/workflows/image_build.yml
|
||||||
with:
|
with:
|
||||||
|
@ -394,8 +283,8 @@ jobs:
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
platforms: 'linux/arm64'
|
platforms: 'linux/arm64'
|
||||||
tag-latest: 'false'
|
tag-latest: 'false'
|
||||||
tag-suffix: '-nvidia-l4t-arm64-core'
|
tag-suffix: '-nvidia-l4t-arm64'
|
||||||
latest-image: 'latest-nvidia-l4t-arm64-core'
|
latest-image: 'latest-nvidia-l4t-arm64'
|
||||||
ffmpeg: 'true'
|
ffmpeg: 'true'
|
||||||
image-type: 'core'
|
image-type: 'core'
|
||||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
|
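The image.yml matrix above reshuffles the published tag names: ffmpeg is now always enabled, the old -ffmpeg, -ffmpeg-core, and -core suffixes are dropped, images built with the Python extras gain an explicit -extras suffix (for example latest-gpu-nvidia-cuda-12-extras, latest-gpu-hipblas-extras, latest-gpu-intel-f16-extras), and the plain GPU tags now point at the core images. A hedged example of pulling under the new scheme, assuming the quay.io/go-skynet/local-ai registry path the project publishes to:

```bash
# Illustrative tag names taken from the matrix above; the registry path is an assumption.
docker pull quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12          # core CUDA 12 image, ffmpeg included
docker pull quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12-extras   # same, plus the Python extra backends
```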
.github/workflows/image_build.yml (5 changes, vendored)

@@ -310,6 +310,11 @@ jobs:
 tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
 labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
 
+- name: Cleanup
+run: |
+docker builder prune -f
+docker system prune --force --volumes --all
+
 - name: Latest tag
 # run this on branches, when it is a tag and there is a latest-image defined
 if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
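The new Cleanup step reclaims disk on the self-hosted runners after each image build by dropping the BuildKit cache and any unused images, containers, and volumes. The same commands can be run by hand on a runner that is low on space; `docker system df` is not part of this diff but is a convenient way to see how much a prune will reclaim first:

```bash
# Inspect reclaimable space, then prune the build cache and unused Docker objects,
# mirroring the workflow's Cleanup step.
docker system df
docker builder prune -f
docker system prune --force --volumes --all
```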
.github/workflows/notify-models.yaml (10 changes, vendored)
@ -8,7 +8,7 @@ jobs:
|
||||||
notify-discord:
|
notify-discord:
|
||||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||||
env:
|
env:
|
||||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
MODEL_NAME: gemma-3-12b-it
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@ -16,7 +16,7 @@ jobs:
|
||||||
fetch-depth: 0 # needed to checkout all branches for this Action to work
|
fetch-depth: 0 # needed to checkout all branches for this Action to work
|
||||||
- uses: mudler/localai-github-action@v1
|
- uses: mudler/localai-github-action@v1
|
||||||
with:
|
with:
|
||||||
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
|
||||||
# Check the PR diff using the current branch and the base branch of the PR
|
# Check the PR diff using the current branch and the base branch of the PR
|
||||||
- uses: GrantBirki/git-diff-action@v2.8.0
|
- uses: GrantBirki/git-diff-action@v2.8.0
|
||||||
id: git-diff-action
|
id: git-diff-action
|
||||||
|
@ -79,7 +79,7 @@ jobs:
|
||||||
args: ${{ steps.summarize.outputs.message }}
|
args: ${{ steps.summarize.outputs.message }}
|
||||||
- name: Setup tmate session if fails
|
- name: Setup tmate session if fails
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -87,7 +87,7 @@ jobs:
|
||||||
notify-twitter:
|
notify-twitter:
|
||||||
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
|
||||||
env:
|
env:
|
||||||
MODEL_NAME: hermes-2-theta-llama-3-8b
|
MODEL_NAME: gemma-3-12b-it
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
@ -161,7 +161,7 @@ jobs:
|
||||||
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
|
||||||
- name: Setup tmate session if fails
|
- name: Setup tmate session if fails
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
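Both notification jobs above switch the summarizing model from hermes-2-theta-llama-3-8b to gemma-3-12b-it, installed through mudler/localai-github-action from the models.localai.io gallery. As a rough local equivalent (a sketch, assuming the local-ai CLI is installed and that its run command resolves gallery names the same way the action does):

```bash
# Sketch: fetch and serve the gallery model used by the notification workflows.
local-ai run gemma-3-12b-it
```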
.github/workflows/notify-releases.yaml (4 changes, vendored)

@@ -14,7 +14,7 @@ jobs:
 steps:
 - uses: mudler/localai-github-action@v1
 with:
-model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
 - name: Summarize
 id: summarize
 run: |
@@ -60,4 +60,4 @@ jobs:
 DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
 uses: Ilshidur/action-discord@master
 with:
 args: ${{ steps.summarize.outputs.message }}
.github/workflows/release.yaml (16 changes, vendored)
@ -36,6 +36,7 @@ jobs:
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
|
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
|
||||||
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
|
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
|
||||||
|
make install-go-tools
|
||||||
- name: Install CUDA Dependencies
|
- name: Install CUDA Dependencies
|
||||||
run: |
|
run: |
|
||||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
|
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
|
||||||
|
@ -123,7 +124,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -151,6 +152,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
|
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
|
||||||
|
make install-go-tools
|
||||||
- name: Intel Dependencies
|
- name: Intel Dependencies
|
||||||
run: |
|
run: |
|
||||||
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
||||||
|
@ -232,7 +234,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -253,8 +255,7 @@ jobs:
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc
|
brew install protobuf grpc
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
|
make install-go-tools
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: build
|
id: build
|
||||||
run: |
|
run: |
|
||||||
|
@ -275,7 +276,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -295,8 +296,7 @@ jobs:
|
||||||
- name: Dependencies
|
- name: Dependencies
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc libomp llvm
|
brew install protobuf grpc libomp llvm
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
make install-go-tools
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: build
|
id: build
|
||||||
run: |
|
run: |
|
||||||
|
@ -317,7 +317,7 @@ jobs:
|
||||||
release/*
|
release/*
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
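Several release.yaml jobs above replace their hand-written `go install` lines with a single `make install-go-tools` call. The target itself is not part of this diff; a minimal sketch of what it plausibly wraps, inferred from the `go install` commands it replaces here and from the matching lines that test.yml and the Dockerfile further down still run directly:

```bash
# Hypothetical equivalent of `make install-go-tools` (inferred, not the actual Makefile target).
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install github.com/GeertJohan/go.rice/rice@latest
```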
.github/workflows/secscan.yaml (2 changes, vendored)

@@ -18,7 +18,7 @@ jobs:
 if: ${{ github.actor != 'dependabot[bot]' }}
 - name: Run Gosec Security Scanner
 if: ${{ github.actor != 'dependabot[bot]' }}
-uses: securego/gosec@v2.22.0
+uses: securego/gosec@v2.22.4
 with:
 # we let the report trigger content trigger a failure using the GitHub Security features.
 args: '-no-fail -fmt sarif -out results.sarif ./...'
.github/workflows/test-extra.yml (20 changes, vendored)
@ -78,6 +78,26 @@ jobs:
|
||||||
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
make --jobs=5 --output-sync=target -C backend/python/diffusers
|
||||||
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
make --jobs=5 --output-sync=target -C backend/python/diffusers test
|
||||||
|
|
||||||
|
#tests-vllm:
|
||||||
|
# runs-on: ubuntu-latest
|
||||||
|
# steps:
|
||||||
|
# - name: Clone
|
||||||
|
# uses: actions/checkout@v4
|
||||||
|
# with:
|
||||||
|
# submodules: true
|
||||||
|
# - name: Dependencies
|
||||||
|
# run: |
|
||||||
|
# sudo apt-get update
|
||||||
|
# sudo apt-get install -y build-essential ffmpeg
|
||||||
|
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
|
||||||
|
# sudo apt-get install -y libopencv-dev
|
||||||
|
# # Install UV
|
||||||
|
# curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||||
|
# pip install --user --no-cache-dir grpcio-tools==1.64.1
|
||||||
|
# - name: Test vllm backend
|
||||||
|
# run: |
|
||||||
|
# make --jobs=5 --output-sync=target -C backend/python/vllm
|
||||||
|
# make --jobs=5 --output-sync=target -C backend/python/vllm test
|
||||||
# tests-transformers-musicgen:
|
# tests-transformers-musicgen:
|
||||||
# runs-on: ubuntu-latest
|
# runs-on: ubuntu-latest
|
||||||
# steps:
|
# steps:
|
||||||
|
|
.github/workflows/test.yml (11 changes, vendored)
@ -71,7 +71,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
||||||
sudo apt-get install -y libgmock-dev
|
sudo apt-get install -y libgmock-dev clang
|
||||||
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
|
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
|
||||||
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
|
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
|
||||||
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
|
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
|
||||||
|
@ -96,6 +96,7 @@ jobs:
|
||||||
|
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
|
|
||||||
# The python3-grpc-tools package in 22.04 is too old
|
# The python3-grpc-tools package in 22.04 is too old
|
||||||
pip install --user grpcio-tools
|
pip install --user grpcio-tools
|
||||||
|
@ -130,7 +131,7 @@ jobs:
|
||||||
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -183,6 +184,7 @@ jobs:
|
||||||
rm protoc.zip
|
rm protoc.zip
|
||||||
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
|
||||||
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
PATH="$PATH:$HOME/go/bin" make protogen-go
|
PATH="$PATH:$HOME/go/bin" make protogen-go
|
||||||
- name: Build images
|
- name: Build images
|
||||||
run: |
|
run: |
|
||||||
|
@ -194,7 +196,7 @@ jobs:
|
||||||
make run-e2e-aio
|
make run-e2e-aio
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
@ -222,6 +224,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
|
||||||
pip install --user --no-cache-dir grpcio-tools
|
pip install --user --no-cache-dir grpcio-tools
|
||||||
|
go install github.com/GeertJohan/go.rice/rice@latest
|
||||||
- name: Test
|
- name: Test
|
||||||
run: |
|
run: |
|
||||||
export C_INCLUDE_PATH=/usr/local/include
|
export C_INCLUDE_PATH=/usr/local/include
|
||||||
|
@ -232,7 +235,7 @@ jobs:
|
||||||
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
|
||||||
- name: Setup tmate session if tests fail
|
- name: Setup tmate session if tests fail
|
||||||
if: ${{ failure() }}
|
if: ${{ failure() }}
|
||||||
uses: mxschmitt/action-tmate@v3.19
|
uses: mxschmitt/action-tmate@v3.22
|
||||||
with:
|
with:
|
||||||
detached: true
|
detached: true
|
||||||
connect-timeout-seconds: 180
|
connect-timeout-seconds: 180
|
||||||
|
|
Dockerfile (18 changes)
@ -15,7 +15,7 @@ ARG TARGETARCH
|
||||||
ARG TARGETVARIANT
|
ARG TARGETVARIANT
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
|
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
 apt-get install -y --no-install-recommends \
@@ -24,6 +24,7 @@ RUN apt-get update && \
 ca-certificates \
 curl libssl-dev \
 git \
+git-lfs \
 unzip upx-ucl && \
 apt-get clean && \
 rm -rf /var/lib/apt/lists/*
@@ -45,9 +46,10 @@ EOT
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

-# Install grpc compilers
+# Install grpc compilers and rice
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
+go install github.com/GeertJohan/go.rice/rice@latest

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -299,10 +301,9 @@ COPY .git .
 RUN make prepare

 ## Build the binary
-## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-## (both will use CUDA or hipblas for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
+## Otherwise just run the normal build
+RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
 SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
 else \
 make build; \
@@ -430,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
 make -C backend/python/vllm \
 ; fi && \
-if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-make -C backend/python/autogptq \
-; fi && \
 if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
 make -C backend/python/bark \
 ; fi && \
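A quick sketch of how the build switch above is typically driven (BUILD_TYPE and IMAGE_TYPE are build arguments already used in this Dockerfile; the image tags and the buildx platform flag are illustrative assumptions):

```bash
# CUDA build on amd64: all llama-cpp CPU variants are kept
docker build --build-arg BUILD_TYPE=cublas --build-arg IMAGE_TYPE=core -t localai:cublas-dev .

# arm64 build: the RUN step above skips the avx/avx2/avx512 llama-cpp variants to save space
docker buildx build --platform linux/arm64 --build-arg BUILD_TYPE=cublas -t localai:cublas-arm64 .
```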
Makefile (102 changes)
@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
+CPPLLAMA_VERSION?=6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c

 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
+WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
+WHISPER_CPP_VERSION?=d1f114da61b1ae1e70b03104fad42c9dd666feeb

 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
@@ -21,8 +21,11 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0

 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
+STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae

+# ONEAPI variables for SYCL
+export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh

 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -30,8 +33,12 @@ ONNX_OS?=linux

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=
+export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
 export BACKEND_LIBS?=
+export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
+export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
+export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src

 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
@@ -81,6 +88,7 @@ endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
 CMAKE_ARGS+=-DGGML_NATIVE=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif

 # Detect if we are running on arm64
@@ -108,13 +116,31 @@ ifeq ($(OS),Darwin)
 # disable metal if on Darwin and any other value is explicitly passed.
 else ifneq ($(BUILD_TYPE),metal)
 CMAKE_ARGS+=-DGGML_METAL=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
 export GGML_NO_ACCELERATE=1
 export GGML_NO_METAL=1
+GO_LDFLAGS_WHISPER+=-lggml-blas
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif

 ifeq ($(BUILD_TYPE),metal)
-# -lcblas removed: it seems to always be listed as a duplicate flag.
 CGO_LDFLAGS += -framework Accelerate
+CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
+CMAKE_ARGS+=-DGGML_METAL=ON
+CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+CMAKE_ARGS+=-DGGML_OPENMP=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
+WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
+WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
+WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
+else
+CGO_LDFLAGS_WHISPER+=-lggml-blas
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 endif
 else
 CGO_LDFLAGS_WHISPER+=-lgomp
@@ -126,21 +152,29 @@ ifeq ($(BUILD_TYPE),openblas)
 endif

 ifeq ($(BUILD_TYPE),cublas)
-CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
 export GGML_CUDA=1
-CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
+CMAKE_ARGS+=-DGGML_CUDA=ON
+WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
+CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
 endif

 ifeq ($(BUILD_TYPE),vulkan)
 CMAKE_ARGS+=-DGGML_VULKAN=1
+WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
+CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
+export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
 endif

 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 export GGML_SYCL=1
+CMAKE_ARGS+=-DGGML_SYCL=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
 export GGML_SYCL_F16=1
+CMAKE_ARGS+=-DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -151,7 +185,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 export CC=$(ROCM_HOME)/llvm/bin/clang
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
-GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
+GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
 AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
@@ -260,11 +294,7 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 $(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a

 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
-$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
-ifneq ($(UPX),)
-$(UPX) backend-assets/grpc/stablediffusion-ggml
-endif
+$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml

 sources/onnxruntime:
 mkdir -p sources/onnxruntime
@@ -290,8 +320,9 @@ sources/whisper.cpp:
 git checkout $(WHISPER_CPP_VERSION) && \
 git submodule update --init --recursive --depth 1 --single-branch

-sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
+sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
-cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
+cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
+cd sources/whisper.cpp/build && cmake --build . --config Release

 get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp

@@ -341,8 +372,14 @@ clean-tests:
 clean-dc: clean
 cp -r /build/backend-assets /workspace/backend-assets

+## Install Go tools
+install-go-tools:
+go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+go install github.com/GeertJohan/go.rice/rice@latest

 ## Build:
-build: prepare backend-assets grpcs ## Build the project
+build: prepare backend-assets grpcs install-go-tools ## Build the project
 $(info ${GREEN}I local-ai build info:${RESET})
 $(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 $(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -352,7 +389,9 @@ ifneq ($(BACKEND_LIBS),)
 $(MAKE) backend-assets/lib
 cp -f $(BACKEND_LIBS) backend-assets/lib/
 endif
+rm -rf $(BINARY_NAME) || true
 CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
+rice append --exec $(BINARY_NAME)

 build-minimal:
 BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -424,6 +463,7 @@ prepare-test: grpcs
 cp -rf backend-assets core/http
 cp tests/models_fixtures/* test-models

+## Test targets
 test: prepare test-models/testmodel.ggml grpcs
 @echo 'Running tests'
 export GO_TAGS="tts debug"
@@ -498,7 +538,7 @@ protogen: protogen-go protogen-python
 protogen-clean: protogen-go-clean protogen-python-clean

 .PHONY: protogen-go
-protogen-go:
+protogen-go: install-go-tools
 mkdir -p pkg/grpc/proto
 protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
 backend/backend.proto
@@ -509,18 +549,10 @@ protogen-go-clean:
 $(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean

-.PHONY: autogptq-protogen
-autogptq-protogen:
-$(MAKE) -C backend/python/autogptq protogen

-.PHONY: autogptq-protogen-clean
-autogptq-protogen-clean:
-$(MAKE) -C backend/python/autogptq protogen-clean

 .PHONY: bark-protogen
 bark-protogen:
@@ -597,7 +629,6 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
-$(MAKE) -C backend/python/autogptq
 $(MAKE) -C backend/python/bark
 $(MAKE) -C backend/python/coqui
 $(MAKE) -C backend/python/diffusers
@@ -611,10 +642,12 @@ prepare-extra-conda-environments: protogen-python
 prepare-test-extra: protogen-python
 $(MAKE) -C backend/python/transformers
 $(MAKE) -C backend/python/diffusers
+$(MAKE) -C backend/python/vllm

 test-extra: prepare-test-extra
 $(MAKE) -C backend/python/transformers test
 $(MAKE) -C backend/python/diffusers test
+$(MAKE) -C backend/python/vllm test

 backend-assets:
 mkdir -p backend-assets
@@ -756,8 +789,8 @@ ifneq ($(UPX),)
 $(UPX) backend-assets/grpc/silero-vad
 endif

-backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
+backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
-CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
+CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
 $(UPX) backend-assets/grpc/whisper
@@ -809,7 +842,8 @@ docker-aio-all:

 docker-image-intel:
 docker build \
---build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+--progress plain \
+--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 --build-arg GO_TAGS="none" \
 --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -817,7 +851,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 docker build \
---build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
 --build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 --build-arg GO_TAGS="none" \
 --build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
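As orientation for the new targets above, a minimal local workflow sketch (assuming Go and protoc are already installed; the target names are the ones defined in this Makefile):

```bash
# Install the pinned protoc plugins and go.rice, then regenerate the Go gRPC stubs
make install-go-tools
make protogen-go

# `build` now depends on install-go-tools and embeds assets with `rice append`
make build
```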
README.md (172 changes)
@@ -1,7 +1,6 @@
 <h1 align="center">
 <br>
-<img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
+<img height="300" src="./core/http/static/logo.png"> <br>
-LocalAI
 <br>
 </h1>

@@ -31,7 +30,7 @@

 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
 </a>
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>

@@ -44,35 +43,154 @@

 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples)
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on
+[](https://t.me/localaiofficial_bot)

 [](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[](https://artifacthub.io/packages/search?repo=localai)

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-
+## 📚🆕 Local Stack Family

+🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:

+<table>
+<tr>
+<td width="50%" valign="top">
+<a href="https://github.com/mudler/LocalAGI">
+<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
+</a>
+</td>
+<td width="50%" valign="top">
+<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
+<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
+</td>
+</tr>
+<tr>
+<td width="50%" valign="top">
+<a href="https://github.com/mudler/LocalRecall">
+<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
+</a>
+</td>
+<td width="50%" valign="top">
+<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
+<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
+</td>
+</tr>
+</table>

+## Screenshots

+| Talk Interface | Generate Audio |
+| --- | --- |
+|  |  |

+| Models Overview | Generate Images |
+| --- | --- |
+|  |  |

+| Chat Interface | Home |
+| --- | --- |
+|  |  |

+| Login | Swarm |
+| --- | --- |
+| |  |

+## 💻 Quickstart

 Run the installer script:

 ```bash
+# Basic installation
 curl https://localai.io/install.sh | sh
 ```

-Or run with docker:
+For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
-```bash
-# CPU only image:
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu

-# Nvidia GPU:
+Or run with docker:

+### CPU only image:

+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+```

+### NVIDIA GPU Images:

+```bash
+# CUDA 12.0 with core features
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12

-# CPU and GPU image (bigger size):
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+# CUDA 12.0 with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12-extras

-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+# CUDA 11.7 with core features
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11

+# CUDA 11.7 with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11-extras

+# NVIDIA Jetson (L4T) ARM64
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
 ```

+### AMD GPU Images (ROCm):

+```bash
+# ROCm with core features
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas

+# ROCm with extra Python dependencies
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas-extras
+```

+### Intel GPU Images (oneAPI):

+```bash
+# Intel GPU with FP16 support
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16

+# Intel GPU with FP16 support and extra dependencies
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16-extras

+# Intel GPU with FP32 support
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32

+# Intel GPU with FP32 support and extra dependencies
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32-extras
+```

+### Vulkan GPU Images:

+```bash
+# Vulkan with core features
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
+```

+### AIO Images (pre-downloaded models):

+```bash
+# CPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

+# NVIDIA CUDA 12 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12

+# NVIDIA CUDA 11 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11

+# Intel GPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16

+# AMD GPU version
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
+```

+For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).

 To load models:

 ```bash
@@ -88,10 +206,13 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

-[💻 Getting started](https://localai.io/basics/getting_started/index.html)
+For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

+- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
+- Apr 2025: WebUI overhaul, AIO images updates
+- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -105,19 +226,6 @@ local-ai run oci://localai/phi-2:latest

 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-## 🔥🔥 Hot topics (looking for help):
-
-- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
-- Realtime API https://github.com/mudler/LocalAI/issues/3714
-- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
-- Backends v2: https://github.com/mudler/LocalAI/issues/1126
-- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
-- Assistant API: https://github.com/mudler/LocalAI/issues/1273
-- Vulkan: https://github.com/mudler/LocalAI/issues/1647
-- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
-
-If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

 ## 🚀 [Features](https://localai.io/features/)

 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -131,12 +239,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

-## 💻 Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

 ### 🔗 Community and integrations

@@ -212,7 +318,7 @@ A huge thank you to our generous sponsors who support this project covering CI e

 <p align="center">
 <a href="https://www.spectrocloud.com/" target="blank">
-<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
+<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
 </a>
 <a href="https://www.premai.io/" target="blank">
 <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
@@ -1,7 +1,7 @@
-name: text-embedding-ada-002
 embeddings: true
+name: text-embedding-ada-002
 parameters:
-model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
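The usage note in this config points at an OpenAI-style embeddings request; a hedged sketch of such a call (the exact snippet stored in the file is not shown in this diff, so the payload below is illustrative):

```bash
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
  "input": "Your text string goes here",
  "model": "text-embedding-ada-002"
}'
```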
@@ -1,101 +1,57 @@
-name: gpt-4
-mmap: true
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
+f16: true
-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
-replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+no_mixed_free_string: true
+schema_type: llama3.1 # or JSON is supported too (json)
+response_regex:
+- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
+mmap: true
+name: gpt-4
+parameters:
+model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- <|eot_id|>
+- <|end_of_text|>

 template:
 chat: |
-{{.Input -}}
-<|im_start|>assistant
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+{{.Input }}
+<|start_header_id|>assistant<|end_header_id|>
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-{{- if .FunctionCall }}
-<tool_call>
-{{- else if eq .RoleName "tool" }}
-<tool_response>
-{{- end }}
-{{- if .Content}}
-{{.Content }}
-{{- end }}
-{{- if .FunctionCall}}
-{{toJson .FunctionCall}}
-{{- end }}
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
+<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
+{{ if .FunctionCall -}}
+{{ else if eq .RoleName "tool" -}}
+The Function was executed and the response was:
+{{ end -}}
+{{ if .Content -}}
+{{.Content -}}
+{{ else if .FunctionCall -}}
+{{ range .FunctionCall }}
+[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
+{{ end }}
+{{ end -}}
+<|eot_id|>
 completion: |
 {{.Input}}
-function: |-
-<|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
-{{.Input -}}
-<|im_start|>assistant
+function: |
+<|start_header_id|>system<|end_header_id|>
+You are an expert in composing functions. You are given a question and a set of possible functions.
+Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
+If you decide to invoke any of the function(s), you MUST put it in the format as follows:
+[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
+You SHOULD NOT include any other text in the response.
+Here is a list of functions in JSON format that you can invoke.
+{{toJson .Functions}}
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+{{.Input}}
+<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+download_files:
+- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
+uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
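Since this profile wires the model for tool calling, a sketch of the kind of OpenAI-compatible request it is meant to serve (the get_current_weather function is a made-up example, not part of the config):

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "gpt-4",
  "messages": [{"role": "user", "content": "What is the weather like in Boston?"}],
  "tools": [{
    "type": "function",
    "function": {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {"location": {"type": "string"}},
        "required": ["location"]
      }
    }
  }]
}'
```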
@@ -1,31 +1,49 @@
-backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: bakllava-mmproj.gguf
 parameters:
-model: bakllava.gguf
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: bakllava.gguf
-uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
-- filename: bakllava-mmproj.gguf
-uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
@@ -1,7 +1,7 @@
+embeddings: true
 name: text-embedding-ada-002
-backend: sentencetransformers
 parameters:
-model: all-MiniLM-L6-v2
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
@@ -1,101 +1,53 @@
-name: gpt-4
-mmap: true
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
-context_size: 8192
+context_size: 4096
+f16: true

-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true
+capture_llm_results:
+- (?s)<Thought>(.*?)</Thought>

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
+properties_order: name,arguments
+json_regex_match:
+- (?s)<Output>(.*?)</Output>

-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
 replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
+- key: (?s)<Thought>(.*?)</Thought>
 value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+mmap: true
+name: gpt-4
+parameters:
+model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>

 template:
 chat: |
 {{.Input -}}
 <|im_start|>assistant
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-{{- if .FunctionCall }}
-<tool_call>
-{{- else if eq .RoleName "tool" }}
-<tool_response>
-{{- end }}
-{{- if .Content}}
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
 {{.Content }}
-{{- end }}
-{{- if .FunctionCall}}
+{{ end -}}
+{{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
-{{- end }}
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
+{{ end -}}<|im_end|>
 completion: |
 {{.Input}}
-function: |-
+function: |
 <|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
+You are an AI assistant that executes function calls, and these are the tools at your disposal:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
+<|im_end|>
 {{.Input -}}
 <|im_start|>assistant

+download_files:
+- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
+uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
@@ -1,35 +1,49 @@
-backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-temperature: 0.2
-top_k: 40
-top_p: 0.95
-seed: -1
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-- filename: llava-v1.6-7b-mmproj-f16.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
@@ -1,7 +1,7 @@
+embeddings: true
 name: text-embedding-ada-002
-backend: sentencetransformers
 parameters:
-model: all-MiniLM-L6-v2
+model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
 You can test this model with curl like this:
@ -1,103 +1,53 @@
|
||||||
name: gpt-4
|
context_size: 4096
|
||||||
mmap: false
|
+f16: true
-context_size: 8192

-f16: false
-parameters:
-model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

-stopwords:
-- "<|im_end|>"
-- "<dummy32000>"
-- "</tool_call>"
-- "<|eot_id|>"
-- "<|end_of_text|>"

 function:
-# disable injecting the "answer" tool
-disable_no_action: true
+capture_llm_results:
+- (?s)<Thought>(.*?)</Thought>

 grammar:
-# This allows the grammar to also return messages
-mixed_mode: true
-# Suffix to add to the grammar
-#prefix: '<tool_call>\n'
-# Force parallel calls in the grammar
-# parallel_calls: true
+properties_order: name,arguments
+json_regex_match:
+- (?s)<Output>(.*?)</Output>

-return_name_in_function_response: true
-# Without grammar uncomment the lines below
-# Warning: this is relying only on the capability of the
-# LLM model to generate the correct function call.
-json_regex_match:
-- "(?s)<tool_call>(.*?)</tool_call>"
-- "(?s)<tool_call>(.*?)"
 replace_llm_results:
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
+- key: (?s)<Thought>(.*?)</Thought>
 value: ""
-replace_function_results:
-# Replace everything that is not JSON array or object
-#
-- key: '(?s)^[^{\[]*'
-value: ""
-- key: '(?s)[^}\]]*$'
-value: ""
-- key: "'([^']*?)'"
-value: "_DQUOTE_${1}_DQUOTE_"
-- key: '\\"'
-value: "__TEMP_QUOTE__"
-- key: "\'"
-value: "'"
-- key: "_DQUOTE_"
-value: '"'
-- key: "__TEMP_QUOTE__"
-value: '"'
-# Drop the scratchpad content from responses
-- key: "(?s)<scratchpad>.*</scratchpad>"
-value: ""
+mmap: true
+name: gpt-4
+parameters:
+model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>

 template:
 chat: |
 {{.Input -}}
 <|im_start|>assistant
 chat_message: |
-<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+<|im_start|>{{ .RoleName }}
-{{- if .FunctionCall }}
+{{ if .FunctionCall -}}
-<tool_call>
+Function call:
-{{- else if eq .RoleName "tool" }}
+{{ else if eq .RoleName "tool" -}}
-<tool_response>
+Function response:
-{{- end }}
+{{ end -}}
-{{- if .Content}}
+{{ if .Content -}}
 {{.Content }}
-{{- end }}
+{{ end -}}
-{{- if .FunctionCall}}
+{{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
-{{- end }}
+{{ end -}}<|im_end|>
-{{- if .FunctionCall }}
-</tool_call>
-{{- else if eq .RoleName "tool" }}
-</tool_response>
-{{- end }}<|im_end|>
 completion: |
 {{.Input}}
-function: |-
+function: |
 <|im_start|>system
-You are a function calling AI model.
-Here are the available tools:
-<tools>
+You are an AI assistant that executes function calls, and these are the tools at your disposal:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
-</tools>
-You should call the tools provided to you sequentially
-Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-<scratchpad>
-{step-by-step reasoning and plan in bullet points}
-</scratchpad>
-For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
-<tool_call>
-{"arguments": <args-dict>, "name": <function-name>}
-</tool_call><|im_end|>
+<|im_end|>
 {{.Input -}}
 <|im_start|>assistant

+download_files:
+- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
+sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
+uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
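For reference, a request exercising the function-calling configuration above can be sent to the OpenAI-compatible chat endpoint. The sketch below is illustrative only: it assumes LocalAI is listening on localhost:8080, that the config is registered under the name gpt-4, and the get_weather tool is a made-up example rather than anything defined in this changeset.

```python
# Minimal sketch: call the function-calling model through the OpenAI-compatible
# chat endpoint. The "get_weather" tool is a hypothetical example.
import json
import requests

payload = {
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "What is the weather in Rome?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Return the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
}

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=120,
)
resp.raise_for_status()
# The reply should carry the tool call extracted via the json_regex_match /
# capture_llm_results rules configured above.
print(resp.json()["choices"][0]["message"])
```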
@@ -1,35 +1,50 @@
-backend: llama-cpp
 context_size: 4096
-mmap: false
-f16: false
+f16: true
+mmap: true
+mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o

-roles:
-user: "USER:"
-assistant: "ASSISTANT:"
-system: "SYSTEM:"

-mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-temperature: 0.2
-top_k: 40
-top_p: 0.95
-seed: -1
+model: minicpm-v-2_6-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
 template:
 chat: |
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+{{.Input -}}
+<|im_start|>assistant
+chat_message: |
+<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>
+completion: |
 {{.Input}}
-ASSISTANT:
+function: |
+<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant

 download_files:
-- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-- filename: llava-v1.6-7b-mmproj-f16.gguf
-uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-usage: |
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-"model": "gpt-4-vision-preview",
-"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
+- filename: minicpm-v-2_6-Q4_K_M.gguf
+sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-2_6-mmproj-f16.gguf
+uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
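The removed usage note above documented the raw curl call for the vision model; the same request can be issued from Python. A minimal sketch, assuming the OpenAI-compatible endpoint on localhost:8080 and the gpt-4o name used by this config:

```python
# Python equivalent of the removed curl usage example.
import requests

payload = {
    "model": "gpt-4o",
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in the image?"},
            {"type": "image_url", "image_url": {
                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
            }},
        ],
    }],
    "temperature": 0.9,
}

resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```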
assets.go (15 changed lines)
@@ -1,6 +1,15 @@
 package main

-import "embed"
+import (
+rice "github.com/GeertJohan/go.rice"
+)

-//go:embed backend-assets/*
-var backendAssets embed.FS
+var backendAssets *rice.Box
+func init() {
+var err error
+backendAssets, err = rice.FindBox("backend-assets")
+if err != nil {
+panic(err)
+}
+}
@@ -14,6 +14,7 @@ service Backend {
 rpc PredictStream(PredictOptions) returns (stream Reply) {}
 rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
 rpc GenerateImage(GenerateImageRequest) returns (Result) {}
+rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
 rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
 rpc TTS(TTSRequest) returns (Result) {}
 rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -165,7 +166,6 @@ message Reply {

 message GrammarTrigger {
 string word = 1;
-bool at_start = 2;
 }

 message ModelOptions {
@@ -191,11 +191,7 @@ message ModelOptions {
 int32 NGQA = 20;
 string ModelFile = 21;

-// AutoGPTQ
-string Device = 22;
-bool UseTriton = 23;
-string ModelBaseName = 24;
-bool UseFastTokenizer = 25;

 // Diffusers
 string PipelineType = 26;
@@ -229,6 +225,11 @@ message ModelOptions {
 int32 MaxModelLen = 54;
 int32 TensorParallelSize = 55;
 string LoadFormat = 58;
+bool DisableLogStatus = 66;
+string DType = 67;
+int32 LimitImagePerPrompt = 68;
+int32 LimitVideoPerPrompt = 69;
+int32 LimitAudioPerPrompt = 70;

 string MMProj = 41;

@@ -301,6 +302,19 @@ message GenerateImageRequest {
 int32 CLIPSkip = 11;
 }

+message GenerateVideoRequest {
+string prompt = 1;
+string start_image = 2; // Path or base64 encoded image for the start frame
+string end_image = 3; // Path or base64 encoded image for the end frame
+int32 width = 4;
+int32 height = 5;
+int32 num_frames = 6; // Number of frames to generate
+int32 fps = 7; // Frames per second
+int32 seed = 8;
+float cfg_scale = 9; // Classifier-free guidance scale
+string dst = 10; // Output path for the generated video
+}

 message TTSRequest {
 string text = 1;
 string model = 2;
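For the new GenerateVideo RPC added above, a client call could look like the sketch below. It assumes Python stubs generated from backend.proto as backend_pb2 / backend_pb2_grpc (the naming the Python backends in this tree already use), the conventional BackendStub class name produced by grpcio-tools, and a backend listening on localhost:50051.

```python
# Sketch of calling the new GenerateVideo RPC over gRPC.
import grpc
import backend_pb2
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = backend_pb2_grpc.BackendStub(channel)

request = backend_pb2.GenerateVideoRequest(
    prompt="a boat crossing a lake at sunset",
    width=512,
    height=512,
    num_frames=16,       # Number of frames to generate
    fps=8,               # Frames per second
    seed=42,
    cfg_scale=6.0,       # Classifier-free guidance scale
    dst="/tmp/out.mp4",  # Output path for the generated video
)

result = stub.GenerateVideo(request)
print(result.success, result.message)
```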
@@ -1,17 +1,17 @@

 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
-set(TARGET myclip)
+# set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
-install(TARGETS ${TARGET} LIBRARY)
+# install(TARGETS ${TARGET} LIBRARY)
-target_include_directories(myclip PUBLIC .)
+# target_include_directories(myclip PUBLIC .)
-target_include_directories(myclip PUBLIC ../..)
+# target_include_directories(myclip PUBLIC ../..)
-target_include_directories(myclip PUBLIC ../../common)
+# target_include_directories(myclip PUBLIC ../../common)
-target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
+# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+# target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
+# if (NOT MSVC)
-target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-endif()
+# endif()
 # END CLIP hack

|
@ -74,8 +74,12 @@ add_library(hw_grpc_proto
|
||||||
${hw_proto_srcs}
|
${hw_proto_srcs}
|
||||||
${hw_proto_hdrs} )
|
${hw_proto_hdrs} )
|
||||||
|
|
||||||
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
|
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
|
||||||
|
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||||
|
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||||
|
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||||
absl::flags_parse
|
absl::flags_parse
|
||||||
gRPC::${_REFLECTION}
|
gRPC::${_REFLECTION}
|
||||||
gRPC::${_GRPC_GRPCPP}
|
gRPC::${_GRPC_GRPCPP}
|
||||||
|
|
|
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,11 +36,18 @@ else ifeq ($(OS),Darwin)
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DCMAKE_CXX_FLAGS="-fsycl" \
+-DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DCMAKE_CXX_FLAGS="-fsycl"
 endif

 llama.cpp:
@@ -52,8 +59,8 @@ llama.cpp:
 	git checkout -b build $(LLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-llama.cpp/examples/grpc-server: llama.cpp
-	mkdir -p llama.cpp/examples/grpc-server
+llama.cpp/tools/grpc-server: llama.cpp
+	mkdir -p llama.cpp/tools/grpc-server
 	bash prepare.sh

 rebuild:
@@ -63,13 +70,13 @@ rebuild:

 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/examples/grpc-server
+	rm -rf llama.cpp/tools/grpc-server
 	rm -rf grpc-server

 clean: purge
 	rm -rf llama.cpp

-grpc-server: llama.cpp llama.cpp/examples/grpc-server
+grpc-server: llama.cpp llama.cpp/tools/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
@@ -77,4 +84,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
 	cp llama.cpp/build/bin/grpc-server .
File diff suppressed because it is too large.
backend/cpp/llama/json.hpp (vendored, 24596 changed lines)
File diff suppressed because it is too large.
@@ -1,7 +1,7 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
 index 3cd0d2fa..6c5e811a 100644
---- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
+--- a/tools/mtmd/clip.cpp
++++ b/tools/mtmd/clip.cpp
 @@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
 int* patches_data = (int*)malloc(ggml_nbytes(patches));
@@ -7,21 +7,46 @@ for patch in $(ls patches); do
 patch -d llama.cpp/ -p1 < patches/$patch
 done

-cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
-cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
-cp -rfv json.hpp llama.cpp/examples/grpc-server/
-cp -rfv utils.hpp llama.cpp/examples/grpc-server/
-if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
+set -e
+
+cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/common/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/httplib.h llama.cpp/tools/grpc-server/
+
+set +e
+if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
 echo "grpc-server already added"
 else
-echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
 fi
+set -e

-## XXX: In some versions of CMake clip wasn't being built before llama.
-## This is an hack for now, but it should be fixed in the future.
-cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
+# and remove the main function
+# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
+awk '
+/int[ \t]+main[ \t]*\(/ { # If the line starts the main function
+  in_main=1; # Set a flag
+  open_braces=0; # Track number of open braces
+}
+in_main {
+  open_braces += gsub(/\{/, "{"); # Count opening braces
+  open_braces -= gsub(/\}/, "}"); # Count closing braces
+  if (open_braces == 0) { # If all braces are closed
+    in_main=0; # End skipping
+  }
+  next; # Skip lines inside main
+}
+!in_main # Print lines not inside main
+' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
+
+# remove index.html.gz.hpp and loading.html.hpp includes
+if [[ "$OSTYPE" == "darwin"* ]]; then
+  # macOS
+  sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+else
+  # Linux and others
+  sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+fi
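The awk filter added above removes the upstream main() by counting braces. For readers less comfortable with awk, the same idea is sketched below in Python; this is an illustrative re-implementation, not part of the changeset, and it assumes main()'s braces are balanced and not hidden inside strings or comments.

```python
# Illustrative re-implementation of the awk filter: copy a C++ source file
# while skipping the body of main(), by counting braces.
import re
import sys

def strip_main(lines):
    in_main = False
    open_braces = 0
    for line in lines:
        if not in_main and re.search(r"int[ \t]+main[ \t]*\(", line):
            in_main = True
            open_braces = 0
        if in_main:
            open_braces += line.count("{")
            open_braces -= line.count("}")
            if open_braces == 0 and ("{" in line or "}" in line):
                in_main = False
            continue  # skip lines belonging to main()
        yield line

if __name__ == "__main__":
    src, dst = sys.argv[1], sys.argv[2]
    with open(src) as f, open(dst, "w") as out:
        out.writelines(strip_main(f))
```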
backend/cpp/llama/utils.hpp (vendored, 483 changed lines)
@ -1,483 +0,0 @@
|
||||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <set>
|
|
||||||
#include <mutex>
|
|
||||||
#include <condition_variable>
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
#include "json.hpp"
|
|
||||||
|
|
||||||
#include "../llava/clip.h"
|
|
||||||
|
|
||||||
using json = nlohmann::json;
|
|
||||||
|
|
||||||
extern bool server_verbose;
|
|
||||||
|
|
||||||
#ifndef SERVER_VERBOSE
|
|
||||||
#define SERVER_VERBOSE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
#define LOG_VERBOSE(MSG, ...)
|
|
||||||
#else
|
|
||||||
#define LOG_VERBOSE(MSG, ...) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
if (server_verbose) \
|
|
||||||
{ \
|
|
||||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
|
|
||||||
//
|
|
||||||
// parallel
|
|
||||||
//
|
|
||||||
|
|
||||||
enum server_state {
|
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
|
||||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
|
||||||
};
|
|
||||||
|
|
||||||
enum task_type {
|
|
||||||
TASK_TYPE_COMPLETION,
|
|
||||||
TASK_TYPE_CANCEL,
|
|
||||||
TASK_TYPE_NEXT_RESPONSE
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_server {
|
|
||||||
int id = -1; // to be filled by llama_server_queue
|
|
||||||
int target_id;
|
|
||||||
task_type type;
|
|
||||||
json data;
|
|
||||||
bool infill_mode = false;
|
|
||||||
bool embedding_mode = false;
|
|
||||||
int multitask_id = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_result {
|
|
||||||
int id;
|
|
||||||
int multitask_id = -1;
|
|
||||||
bool stop;
|
|
||||||
bool error;
|
|
||||||
json result_json;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct task_multi {
|
|
||||||
int id;
|
|
||||||
std::set<int> subtasks_remaining{};
|
|
||||||
std::vector<task_result> results{};
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO: can become bool if we can't find use of more states
|
|
||||||
enum slot_state
|
|
||||||
{
|
|
||||||
IDLE,
|
|
||||||
PROCESSING,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum slot_command
|
|
||||||
{
|
|
||||||
NONE,
|
|
||||||
LOAD_PROMPT,
|
|
||||||
RELEASE,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_params
|
|
||||||
{
|
|
||||||
bool stream = true;
|
|
||||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
|
||||||
|
|
||||||
uint32_t seed = -1; // RNG seed
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
|
||||||
|
|
||||||
json input_prefix;
|
|
||||||
json input_suffix;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_image
|
|
||||||
{
|
|
||||||
int32_t id;
|
|
||||||
|
|
||||||
bool request_encode_image = false;
|
|
||||||
float * image_embedding = nullptr;
|
|
||||||
int32_t image_tokens = 0;
|
|
||||||
|
|
||||||
clip_image_u8 * img_data;
|
|
||||||
|
|
||||||
std::string prefix_prompt; // before of this image
|
|
||||||
};
|
|
||||||
|
|
||||||
// completion token output with probabilities
|
|
||||||
struct completion_token_output
|
|
||||||
{
|
|
||||||
struct token_prob
|
|
||||||
{
|
|
||||||
llama_token tok;
|
|
||||||
float prob;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<token_prob> probs;
|
|
||||||
llama_token tok;
|
|
||||||
std::string text_to_send;
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline void server_log(const char *level, const char *function, int line,
|
|
||||||
const char *message, const nlohmann::ordered_json &extra)
|
|
||||||
{
|
|
||||||
nlohmann::ordered_json log
|
|
||||||
{
|
|
||||||
{"timestamp", time(nullptr)},
|
|
||||||
{"level", level},
|
|
||||||
{"function", function},
|
|
||||||
{"line", line},
|
|
||||||
{"message", message},
|
|
||||||
};
|
|
||||||
|
|
||||||
if (!extra.empty())
|
|
||||||
{
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
|
||||||
printf("%.*s\n", (int)str.size(), str.data());
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// server utils
|
|
||||||
//
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
|
||||||
{
|
|
||||||
// Fallback null to default value
|
|
||||||
return body.contains(key) && !body.at(key).is_null()
|
|
||||||
? body.value(key, default_value)
|
|
||||||
: default_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::string format_chatml(std::vector<json> messages)
|
|
||||||
{
|
|
||||||
std::ostringstream chatml_msgs;
|
|
||||||
|
|
||||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
|
||||||
chatml_msgs << "<|im_start|>"
|
|
||||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
|
||||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
|
||||||
<< "<|im_end|>\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
|
||||||
|
|
||||||
return chatml_msgs.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// work queue utils
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_server_queue {
|
|
||||||
int id = 0;
|
|
||||||
std::mutex mutex_tasks;
|
|
||||||
// queues
|
|
||||||
std::vector<task_server> queue_tasks;
|
|
||||||
std::vector<task_server> queue_tasks_deferred;
|
|
||||||
std::vector<task_multi> queue_multitasks;
|
|
||||||
std::condition_variable condition_tasks;
|
|
||||||
// callback functions
|
|
||||||
std::function<void(task_server&)> callback_new_task;
|
|
||||||
std::function<void(task_multi&)> callback_finish_multitask;
|
|
||||||
std::function<void(void)> callback_all_task_finished;
|
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
|
||||||
int post(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (task.id == -1) {
|
|
||||||
task.id = id++;
|
|
||||||
}
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
condition_tasks.notify_one();
|
|
||||||
return task.id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a new task, but defer until one slot is available
|
|
||||||
void defer(task_server task) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
queue_tasks_deferred.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the next id for creating anew task
|
|
||||||
int get_new_id() {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
return id++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a new task
|
|
||||||
void on_new_task(std::function<void(task_server&)> callback) {
|
|
||||||
callback_new_task = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register function to process a multitask
|
|
||||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
|
||||||
callback_finish_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to be called when the batch of tasks is finished
|
|
||||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
|
||||||
callback_all_task_finished = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Call when the state of one slot is changed
|
|
||||||
void notify_slot_changed() {
|
|
||||||
// move deferred tasks back to main loop
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto & task : queue_tasks_deferred) {
|
|
||||||
queue_tasks.push_back(std::move(task));
|
|
||||||
}
|
|
||||||
queue_tasks_deferred.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start the main loop. This call is blocking
|
|
||||||
[[noreturn]]
|
|
||||||
void start_loop() {
|
|
||||||
while (true) {
|
|
||||||
// new task arrived
|
|
||||||
LOG_VERBOSE("have new task", {});
|
|
||||||
{
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
lock.unlock();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
task_server task = queue_tasks.front();
|
|
||||||
queue_tasks.erase(queue_tasks.begin());
|
|
||||||
lock.unlock();
|
|
||||||
LOG_VERBOSE("callback_new_task", {});
|
|
||||||
callback_new_task(task);
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("callback_all_task_finished", {});
|
|
||||||
// process and update all the multitasks
|
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
|
||||||
while (queue_iterator != queue_multitasks.end())
|
|
||||||
{
|
|
||||||
if (queue_iterator->subtasks_remaining.empty())
|
|
||||||
{
|
|
||||||
// all subtasks done == multitask is done
|
|
||||||
task_multi current_multitask = *queue_iterator;
|
|
||||||
callback_finish_multitask(current_multitask);
|
|
||||||
// remove this multitask
|
|
||||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++queue_iterator;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// all tasks in the current loop is finished
|
|
||||||
callback_all_task_finished();
|
|
||||||
}
|
|
||||||
LOG_VERBOSE("wait for new task", {});
|
|
||||||
// wait for new task
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
|
||||||
if (queue_tasks.empty()) {
|
|
||||||
condition_tasks.wait(lock, [&]{
|
|
||||||
return !queue_tasks.empty();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
|
||||||
// functions to manage multitasks
|
|
||||||
//
|
|
||||||
|
|
||||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
|
||||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
task_multi multi;
|
|
||||||
multi.id = multitask_id;
|
|
||||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
|
||||||
queue_multitasks.push_back(multi);
|
|
||||||
}
|
|
||||||
|
|
||||||
// updatethe remaining subtasks, while appending results to multitask
|
|
||||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
|
||||||
{
|
|
||||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
|
||||||
for (auto& multitask : queue_multitasks)
|
|
||||||
{
|
|
||||||
if (multitask.id == multitask_id)
|
|
||||||
{
|
|
||||||
multitask.subtasks_remaining.erase(subtask_id);
|
|
||||||
multitask.results.push_back(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_server_response {
|
|
||||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
|
||||||
callback_multitask_t callback_update_multitask;
|
|
||||||
// for keeping track of all tasks waiting for the result
|
|
||||||
std::set<int> waiting_task_ids;
|
|
||||||
// the main result queue
|
|
||||||
std::vector<task_result> queue_results;
|
|
||||||
std::mutex mutex_results;
|
|
||||||
std::condition_variable condition_results;
|
|
||||||
|
|
||||||
void add_waiting_task_id(int task_id) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.insert(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_waiting_task_id(int task_id) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
waiting_task_ids.erase(task_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This function blocks the thread until there is a response for this task_id
|
|
||||||
task_result recv(int task_id) {
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
condition_results.wait(lock, [&]{
|
|
||||||
return !queue_results.empty();
|
|
||||||
});
|
|
||||||
LOG_VERBOSE("condition_results unblock", {});
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
|
||||||
{
|
|
||||||
if (queue_results[i].id == task_id)
|
|
||||||
{
|
|
||||||
assert(queue_results[i].multitask_id == -1);
|
|
||||||
task_result res = queue_results[i];
|
|
||||||
queue_results.erase(queue_results.begin() + i);
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// should never reach here
|
|
||||||
}
|
|
||||||
|
|
||||||
// Register the function to update multitask
|
|
||||||
void on_multitask_update(callback_multitask_t callback) {
|
|
||||||
callback_update_multitask = callback;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Send a new result to a waiting task_id
|
|
||||||
void send(task_result result) {
|
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
|
||||||
LOG_VERBOSE("send new result", {});
|
|
||||||
for (auto& task_id : waiting_task_ids) {
|
|
||||||
// LOG_TEE("waiting task id %i \n", task_id);
|
|
||||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
|
||||||
if (result.multitask_id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("callback_update_multitask", {});
|
|
||||||
callback_update_multitask(task_id, result.id, result);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.id == task_id)
|
|
||||||
{
|
|
||||||
LOG_VERBOSE("queue_results.push_back", {});
|
|
||||||
queue_results.push_back(result);
|
|
||||||
condition_results.notify_one();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
//
|
|
||||||
// base64 utils (TODO: move to common in the future)
|
|
||||||
//
|
|
||||||
|
|
||||||
static const std::string base64_chars =
|
|
||||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
||||||
"abcdefghijklmnopqrstuvwxyz"
|
|
||||||
"0123456789+/";
|
|
||||||
|
|
||||||
static inline bool is_base64(uint8_t c)
|
|
||||||
{
|
|
||||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
|
||||||
{
|
|
||||||
int i = 0;
|
|
||||||
int j = 0;
|
|
||||||
int in_ = 0;
|
|
||||||
|
|
||||||
int in_len = encoded_string.size();
|
|
||||||
|
|
||||||
uint8_t char_array_4[4];
|
|
||||||
uint8_t char_array_3[3];
|
|
||||||
|
|
||||||
std::vector<uint8_t> ret;
|
|
||||||
|
|
||||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
|
||||||
{
|
|
||||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
|
||||||
if (i == 4)
|
|
||||||
{
|
|
||||||
for (i = 0; i <4; i++)
|
|
||||||
{
|
|
||||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (i = 0; (i < 3); i++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[i]);
|
|
||||||
}
|
|
||||||
i = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i)
|
|
||||||
{
|
|
||||||
for (j = i; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (j = 0; j <4; j++)
|
|
||||||
{
|
|
||||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
|
||||||
}
|
|
||||||
|
|
||||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
|
||||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
|
||||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
|
||||||
|
|
||||||
for (j = 0; (j < i - 1); j++)
|
|
||||||
{
|
|
||||||
ret.push_back(char_array_3[j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
|
@@ -48,7 +48,7 @@ int tts(char *text,int threads, char *dst ) {

 // generate audio
 if (!bark_generate_audio(c, text, threads)) {
-fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
+fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
 return 1;
 }

@@ -8,12 +8,19 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

+GOCMD?=go
+CGO_LDFLAGS?=
+# Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
+CGO_LDFLAGS_SYCL=
+GO_TAGS?=
+LD_FLAGS?=

 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-CMAKE_ARGS+=-DGGML_CUDA=ON
+CMAKE_ARGS+=-DSD_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
@@ -21,31 +28,50 @@ else ifeq ($(BUILD_TYPE),openblas)
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-CMAKE_ARGS+=-DGGML_HIP=ON
+CMAKE_ARGS+=-DSD_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 ifneq ($(BUILD_TYPE),metal)
-CMAKE_ARGS+=-DGGML_METAL=OFF
+CMAKE_ARGS+=-DSD_METAL=OFF
 else
-CMAKE_ARGS+=-DGGML_METAL=ON
+CMAKE_ARGS+=-DSD_METAL=ON
 CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 TARGET+=--target ggml-metal
 endif
 endif

-# ifeq ($(BUILD_TYPE),sycl_f16)
-# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f16)
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DSD_SYCL=ON \
+-DGGML_SYCL_F16=ON
+CC=icx
+CXX=icpx
+CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

-# ifeq ($(BUILD_TYPE),sycl_f32)
-# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-# endif
+ifeq ($(BUILD_TYPE),sycl_f32)
+CMAKE_ARGS+=-DGGML_SYCL=ON \
+-DCMAKE_C_COMPILER=icx \
+-DCMAKE_CXX_COMPILER=icpx \
+-DSD_SYCL=ON
+CC=icx
+CXX=icpx
+CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
+CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
+CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
+CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+endif

 # warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -86,11 +112,24 @@ endif
 	$(MAKE) $(COMBINED_LIB)

 gosd.o:
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	+bash -c "source $(ONEAPI_VARS); \
+	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
+else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
+endif

 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

+stablediffusion-ggml:
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
+	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
+ifneq ($(UPX),)
+	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
+endif

 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
@@ -35,6 +35,8 @@ const char* sample_method_str[] = {
 "ipndm",
 "ipndm_v",
 "lcm",
+"ddim_trailing",
+"tcd",
 };

 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -173,6 +175,7 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
 -1, //clip_skip
 cfg_scale, // sfg_scale
 3.5f,
+0, // eta
 width,
 height,
 sample_method,
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
 context.SetTranslate(true)
 }

-if err := context.Process(data, nil, nil); err != nil {
+if err := context.Process(data, nil, nil, nil); err != nil {
 return pb.TranscriptResult{}, err
 }

@@ -1,17 +0,0 @@
-.PHONY: autogptq
-autogptq: protogen
-	bash install.sh
-
-.PHONY: protogen
-protogen: backend_pb2_grpc.py backend_pb2.py
-
-.PHONY: protogen-clean
-protogen-clean:
-	$(RM) backend_pb2_grpc.py backend_pb2.py
-
-backend_pb2_grpc.py backend_pb2.py:
-	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
-
-.PHONY: clean
-clean: protogen-clean
-	rm -rf venv __pycache__
@@ -1,5 +0,0 @@
-# Creating a separate environment for the autogptq project
-
-```
-make autogptq
-```
@ -1,153 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
from concurrent import futures
|
|
||||||
import argparse
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import base64
|
|
||||||
|
|
||||||
import grpc
|
|
||||||
import backend_pb2
|
|
||||||
import backend_pb2_grpc
|
|
||||||
|
|
||||||
from auto_gptq import AutoGPTQForCausalLM
|
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
||||||
from transformers import TextGenerationPipeline
|
|
||||||
|
|
||||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
|
||||||
|
|
||||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
|
||||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
|
||||||
|
|
||||||
# Implement the BackendServicer class with the service methods
|
|
||||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|
||||||
def Health(self, request, context):
|
|
||||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
|
||||||
def LoadModel(self, request, context):
|
|
||||||
try:
|
|
||||||
device = "cuda:0"
|
|
||||||
if request.Device != "":
|
|
||||||
device = request.Device
|
|
||||||
|
|
||||||
# support loading local model files
|
|
||||||
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
|
|
||||||
|
|
||||||
# support model `Qwen/Qwen-VL-Chat-Int4`
|
|
||||||
if "qwen-vl" in request.Model.lower():
|
|
||||||
self.model_name = "Qwen-VL-Chat"
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(model_path,
|
|
||||||
trust_remote_code=request.TrustRemoteCode,
|
|
||||||
device_map="auto").eval()
|
|
||||||
else:
|
|
||||||
model = AutoGPTQForCausalLM.from_quantized(model_path,
|
|
||||||
model_basename=request.ModelBaseName,
|
|
||||||
use_safetensors=True,
|
|
||||||
trust_remote_code=request.TrustRemoteCode,
|
|
||||||
device=device,
|
|
||||||
use_triton=request.UseTriton,
|
|
||||||
quantize_config=None)
|
|
||||||
|
|
||||||
self.model = model
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
except Exception as err:
|
|
||||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
|
||||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
|
||||||
|
|
||||||
def Predict(self, request, context):
|
|
||||||
penalty = 1.0
|
|
||||||
if request.Penalty != 0.0:
|
|
||||||
penalty = request.Penalty
|
|
||||||
tokens = 512
|
|
||||||
if request.Tokens != 0:
|
|
||||||
tokens = request.Tokens
|
|
||||||
top_p = 0.95
|
|
||||||
if request.TopP != 0.0:
|
|
||||||
top_p = request.TopP
|
|
||||||
|
|
||||||
|
|
||||||
prompt_images = self.recompile_vl_prompt(request)
|
|
||||||
compiled_prompt = prompt_images[0]
|
|
||||||
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
|
|
||||||
|
|
||||||
# Implement Predict RPC
|
|
||||||
pipeline = TextGenerationPipeline(
|
|
||||||
model=self.model,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
max_new_tokens=tokens,
|
|
||||||
temperature=request.Temperature,
|
|
||||||
top_p=top_p,
|
|
||||||
repetition_penalty=penalty,
|
|
||||||
)
|
|
||||||
t = pipeline(compiled_prompt)[0]["generated_text"]
|
|
||||||
print(f"generated_text: {t}", file=sys.stderr)
|
|
||||||
|
|
||||||
if compiled_prompt in t:
|
|
||||||
t = t.replace(compiled_prompt, "")
|
|
||||||
# house keeping. Remove the image files from /tmp folder
|
|
||||||
for img_path in prompt_images[1]:
|
|
||||||
try:
|
|
||||||
os.remove(img_path)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
|
|
||||||
|
|
||||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
|
||||||
|
|
||||||
def PredictStream(self, request, context):
|
|
||||||
# Implement PredictStream RPC
|
|
||||||
#for reply in some_data_generator():
|
|
||||||
# yield reply
|
|
||||||
# Not implemented yet
|
|
||||||
return self.Predict(request, context)
|
|
||||||
|
|
||||||
def recompile_vl_prompt(self, request):
|
|
||||||
prompt = request.Prompt
|
|
||||||
image_paths = []
|
|
||||||
|
|
||||||
if "qwen-vl" in self.model_name.lower():
|
|
||||||
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
|
|
||||||
# Then, save the image file paths to an array "image_paths".
|
|
||||||
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
|
|
||||||
for i, img in enumerate(request.Images):
|
|
||||||
timestamp = str(int(time.time() * 1000)) # Generate timestamp
|
|
||||||
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
|
|
||||||
with open(img_path, "wb") as f:
|
|
||||||
f.write(base64.b64decode(img))
|
|
||||||
image_paths.append(img_path)
|
|
||||||
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
|
|
||||||
else:
|
|
||||||
prompt = request.Prompt
|
|
||||||
return (prompt, image_paths)
|
|
||||||
|
|
||||||
def serve(address):
|
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
|
||||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
|
||||||
server.add_insecure_port(address)
|
|
||||||
server.start()
|
|
||||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
|
||||||
|
|
||||||
# Define the signal handler function
|
|
||||||
def signal_handler(sig, frame):
|
|
||||||
print("Received termination signal. Shutting down...")
|
|
||||||
server.stop(0)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Set the signal handlers for SIGINT and SIGTERM
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
signal.signal(signal.SIGTERM, signal_handler)
|
|
||||||
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
server.stop(0)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
serve(args.addr)
|
|
|
@ -1,14 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
|
|
||||||
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
|
|
||||||
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
|
|
||||||
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
|
|
||||||
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
|
|
||||||
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
|
|
||||||
fi
|
|
||||||
|
|
||||||
installRequirements
|
|
|
@ -1,2 +0,0 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
|
||||||
torch==2.4.1+cu118
|
|
|
@ -1 +0,0 @@
|
||||||
torch==2.4.1
|
|
|
@ -1,2 +0,0 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/rocm6.0
|
|
||||||
torch==2.4.1+rocm6.0
|
|
|
@ -1,6 +0,0 @@
|
||||||
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
|
|
||||||
intel-extension-for-pytorch==2.3.110+xpu
|
|
||||||
torch==2.3.1+cxx11.abi
|
|
||||||
oneccl_bind_pt==2.3.100+xpu
|
|
||||||
optimum[openvino]
|
|
||||||
setuptools
|
|
|
@ -1,6 +0,0 @@
|
||||||
accelerate
|
|
||||||
auto-gptq==0.7.1
|
|
||||||
grpcio==1.70.0
|
|
||||||
protobuf
|
|
||||||
certifi
|
|
||||||
transformers
|
|
|
@ -1,4 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
startBackend $@
|
|
|
@ -1,6 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
runUnittests
|
|
|
@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
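The server side above raises the gRPC message caps to 50 MB; by default gRPC clients cap received messages at 4 MB, so callers that push large payloads (for example base64-encoded images) may want matching channel options. A minimal sketch, assuming the backend_pb2_grpc stubs used throughout this tree and a backend on localhost:50051:

```python
# Sketch: open a client channel sized to match the 50MB server-side limits.
import grpc
import backend_pb2_grpc

MAX_MSG = 50 * 1024 * 1024  # 50MB, mirroring the server options above

channel = grpc.insecure_channel(
    "localhost:50051",
    options=[
        ("grpc.max_send_message_length", MAX_MSG),
        ("grpc.max_receive_message_length", MAX_MSG),
    ],
)
stub = backend_pb2_grpc.BackendStub(channel)
```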
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 grpcio-tools
@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()
@@ -1,4 +1,4 @@
-transformers
+transformers==4.48.3
 accelerate
 torch==2.4.1
 coqui-tts
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
 torchaudio==2.4.1+cu118
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,5 +1,5 @@
 torch==2.4.1
 torchaudio==2.4.1
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
 torchaudio==2.4.1+rocm6.0
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
-transformers
+transformers==4.48.3
 accelerate
 coqui-tts
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 packaging==24.1
@@ -19,7 +19,7 @@ import grpc

 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
     EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType

@@ -168,9 +168,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             # We are storing all the options in a dict so we can use it later when
             # generating the images
             for opt in options:
+                if ":" not in opt:
+                    continue
                 key, value = opt.split(":")
                 self.options[key] = value

+            print(f"Options: {self.options}", file=sys.stderr)
+
             local = False
             modelFile = request.Model
@@ -287,6 +291,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

             if request.LowVRAM:
                 self.pipe.enable_model_cpu_offload()
+        elif request.PipelineType == "Lumina2Text2ImgPipeline":
+            self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
+                request.Model,
+                torch_dtype=torch.bfloat16)
+            if request.LowVRAM:
+                self.pipe.enable_model_cpu_offload()
         elif request.PipelineType == "SanaPipeline":
             self.pipe = SanaPipeline.from_pretrained(
                 request.Model,
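Note: Lumina2Text2ImgPipeline is wired in above with the same from_pretrained/LowVRAM pattern as the other diffusers pipelines. A hedged usage sketch follows; the checkpoint id and prompt are illustrative assumptions, and it presumes a diffusers release that ships this pipeline class plus a GPU for the CPU-offload path.

import torch
from diffusers import Lumina2Text2ImgPipeline

# Hypothetical checkpoint id; substitute whatever Lumina 2 model you actually use.
pipe = Lumina2Text2ImgPipeline.from_pretrained(
    "your-org/lumina-image-2.0", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # same low-VRAM path the backend enables above

image = pipe(prompt="a watercolor lighthouse at dawn").images[0]
image.save("lumina2_sample.png")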
@@ -516,7 +526,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.70.0
+grpcio==1.72.0
 pillow
 protobuf
 certifi

@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):


 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 wheel

@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.TranscriptResult(segments=resultSegments, text=text)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 grpcio-tools

@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.Result(success=True)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 phonemizer
 scipy

@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         return backend_pb2.RerankResult(usage=usage, results=results)

 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                         options=[
+                             ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                             ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                         ])
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     server.add_insecure_port(address)
     server.start()

@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi

@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 setuptools
@@ -109,6 +109,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }

         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
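Note: the limit_mm_per_prompt block above only takes effect when at least one limit is set, and each unset modality falls back to 1, which the in-diff comment attributes to the vLLM default. A tiny illustrative sketch of just that defaulting rule (the values are made up):

# Illustrative only: unset (0) per-modality limits fall back to 1.
def mm_limits(image=0, video=0, audio=0):
    return {
        "image": max(image, 1),
        "video": max(video, 1),
        "audio": max(audio, 1),
    }

print(mm_limits(image=4))  # {'image': 4, 'video': 1, 'audio': 1}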
@@ -183,27 +194,40 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             await iterations.aclose()

     async def _predict(self, request, context, streaming=False):
+        # Build the sampling parameters
+        # NOTE: this must stay in sync with the vllm backend
+        request_to_sampling_params = {
+            "N": "n",
+            "PresencePenalty": "presence_penalty",
+            "FrequencyPenalty": "frequency_penalty",
+            "RepetitionPenalty": "repetition_penalty",
+            "Temperature": "temperature",
+            "TopP": "top_p",
+            "TopK": "top_k",
+            "MinP": "min_p",
+            "Seed": "seed",
+            "StopPrompts": "stop",
+            "StopTokenIds": "stop_token_ids",
+            "BadWords": "bad_words",
+            "IncludeStopStrInOutput": "include_stop_str_in_output",
+            "IgnoreEOS": "ignore_eos",
+            "Tokens": "max_tokens",
+            "MinTokens": "min_tokens",
+            "Logprobs": "logprobs",
+            "PromptLogprobs": "prompt_logprobs",
+            "SkipSpecialTokens": "skip_special_tokens",
+            "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
+            "TruncatePromptTokens": "truncate_prompt_tokens",
+            "GuidedDecoding": "guided_decoding",
+        }
+
-        # Build sampling parameters
         sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
-        if request.TopP != 0:
-            sampling_params.top_p = request.TopP
-        if request.Tokens > 0:
-            sampling_params.max_tokens = request.Tokens
-        if request.Temperature != 0:
-            sampling_params.temperature = request.Temperature
-        if request.TopK != 0:
-            sampling_params.top_k = request.TopK
-        if request.PresencePenalty != 0:
-            sampling_params.presence_penalty = request.PresencePenalty
-        if request.FrequencyPenalty != 0:
-            sampling_params.frequency_penalty = request.FrequencyPenalty
-        if request.StopPrompts:
-            sampling_params.stop = request.StopPrompts
-        if request.IgnoreEOS:
-            sampling_params.ignore_eos = request.IgnoreEOS
-        if request.Seed != 0:
-            sampling_params.seed = request.Seed
+        for request_field, param_field in request_to_sampling_params.items():
+            if hasattr(request, request_field):
+                value = getattr(request, request_field)
+                if value not in (None, 0, [], False):
+                    setattr(sampling_params, param_field, value)

         # Extract image paths and process images
         prompt = request.Prompt
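Note: the refactor above swaps a long chain of per-field if statements for a declarative field map applied with getattr/setattr, skipping zero-valued (unset) protobuf fields. A self-contained sketch of that pattern, with SimpleNamespace standing in for both the protobuf request and vLLM's SamplingParams (neither is constructed here):

# Sketch of the generic request -> sampling-params mapping used above.
from types import SimpleNamespace

request_to_sampling_params = {
    "TopP": "top_p",
    "TopK": "top_k",
    "Temperature": "temperature",
    "Tokens": "max_tokens",
}

request = SimpleNamespace(TopP=0.8, TopK=0, Temperature=0.7, Tokens=50)
sampling_params = SimpleNamespace(top_p=0.9, max_tokens=200)

for request_field, param_field in request_to_sampling_params.items():
    if hasattr(request, request_field):
        value = getattr(request, request_field)
        if value not in (None, 0, [], False):  # unset proto fields arrive as zero values
            setattr(sampling_params, param_field, value)

# top_p and max_tokens are overridden, temperature is added, TopK=0 is ignored.
print(sampling_params)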
@@ -269,7 +293,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_image(self, image_path: str):
         """
         Load an image from the given file path or base64 encoded data.

         Args:
             image_path (str): The path to the image file or base64 encoded data.

@@ -288,7 +312,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     def load_video(self, video_path: str):
         """
         Load a video from the given file path.

         Args:
             video_path (str): The path to the image file.

@@ -309,7 +333,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

 async def serve(address):
     # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+                             options=[
+                                 ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
+                                 ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
+                             ])
     # Add the servicer to the server
     backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
     # Bind the server to the address

@@ -335,4 +364,4 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

     asyncio.run(serve(args.addr))

@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.72.0
 protobuf
 certifi
 setuptools
@@ -75,6 +75,53 @@ class TestBackendServicer(unittest.TestCase):
         finally:
             self.tearDown()

+    def test_sampling_params(self):
+        """
+        This method tests if all sampling parameters are correctly processed
+        NOTE: this does NOT test for correctness, just that we received a compatible response
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
+                self.assertTrue(response.success)
+
+                req = backend_pb2.PredictOptions(
+                    Prompt="The capital of France is",
+                    TopP=0.8,
+                    Tokens=50,
+                    Temperature=0.7,
+                    TopK=40,
+                    PresencePenalty=0.1,
+                    FrequencyPenalty=0.2,
+                    RepetitionPenalty=1.1,
+                    MinP=0.05,
+                    Seed=42,
+                    StopPrompts=["\n"],
+                    StopTokenIds=[50256],
+                    BadWords=["badword"],
+                    IncludeStopStrInOutput=True,
+                    IgnoreEOS=True,
+                    MinTokens=5,
+                    Logprobs=5,
+                    PromptLogprobs=5,
+                    SkipSpecialTokens=True,
+                    SpacesBetweenSpecialTokens=True,
+                    TruncatePromptTokens=10,
+                    GuidedDecoding=True,
+                    N=2,
+                )
+                resp = stub.Predict(req)
+                self.assertIsNotNone(resp.message)
+                self.assertIsNotNone(resp.logprobs)
+        except Exception as err:
+            print(err)
+            self.fail("sampling params service failed")
+        finally:
+            self.tearDown()
+
     def test_embedding(self):
         """
         This method tests if the embeddings are generated successfully
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
     return &Application{
         backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-        modelLoader:        model.NewModelLoader(appConfig.ModelPath),
+        modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
         applicationConfig:  appConfig,
         templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
     }

@@ -43,18 +43,12 @@ func New(opts ...config.AppOption) (*Application, error) {
     if err != nil {
         return nil, fmt.Errorf("unable to create ModelPath: %q", err)
     }
-    if options.ImageDir != "" {
-        err := os.MkdirAll(options.ImageDir, 0750)
+    if options.GeneratedContentDir != "" {
+        err := os.MkdirAll(options.GeneratedContentDir, 0750)
         if err != nil {
             return nil, fmt.Errorf("unable to create ImageDir: %q", err)
         }
     }
-    if options.AudioDir != "" {
-        err := os.MkdirAll(options.AudioDir, 0750)
-        if err != nil {
-            return nil, fmt.Errorf("unable to create AudioDir: %q", err)
-        }
-    }
     if options.UploadDir != "" {
         err := os.MkdirAll(options.UploadDir, 0750)
         if err != nil {

@@ -143,7 +137,7 @@ func New(opts ...config.AppOption) (*Application, error) {
         }()
     }

-    if options.LoadToMemory != nil {
+    if options.LoadToMemory != nil && !options.SingleBackend {
         for _, m := range options.LoadToMemory {
             cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
             if err != nil {
@@ -17,6 +17,7 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     var fn func() ([]float32, error)
     switch model := inferenceModel.(type) {

@@ -16,6 +16,7 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     fn := func() error {
         _, err := inferenceModel.GenerateImage(

@@ -53,6 +53,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     var protoMessages []*proto.Message
     // if we are using the tokenizer template, we need to convert the messages to proto messages

@@ -116,6 +117,11 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
         }

         if tokenCallback != nil {
+
+            if c.TemplateConfig.ReplyPrefix != "" {
+                tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
+            }
+
             ss := ""

             var partialRune []byte

@@ -165,8 +171,13 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
         tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
         tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing

+        response := string(reply.Message)
+        if c.TemplateConfig.ReplyPrefix != "" {
+            response = c.TemplateConfig.ReplyPrefix + response
+        }
+
         return LLMResponse{
-            Response: string(reply.Message),
+            Response: response,
             Usage:    tokenUsage,
         }, err
     }
@@ -40,10 +40,6 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
     grpcOpts := grpcModelOpts(c)
     defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

-    if so.SingleBackend {
-        defOpts = append(defOpts, model.WithSingleActiveBackend())
-    }
-
     if so.ParallelBackendRequests {
         defOpts = append(defOpts, model.EnableParallelRequests)
     }

@@ -103,7 +99,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
         mmap = *c.MMap
     }

-    ctxSize := 1024
+    ctxSize := 4096
     if c.ContextSize != nil {
         ctxSize = *c.ContextSize
     }

@@ -121,8 +117,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
     triggers := make([]*pb.GrammarTrigger, 0)
     for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
         triggers = append(triggers, &pb.GrammarTrigger{
             Word:    t.Word,
-            AtStart: t.AtStart,
         })

     }
@@ -159,35 +154,36 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
         SwapSpace:          int32(c.SwapSpace),
         MaxModelLen:        int32(c.MaxModelLen),
         TensorParallelSize: int32(c.TensorParallelSize),
-        MMProj:             c.MMProj,
-        FlashAttention:     c.FlashAttention,
-        CacheTypeKey:       c.CacheTypeK,
-        CacheTypeValue:     c.CacheTypeV,
-        NoKVOffload:        c.NoKVOffloading,
-        YarnExtFactor:      c.YarnExtFactor,
-        YarnAttnFactor:     c.YarnAttnFactor,
-        YarnBetaFast:       c.YarnBetaFast,
-        YarnBetaSlow:       c.YarnBetaSlow,
-        NGQA:               c.NGQA,
-        RMSNormEps:         c.RMSNormEps,
-        MLock:              mmlock,
-        RopeFreqBase:       c.RopeFreqBase,
-        RopeScaling:        c.RopeScaling,
-        Type:               c.ModelType,
-        RopeFreqScale:      c.RopeFreqScale,
-        NUMA:               c.NUMA,
-        Embeddings:         embeddings,
-        LowVRAM:            lowVRAM,
-        NGPULayers:         int32(nGPULayers),
-        MMap:               mmap,
-        MainGPU:            c.MainGPU,
-        Threads:            int32(*c.Threads),
-        TensorSplit:        c.TensorSplit,
-        // AutoGPTQ
-        ModelBaseName:    c.AutoGPTQ.ModelBaseName,
-        Device:           c.AutoGPTQ.Device,
-        UseTriton:        c.AutoGPTQ.Triton,
-        UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+        DisableLogStatus:    c.DisableLogStatus,
+        DType:               c.DType,
+        // LimitMMPerPrompt vLLM
+        LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+        LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+        LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+        MMProj:              c.MMProj,
+        FlashAttention:      c.FlashAttention,
+        CacheTypeKey:        c.CacheTypeK,
+        CacheTypeValue:      c.CacheTypeV,
+        NoKVOffload:         c.NoKVOffloading,
+        YarnExtFactor:       c.YarnExtFactor,
+        YarnAttnFactor:      c.YarnAttnFactor,
+        YarnBetaFast:        c.YarnBetaFast,
+        YarnBetaSlow:        c.YarnBetaSlow,
+        NGQA:                c.NGQA,
+        RMSNormEps:          c.RMSNormEps,
+        MLock:               mmlock,
+        RopeFreqBase:        c.RopeFreqBase,
+        RopeScaling:         c.RopeScaling,
+        Type:                c.ModelType,
+        RopeFreqScale:       c.RopeFreqScale,
+        NUMA:                c.NUMA,
+        Embeddings:          embeddings,
+        LowVRAM:             lowVRAM,
+        NGPULayers:          int32(nGPULayers),
+        MMap:                mmap,
+        MainGPU:             c.MainGPU,
+        Threads:             int32(*c.Threads),
+        TensorSplit:         c.TensorSplit,
         // RWKV
         Tokenizer: c.Tokenizer,
     }
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
     opts := ModelOptions(backendConfig, appConfig)
     rerankModel, err := loader.Load(opts...)
-
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     if rerankModel == nil {
         return nil, fmt.Errorf("could not load rerank model")

@@ -26,21 +26,26 @@ func SoundGeneration(

     opts := ModelOptions(backendConfig, appConfig)
     soundGenModel, err := loader.Load(opts...)
-
     if err != nil {
         return "", nil, err
     }
+    defer loader.Close()

     if soundGenModel == nil {
         return "", nil, fmt.Errorf("could not load sound generation model")
     }

-    if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
+    if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil {
         return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
     }

-    fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
-    filePath := filepath.Join(appConfig.AudioDir, fileName)
+    audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+    if err := os.MkdirAll(audioDir, 0750); err != nil {
+        return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
+    }
+
+    fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
+    filePath := filepath.Join(audioDir, fileName)

     res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
         Text: text,
@@ -20,6 +20,7 @@ func TokenMetrics(
     if err != nil {
         return nil, err
     }
+    defer loader.Close()

     if model == nil {
         return nil, fmt.Errorf("could not loadmodel model")

@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac

     opts := ModelOptions(backendConfig, appConfig)
     inferenceModel, err = loader.Load(opts...)
-
     if err != nil {
         return schema.TokenizeResponse{}, err
     }
+    defer loader.Close()

     predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
     predictOptions.Prompt = s

@@ -24,6 +24,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
     if err != nil {
         return nil, err
     }
+    defer ml.Close()

     if transcriptionModel == nil {
         return nil, fmt.Errorf("could not load transcription model")
@@ -23,21 +23,22 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
     opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
     ttsModel, err := loader.Load(opts...)
-
     if err != nil {
         return "", nil, err
     }
+    defer loader.Close()

     if ttsModel == nil {
         return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
     }

-    if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
+    audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+    if err := os.MkdirAll(audioDir, 0750); err != nil {
         return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
     }

-    fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
-    filePath := filepath.Join(appConfig.AudioDir, fileName)
+    fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
+    filePath := filepath.Join(audioDir, fileName)

     // We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
     // This should be addressed in a follow up PR soon.

@@ -19,6 +19,8 @@ func VAD(request *schema.VADRequest,
     if err != nil {
         return nil, err
     }
+    defer ml.Close()
+
     req := proto.VADRequest{
         Audio: request.Audio,
     }

core/backend/video.go (new file, +36 lines)
@@ -0,0 +1,36 @@
+package backend
+
+import (
+    "github.com/mudler/LocalAI/core/config"
+
+    "github.com/mudler/LocalAI/pkg/grpc/proto"
+    model "github.com/mudler/LocalAI/pkg/model"
+)
+
+func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
+
+    opts := ModelOptions(backendConfig, appConfig)
+    inferenceModel, err := loader.Load(
+        opts...,
+    )
+    if err != nil {
+        return nil, err
+    }
+    defer loader.Close()
+
+    fn := func() error {
+        _, err := inferenceModel.GenerateVideo(
+            appConfig.Context,
+            &proto.GenerateVideoRequest{
+                Height:     height,
+                Width:      width,
+                Prompt:     prompt,
+                StartImage: startImage,
+                EndImage:   endImage,
+                Dst:        dst,
+            })
+        return err
+    }
+
+    return fn, nil
+}
@@ -1,11 +1,13 @@
 package cliContext

-import "embed"
+import (
+    rice "github.com/GeertJohan/go.rice"
+)

 type Context struct {
     Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
     LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`

     // This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
-    BackendAssets embed.FS `kong:"-"`
+    BackendAssets *rice.Box `kong:"-"`
 }

@@ -21,8 +21,7 @@ type RunCMD struct {

     ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
     BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
-    ImagePath         string `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
-    AudioPath         string `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
+    GeneratedContentPath string `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"`
     UploadPath        string `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
     ConfigPath        string `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
     LocalaiConfigDir  string `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
@@ -38,7 +37,7 @@ type RunCMD struct {

     F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
     Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
-    ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
+    ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`

     Address string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
     CORS    bool   `env:"LOCALAI_CORS,CORS" help:"" group:"api"`

@@ -47,7 +46,7 @@ type RunCMD struct {
     CSRF                   bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
     UploadLimit            int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
     APIKeys                []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-    DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+    DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"`
     DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
     OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
     UseSubtleKeyComparison bool     `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
@@ -81,8 +80,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
         config.WithModelPath(r.ModelsPath),
         config.WithContextSize(r.ContextSize),
         config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
-        config.WithImageDir(r.ImagePath),
-        config.WithAudioDir(r.AudioPath),
+        config.WithGeneratedContentDir(r.GeneratedContentPath),
         config.WithUploadDir(r.UploadPath),
         config.WithConfigsDir(r.ConfigPath),
         config.WithDynamicConfigDir(r.LocalaiConfigDir),
@@ -70,11 +70,11 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
     opts := &config.ApplicationConfig{
         ModelPath:            t.ModelsPath,
         Context:              context.Background(),
-        AudioDir:             outputDir,
+        GeneratedContentDir:  outputDir,
         AssetsDestination:    t.BackendAssetsPath,
         ExternalGRPCBackends: externalBackends,
     }
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

     defer func() {
         err := ml.StopAllGRPC()

@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
     }

     cl := config.NewBackendConfigLoader(t.ModelsPath)
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
     if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
         return err
     }

@@ -36,12 +36,12 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
     text := strings.Join(t.Text, " ")

     opts := &config.ApplicationConfig{
         ModelPath:           t.ModelsPath,
         Context:             context.Background(),
-        AudioDir:            outputDir,
+        GeneratedContentDir: outputDir,
         AssetsDestination:   t.BackendAssetsPath,
     }
-    ml := model.NewModelLoader(opts.ModelPath)
+    ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)

     defer func() {
         err := ml.StopAllGRPC()
@@ -7,11 +7,11 @@ import (

     "github.com/rs/zerolog/log"

+    gguf "github.com/gpustack/gguf-parser-go"
     cliContext "github.com/mudler/LocalAI/core/cli/context"
     "github.com/mudler/LocalAI/core/config"
     "github.com/mudler/LocalAI/core/gallery"
     "github.com/mudler/LocalAI/pkg/downloader"
-    gguf "github.com/thxcode/gguf-parser-go"
 )

 type UtilCMD struct {

@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
     log.Info().
         Any("eosTokenID", f.Tokenizer().EOSTokenID).
         Any("bosTokenID", f.Tokenizer().BOSTokenID).
-        Any("modelName", f.Model().Name).
+        Any("modelName", f.Metadata().Name).
         Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])

     log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
@@ -2,11 +2,11 @@ package config

 import (
     "context"
-    "embed"
     "encoding/json"
     "regexp"
     "time"

+    rice "github.com/GeertJohan/go.rice"
     "github.com/mudler/LocalAI/pkg/xsysinfo"
     "github.com/rs/zerolog/log"
 )

@@ -19,20 +19,21 @@ type ApplicationConfig struct {
     UploadLimitMB, Threads, ContextSize int
     F16                                 bool
     Debug                               bool
-    ImageDir                            string
-    AudioDir                            string
-    UploadDir                           string
-    ConfigsDir                          string
-    DynamicConfigsDir                   string
-    DynamicConfigsDirPollInterval       time.Duration
-    CORS                                bool
-    CSRF                                bool
-    PreloadJSONModels                   string
-    PreloadModelsFromPath               string
-    CORSAllowOrigins                    string
-    ApiKeys                             []string
-    P2PToken                            string
-    P2PNetworkID                        string
+    GeneratedContentDir                 string
+    ConfigsDir                          string
+    UploadDir                           string
+    DynamicConfigsDir                   string
+    DynamicConfigsDirPollInterval       time.Duration
+    CORS                                bool
+    CSRF                                bool
+    PreloadJSONModels                   string
+    PreloadModelsFromPath               string
+    CORSAllowOrigins                    string
+    ApiKeys                             []string
+    P2PToken                            string
+    P2PNetworkID                        string

     DisableWebUI            bool
     EnforcePredownloadScans bool
@@ -46,7 +47,7 @@ type ApplicationConfig struct {

     Galleries []Gallery

-    BackendAssets     embed.FS
+    BackendAssets     *rice.Box
     AssetsDestination string

     ExternalGRPCBackends map[string]string

@@ -197,7 +198,7 @@ func WithBackendAssetsOutput(out string) AppOption {
     }
 }

-func WithBackendAssets(f embed.FS) AppOption {
+func WithBackendAssets(f *rice.Box) AppOption {
     return func(o *ApplicationConfig) {
         o.BackendAssets = f
     }

@@ -279,15 +280,9 @@ func WithDebug(debug bool) AppOption {
     }
 }

-func WithAudioDir(audioDir string) AppOption {
+func WithGeneratedContentDir(generatedContentDir string) AppOption {
     return func(o *ApplicationConfig) {
-        o.AudioDir = audioDir
-    }
-}
-
-func WithImageDir(imageDir string) AppOption {
-    return func(o *ApplicationConfig) {
-        o.ImageDir = imageDir
+        o.GeneratedContentDir = generatedContentDir
     }
 }
@@ -50,9 +50,6 @@ type BackendConfig struct {
     // LLM configs (GPT4ALL, Llama.cpp, ...)
     LLMConfig `yaml:",inline"`

-    // AutoGPTQ specifics
-    AutoGPTQ AutoGPTQ `yaml:"autogptq"`
-
     // Diffusers
     Diffusers Diffusers `yaml:"diffusers"`
     Step      int       `yaml:"step"`

@@ -130,25 +127,28 @@ type LLMConfig struct {
     TrimSpace  []string `yaml:"trimspace"`
     TrimSuffix []string `yaml:"trimsuffix"`

     ContextSize          *int      `yaml:"context_size"`
     NUMA                 bool      `yaml:"numa"`
     LoraAdapter          string    `yaml:"lora_adapter"`
     LoraBase             string    `yaml:"lora_base"`
     LoraAdapters         []string  `yaml:"lora_adapters"`
     LoraScales           []float32 `yaml:"lora_scales"`
     LoraScale            float32   `yaml:"lora_scale"`
     NoMulMatQ            bool      `yaml:"no_mulmatq"`
     DraftModel           string    `yaml:"draft_model"`
     NDraft               int32     `yaml:"n_draft"`
     Quantization         string    `yaml:"quantization"`
     LoadFormat           string    `yaml:"load_format"`
     GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
     TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
     EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
     SwapSpace            int       `yaml:"swap_space"`             // vLLM
     MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
     TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-    MMProj               string    `yaml:"mmproj"`
+    DisableLogStatus     bool      `yaml:"disable_log_stats"`      // vLLM
+    DType                string    `yaml:"dtype"`                  // vLLM
+    LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
+    MMProj               string    `yaml:"mmproj"`

     FlashAttention bool `yaml:"flash_attention"`
     NoKVOffloading bool `yaml:"no_kv_offloading"`

@@ -166,12 +166,11 @@ type LLMConfig struct {
     CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }

-// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
-type AutoGPTQ struct {
-    ModelBaseName    string `yaml:"model_base_name"`
-    Device           string `yaml:"device"`
-    Triton           bool   `yaml:"triton"`
-    UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+    LimitImagePerPrompt int `yaml:"image"`
+    LimitVideoPerPrompt int `yaml:"video"`
+    LimitAudioPerPrompt int `yaml:"audio"`
 }

 // TemplateConfig is a struct that holds the configuration of the templating system

@@ -203,6 +202,8 @@ type TemplateConfig struct {
     Multimodal string `yaml:"multimodal"`

     JinjaTemplate bool `yaml:"jinja_template"`
+
+    ReplyPrefix string `yaml:"reply_prefix"`
 }

 func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
@@ -212,7 +213,15 @@ func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
         return err
     }
     *c = BackendConfig(aux)

     c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
+    // Make sure the usecases are valid, we rewrite with what we identified
+    c.KnownUsecaseStrings = []string{}
+    for k, usecase := range GetAllBackendConfigUsecases() {
+        if c.HasUsecases(usecase) {
+            c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
+        }
+    }
     return nil
 }
@@ -295,9 +304,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
     defaultTFZ := 1.0
     defaultZero := 0
-
-    // Try to offload all GPU layers (if GPU is found)
-    defaultHigh := 99999999

     trueV := true
     falseV := false

@@ -357,9 +363,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
     if cfg.MirostatTAU == nil {
         cfg.MirostatTAU = &defaultMirostatTAU
     }
-    if cfg.NGPULayers == nil {
-        cfg.NGPULayers = &defaultHigh
-    }

     if cfg.LowVRAM == nil {
         cfg.LowVRAM = &falseV

@@ -369,16 +372,6 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
         cfg.Embeddings = &falseV
     }

-    // Value passed by the top level are treated as default (no implicit defaults)
-    // defaults are set by the user
-    if ctx == 0 {
-        ctx = 1024
-    }
-
-    if cfg.ContextSize == nil {
-        cfg.ContextSize = &ctx
-    }
-
     if threads == 0 {
         // Threads can't be 0
         threads = 4

@@ -400,7 +393,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
         cfg.Debug = &trueV
     }

-    guessDefaultsFromFile(cfg, lo.modelPath)
+    guessDefaultsFromFile(cfg, lo.modelPath, ctx)
 }

 func (c *BackendConfig) Validate() bool {
@@ -437,18 +430,19 @@ func (c *BackendConfig) HasTemplate() bool {
 type BackendConfigUsecases int

 const (
-    FLAG_ANY              BackendConfigUsecases = 0b00000000000
-    FLAG_CHAT             BackendConfigUsecases = 0b00000000001
-    FLAG_COMPLETION       BackendConfigUsecases = 0b00000000010
-    FLAG_EDIT             BackendConfigUsecases = 0b00000000100
-    FLAG_EMBEDDINGS       BackendConfigUsecases = 0b00000001000
-    FLAG_RERANK           BackendConfigUsecases = 0b00000010000
-    FLAG_IMAGE            BackendConfigUsecases = 0b00000100000
-    FLAG_TRANSCRIPT       BackendConfigUsecases = 0b00001000000
-    FLAG_TTS              BackendConfigUsecases = 0b00010000000
-    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
-    FLAG_TOKENIZE         BackendConfigUsecases = 0b01000000000
-    FLAG_VAD              BackendConfigUsecases = 0b10000000000
+    FLAG_ANY              BackendConfigUsecases = 0b000000000000
+    FLAG_CHAT             BackendConfigUsecases = 0b000000000001
+    FLAG_COMPLETION       BackendConfigUsecases = 0b000000000010
+    FLAG_EDIT             BackendConfigUsecases = 0b000000000100
+    FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000000001000
+    FLAG_RERANK           BackendConfigUsecases = 0b000000010000
+    FLAG_IMAGE            BackendConfigUsecases = 0b000000100000
+    FLAG_TRANSCRIPT       BackendConfigUsecases = 0b000001000000
+    FLAG_TTS              BackendConfigUsecases = 0b000010000000
+    FLAG_SOUND_GENERATION BackendConfigUsecases = 0b000100000000
+    FLAG_TOKENIZE         BackendConfigUsecases = 0b001000000000
+    FLAG_VAD              BackendConfigUsecases = 0b010000000000
+    FLAG_VIDEO            BackendConfigUsecases = 0b100000000000

     // Common Subsets
     FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT

@@ -469,9 +463,14 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
         "FLAG_TOKENIZE": FLAG_TOKENIZE,
         "FLAG_VAD":      FLAG_VAD,
         "FLAG_LLM":      FLAG_LLM,
+        "FLAG_VIDEO":    FLAG_VIDEO,
     }
 }

+func stringToFlag(s string) string {
+    return "FLAG_" + strings.ToUpper(s)
+}
+
 func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
     if len(input) == 0 {
         return nil

@@ -479,7 +478,7 @@ func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
     result := FLAG_ANY
     flags := GetAllBackendConfigUsecases()
     for _, str := range input {
-        flag, exists := flags["FLAG_"+strings.ToUpper(str)]
+        flag, exists := flags[stringToFlag(str)]
         if exists {
             result |= flag
         }
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if (u & FLAG_VIDEO) == FLAG_VIDEO {
|
||||||
|
videoBackends := []string{"diffusers", "stablediffusion"}
|
||||||
|
if !slices.Contains(videoBackends, c.Backend) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if (u & FLAG_RERANK) == FLAG_RERANK {
|
if (u & FLAG_RERANK) == FLAG_RERANK {
|
||||||
if c.Backend != "rerankers" {
|
if c.Backend != "rerankers" {
|
||||||
|
@ -541,7 +551,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (u & FLAG_TTS) == FLAG_TTS {
|
if (u & FLAG_TTS) == FLAG_TTS {
|
||||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
|
||||||
if !slices.Contains(ttsBackends, c.Backend) {
|
if !slices.Contains(ttsBackends, c.Backend) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
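Aside (not part of the diff): the usecase flags above are plain bit masks, so widening every literal from eleven to twelve binary digits only adds a leading zero to make room for FLAG_VIDEO; the existing values are unchanged. A model's declared usecases combine with bitwise OR and are tested with the same mask-and-compare check GuessUsecases uses. A minimal, self-contained Go sketch, with a handful of the constants redeclared locally purely for illustration:

package main

import "fmt"

// Redeclared locally for this sketch only; the real constants live in
// core/config/backend_config.go.
type BackendConfigUsecases int

const (
	FLAG_ANY   BackendConfigUsecases = 0b000000000000
	FLAG_CHAT  BackendConfigUsecases = 0b000000000001
	FLAG_IMAGE BackendConfigUsecases = 0b000000100000
	FLAG_VIDEO BackendConfigUsecases = 0b100000000000
)

func main() {
	// A hypothetical diffusers-backed model that serves image and video requests.
	usecases := FLAG_IMAGE | FLAG_VIDEO

	// Mask-and-compare: a flag is set when masking leaves it intact.
	fmt.Println((usecases & FLAG_VIDEO) == FLAG_VIDEO) // true
	fmt.Println((usecases & FLAG_CHAT) == FLAG_CHAT)   // false

	// FLAG_ANY is zero, so every configuration matches it.
	fmt.Println((usecases & FLAG_ANY) == FLAG_ANY) // true
}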
core/config/gguf.go (new file, 296 lines)
@@ -0,0 +1,296 @@
+package config
+
+import (
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/rs/zerolog/log"
+
+	gguf "github.com/gpustack/gguf-parser-go"
+)
+
+type familyType uint8
+
+const (
+	Unknown familyType = iota
+	LLaMa3
+	CommandR
+	Phi3
+	ChatML
+	Mistral03
+	Gemma
+	DeepSeek2
+)
+
+const (
+	defaultContextSize = 1024
+	defaultNGPULayers  = 99999999
+)
+
+type settingsConfig struct {
+	StopWords      []string
+	TemplateConfig TemplateConfig
+	RepeatPenalty  float64
+}
+
+// default settings to adopt with a given model family
+var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
+	Gemma: {
+		RepeatPenalty: 1.0,
+		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input }}\n<start_of_turn>model\n",
+			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
+			Completion:  "{{.Input}}",
+		},
+	},
+	DeepSeek2: {
+		StopWords: []string{"<|end▁of▁sentence|>"},
+		TemplateConfig: TemplateConfig{
+			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
+{{ end -}}
+{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
+{{if eq .RoleName "system" -}}{{.Content}}
+{{end -}}`,
+			Chat: "{{.Input -}}\nAssistant: ",
+		},
+	},
+	LLaMa3: {
+		StopWords: []string{"<|eot_id|>"},
+		TemplateConfig: TemplateConfig{
+			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
+			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
+		},
+	},
+	CommandR: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
+			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
+You are a function calling AI model, you can call the following functions:
+## Available Tools
+{{range .Functions}}
+- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
+{{end}}
+When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
+<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "system" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "assistant" -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if eq .RoleName "tool" -}}
+<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
+{{- else if .FunctionCall -}}
+<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
+{{- end -}}`,
+		},
+		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
+	},
+	Phi3: {
+		TemplateConfig: TemplateConfig{
+			Chat:        "{{.Input}}\n<|assistant|>",
+			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
+			Completion:  "{{.Input}}",
+		},
+		StopWords: []string{"<|end|>", "<|endoftext|>"},
+	},
+	ChatML: {
+		TemplateConfig: TemplateConfig{
+			Chat: "{{.Input -}}\n<|im_start|>assistant",
+			Functions: `<|im_start|>system
+You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+{{range .Functions}}
+{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+{{end}}
+For each function call return a json object with function name and arguments
+<|im_end|>
+{{.Input -}}
+<|im_start|>assistant`,
+			ChatMessage: `<|im_start|>{{ .RoleName }}
+{{ if .FunctionCall -}}
+Function call:
+{{ else if eq .RoleName "tool" -}}
+Function response:
+{{ end -}}
+{{ if .Content -}}
+{{.Content }}
+{{ end -}}
+{{ if .FunctionCall -}}
+{{toJson .FunctionCall}}
+{{ end -}}<|im_end|>`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
+	},
+	Mistral03: {
+		TemplateConfig: TemplateConfig{
+			Chat:      "{{.Input -}}",
+			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
+			ChatMessage: `{{if eq .RoleName "user" -}}
+[INST] {{.Content }} [/INST]
+{{- else if .FunctionCall -}}
+[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
+{{- else if eq .RoleName "tool" -}}
+[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
+{{- else -}}
+{{ .Content -}}
+{{ end -}}`,
+		},
+		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
+	},
+}
+
+// this maps well known template used in HF to model families defined above
+var knownTemplates = map[string]familyType{
+	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
+	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
+}
+
+func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
+
+	if defaultCtx == 0 && cfg.ContextSize == nil {
+		ctxSize := f.EstimateLLaMACppRun().ContextSize
+		if ctxSize > 0 {
+			cSize := int(ctxSize)
+			cfg.ContextSize = &cSize
+		} else {
+			defaultCtx = defaultContextSize
+			cfg.ContextSize = &defaultCtx
+		}
+	}
+
+	// GPU options
+	if cfg.Options == nil {
+		if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
+			cfg.Options = []string{"gpu"}
+		}
+	}
+
+	// vram estimation
+	vram, err := xsysinfo.TotalAvailableVRAM()
+	if err != nil {
+		log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
+	} else if vram > 0 {
+		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
+		if err != nil {
+			log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
+		} else {
+			if estimate.IsFullOffload {
+				log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
+			}
+
+			if estimate.EstimatedVRAM > vram {
+				log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
+			}
+
+			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
+				log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
+				cfg.NGPULayers = &estimate.EstimatedLayers
+			}
+		}
+	}
+
+	if cfg.NGPULayers == nil {
+		// we assume we want to offload all layers
+		defaultHigh := defaultNGPULayers
+		cfg.NGPULayers = &defaultHigh
+	}
+
+	log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")
+
+	// template estimations
+	if cfg.HasTemplate() {
+		// nothing to guess here
+		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
+		return
+	}
+
+	log.Debug().
+		Any("eosTokenID", f.Tokenizer().EOSTokenID).
+		Any("bosTokenID", f.Tokenizer().BOSTokenID).
+		Any("modelName", f.Metadata().Name).
+		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
+
+	// guess the name
+	if cfg.Name == "" {
+		cfg.Name = f.Metadata().Name
+	}
+
+	family := identifyFamily(f)
+
+	if family == Unknown {
+		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
+		return
+	}
+
+	// identify template
+	settings, ok := defaultsSettings[family]
+	if ok {
+		cfg.TemplateConfig = settings.TemplateConfig
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
+		if len(cfg.StopWords) == 0 {
+			cfg.StopWords = settings.StopWords
+		}
+		if cfg.RepeatPenalty == 0.0 {
+			cfg.RepeatPenalty = settings.RepeatPenalty
+		}
+	} else {
+		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
+	}
+
+	if cfg.HasTemplate() {
+		return
+	}
+
+	// identify from well known templates first, otherwise use the raw jinja template
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found {
+		// try to use the jinja template
+		cfg.TemplateConfig.JinjaTemplate = true
+		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
+	}
+
+}
+
+func identifyFamily(f *gguf.GGUFFile) familyType {
+
+	// identify from well known templates first
+	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if found && chatTemplate.ValueString() != "" {
+		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
+			return family
+		}
+	}
+
+	// otherwise try to identify from the model properties
+	arch := f.Architecture().Architecture
+	eosTokenID := f.Tokenizer().EOSTokenID
+	bosTokenID := f.Tokenizer().BOSTokenID
+
+	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
+	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
+
+	llama3 := arch == "llama" && eosTokenID == 128009
+	commandR := arch == "command-r" && eosTokenID == 255001
+	qwen2 := arch == "qwen2"
+	phi3 := arch == "phi-3"
+	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
+	deepseek2 := arch == "deepseek2"
+
+	switch {
+	case deepseek2:
+		return DeepSeek2
+	case gemma:
+		return Gemma
+	case llama3:
+		return LLaMa3
+	case commandR:
+		return CommandR
+	case phi3:
+		return Phi3
+	case qwen2, isYI:
+		return ChatML
+	default:
+		return Unknown
+	}
+}
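Aside (not part of the diff): everything guessGGUFFromFile decides is derived from GGUF metadata read through gguf-parser-go. A rough standalone sketch that prints the same fields the guesser inspects for a local model file; the command-line handling and output formatting are illustrative only:

package main

import (
	"fmt"
	"os"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: ggufinfo <model.gguf>")
		os.Exit(1)
	}

	// Parse the GGUF file's header and metadata.
	f, err := gguf.ParseGGUFFile(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, "not a GGUF file:", err)
		os.Exit(1)
	}

	// The same properties identifyFamily uses above.
	fmt.Println("name:        ", f.Metadata().Name)
	fmt.Println("architecture:", f.Architecture().Architecture)
	fmt.Println("bos token id:", f.Tokenizer().BOSTokenID)
	fmt.Println("eos token id:", f.Tokenizer().EOSTokenID)

	// Raw chat template, used as the Jinja fallback when no family matches.
	if tmpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
		fmt.Println("chat template:", tmpl.ValueString())
	}
}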
@@ -3,147 +3,12 @@ package config
 import (
 	"os"
 	"path/filepath"
-	"strings"
 
+	gguf "github.com/gpustack/gguf-parser-go"
 	"github.com/rs/zerolog/log"
-
-	gguf "github.com/thxcode/gguf-parser-go"
 )
 
-type familyType uint8
-
-const (
-	Unknown familyType = iota
-	LLaMa3
-	CommandR
-	Phi3
-	ChatML
-	Mistral03
-	Gemma
-	DeepSeek2
-)
-
-type settingsConfig struct {
-	StopWords      []string
-	TemplateConfig TemplateConfig
-	RepeatPenalty  float64
-}
-
-// default settings to adopt with a given model family
-var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
-	Gemma: {
-		RepeatPenalty: 1.0,
-		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input }}\n<start_of_turn>model\n",
-			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
-			Completion:  "{{.Input}}",
-		},
-	},
-	DeepSeek2: {
-		StopWords: []string{"<|end▁of▁sentence|>"},
-		TemplateConfig: TemplateConfig{
-			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
-{{ end -}}
-{{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<|end▁of▁sentence|>{{end}}
-{{if eq .RoleName "system" -}}{{.Content}}
-{{end -}}`,
-			Chat: "{{.Input -}}\nAssistant: ",
-		},
-	},
-	LLaMa3: {
-		StopWords: []string{"<|eot_id|>"},
-		TemplateConfig: TemplateConfig{
-			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
-			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
-		},
-	},
-	CommandR: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-You are a function calling AI model, you can call the following functions:
-## Available Tools
-{{range .Functions}}
-- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
-{{end}}
-When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
-<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "system" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "assistant" -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if eq .RoleName "tool" -}}
-<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
-{{- else if .FunctionCall -}}
-<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
-{{- end -}}`,
-		},
-		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
-	},
-	Phi3: {
-		TemplateConfig: TemplateConfig{
-			Chat:        "{{.Input}}\n<|assistant|>",
-			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
-			Completion:  "{{.Input}}",
-		},
-		StopWords: []string{"<|end|>", "<|endoftext|>"},
-	},
-	ChatML: {
-		TemplateConfig: TemplateConfig{
-			Chat: "{{.Input -}}\n<|im_start|>assistant",
-			Functions: `<|im_start|>system
-You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
-{{range .Functions}}
-{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-{{end}}
-For each function call return a json object with function name and arguments
-<|im_end|>
-{{.Input -}}
-<|im_start|>assistant`,
-			ChatMessage: `<|im_start|>{{ .RoleName }}
-{{ if .FunctionCall -}}
-Function call:
-{{ else if eq .RoleName "tool" -}}
-Function response:
-{{ end -}}
-{{ if .Content -}}
-{{.Content }}
-{{ end -}}
-{{ if .FunctionCall -}}
-{{toJson .FunctionCall}}
-{{ end -}}<|im_end|>`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
-	},
-	Mistral03: {
-		TemplateConfig: TemplateConfig{
-			Chat:      "{{.Input -}}",
-			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
-			ChatMessage: `{{if eq .RoleName "user" -}}
-[INST] {{.Content }} [/INST]
-{{- else if .FunctionCall -}}
-[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
-{{- else if eq .RoleName "tool" -}}
-[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
-{{- else -}}
-{{ .Content -}}
-{{ end -}}`,
-		},
-		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
-	},
-}
-
-// this maps well known template used in HF to model families defined above
-var knownTemplates = map[string]familyType{
-	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
-	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
-}
-
-func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
+func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
 	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
 		return
@@ -154,106 +19,20 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
 		return
 	}
 
-	if cfg.HasTemplate() {
-		// nothing to guess here
-		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
-		return
-	}
-
 	// We try to guess only if we don't have a template defined already
 	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
 
+	// try to parse the gguf file
 	f, err := gguf.ParseGGUFFile(guessPath)
-	if err != nil {
-		// Only valid for gguf files
-		log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
+	if err == nil {
+		guessGGUFFromFile(cfg, f, defaultCtx)
 		return
 	}
 
-	log.Debug().
-		Any("eosTokenID", f.Tokenizer().EOSTokenID).
-		Any("bosTokenID", f.Tokenizer().BOSTokenID).
-		Any("modelName", f.Model().Name).
-		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
-
-	// guess the name
-	if cfg.Name == "" {
-		cfg.Name = f.Model().Name
-	}
-
-	family := identifyFamily(f)
-
-	if family == Unknown {
-		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
-		return
-	}
-
-	// identify template
-	settings, ok := defaultsSettings[family]
-	if ok {
-		cfg.TemplateConfig = settings.TemplateConfig
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
-		if len(cfg.StopWords) == 0 {
-			cfg.StopWords = settings.StopWords
-		}
-		if cfg.RepeatPenalty == 0.0 {
-			cfg.RepeatPenalty = settings.RepeatPenalty
-		}
-	} else {
-		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
-	}
-
-	if cfg.HasTemplate() {
-		return
-	}
-
-	// identify from well known templates first, otherwise use the raw jinja template
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found {
-		// try to use the jinja template
-		cfg.TemplateConfig.JinjaTemplate = true
-		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
-	}
-}
-
-func identifyFamily(f *gguf.GGUFFile) familyType {
-
-	// identify from well known templates first
-	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
-	if found && chatTemplate.ValueString() != "" {
-		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
-			return family
-		}
-	}
-
-	// otherwise try to identify from the model properties
-	arch := f.Architecture().Architecture
-	eosTokenID := f.Tokenizer().EOSTokenID
-	bosTokenID := f.Tokenizer().BOSTokenID
-
-	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
-	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID
-
-	llama3 := arch == "llama" && eosTokenID == 128009
-	commandR := arch == "command-r" && eosTokenID == 255001
-	qwen2 := arch == "qwen2"
-	phi3 := arch == "phi-3"
-	gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
-	deepseek2 := arch == "deepseek2"
-
-	switch {
-	case deepseek2:
-		return DeepSeek2
-	case gemma:
-		return Gemma
-	case llama3:
-		return LLaMa3
-	case commandR:
-		return CommandR
-	case phi3:
-		return Phi3
-	case qwen2, isYI:
-		return ChatML
-	default:
-		return Unknown
-	}
+	if cfg.ContextSize == nil {
+		if defaultCtx == 0 {
+			defaultCtx = defaultContextSize
+		}
+		cfg.ContextSize = &defaultCtx
+	}
 }
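Aside (not part of the diff): the net effect of this refactor is that context-size defaulting moves out of SetDefaults and into the guesser: GGUF models receive an estimated context size inside guessGGUFFromFile, while anything else falls back to the caller-supplied default or 1024. A minimal sketch of that fallback branch, with the constant and a stripped-down config struct redeclared locally so it runs on its own:

package main

import "fmt"

// Mirrors defaultContextSize from core/config/gguf.go.
const defaultContextSize = 1024

// Stripped-down stand-in for BackendConfig, for this sketch only.
type backendConfig struct {
	ContextSize *int
}

// fallbackContextSize mirrors the non-GGUF branch of guessDefaultsFromFile:
// fill ContextSize only when it is unset, preferring the caller-supplied default.
func fallbackContextSize(cfg *backendConfig, defaultCtx int) {
	if cfg.ContextSize == nil {
		if defaultCtx == 0 {
			defaultCtx = defaultContextSize
		}
		cfg.ContextSize = &defaultCtx
	}
}

func main() {
	var a, b backendConfig
	fallbackContextSize(&a, 0)    // no default passed in
	fallbackContextSize(&b, 4096) // caller-supplied default wins
	fmt.Println(*a.ContextSize, *b.ContextSize) // 1024 4096
}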
Some files were not shown because too many files have changed in this diff.