Mirror of https://github.com/mudler/LocalAI.git, synced 2025-06-27 13:15:00 +00:00
Merge branch 'master' into feat-request-middleware
This commit is contained in: commit 5b01500a60

8 changed files with 87 additions and 350 deletions
Makefile (40 changes)

@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
+CPPLLAMA_VERSION?=e6e658319952f7ad269dc11275b9edddc721fc6d
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 export CXX=$(ROCM_HOME)/llvm/bin/clang++
 export CC=$(ROCM_HOME)/llvm/bin/clang
-# llama-ggml has no hipblas support, so override it here.
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif
 
 all: help
 
-## go-llama.cpp
-sources/go-llama.cpp:
-	mkdir -p sources/go-llama.cpp
-	cd sources/go-llama.cpp && \
-	git init && \
-	git remote add origin $(GOLLAMA_REPO) && \
-	git fetch origin && \
-	git checkout $(GOLLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-	$(MAKE) test-llama
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 
-test-llama: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
-
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/llama-ggml
-endif
-
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/

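Aside: the `test` target above drops the `!llama` exclusion because the spec carrying the `llama` label is deleted by this commit (see the test diff further down). A minimal sketch of how Ginkgo's label decorators interact with `--label-filter`; the package name and spec body here are assumed for illustration:

```go
package e2e_test

import (
	. "github.com/onsi/ginkgo/v2"
)

// Label attaches "llama-gguf" to the spec, so the main test run with
// `ginkgo --label-filter="!llama-gguf"` skips it, while `make
// test-llama-gguf` selects it with `--label-filter="llama-gguf"`.
var _ = Describe("API test", func() {
	It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
		// spec body elided
	})
})
```
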
@@ -1,204 +0,0 @@
-package main
-
-// This is a wrapper to satisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/go-llama.cpp"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type LLM struct {
-	base.SingleThread
-
-	llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-
-	llamaOpts := []llama.ModelOption{
-		llama.WithRopeFreqBase(ropeFreqBase),
-		llama.WithRopeFreqScale(ropeFreqScale),
-	}
-
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
-	if opts.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-	}
-	if opts.F16Memory {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if opts.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-	if opts.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-	if opts.NBatch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	if opts.NUMA {
-		llamaOpts = append(llamaOpts, llama.EnableNUMA)
-	}
-
-	if opts.LowVRAM {
-		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-	}
-
-	model, err := llama.New(opts.ModelFile, llamaOpts...)
-	llm.llama = model
-
-	return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(opts.Temperature),
-		llama.SetTopP(opts.TopP),
-		llama.SetTopK(int(opts.TopK)),
-		llama.SetTokens(int(opts.Tokens)),
-		llama.SetThreads(int(opts.Threads)),
-		llama.WithGrammar(opts.Grammar),
-		llama.SetRopeFreqBase(ropeFreqBase),
-		llama.SetRopeFreqScale(ropeFreqScale),
-		llama.SetNegativePromptScale(opts.NegativePromptScale),
-		llama.SetNegativePrompt(opts.NegativePrompt),
-	}
-
-	if opts.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if opts.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	// Expected absolute path
-	if opts.PromptCachePath != "" {
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-	}
-
-	if opts.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-	}
-
-	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
-	}
-
-	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
-	}
-
-	if opts.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
-	}
-
-	if opts.NKeep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.F16KV {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if opts.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
-	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
-	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
-	return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	predictOptions := buildPredictOptions(opts)
-
-	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-		results <- token
-		return true
-	}))
-
-	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		close(results)
-	}()
-
-	return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	predictOptions := buildPredictOptions(opts)
-
-	if len(opts.EmbeddingTokens) > 0 {
-		tokens := []int{}
-		for _, t := range opts.EmbeddingTokens {
-			tokens = append(tokens, int(t))
-		}
-		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-	}
-
-	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}

@@ -1,19 +0,0 @@
-package main
-
-import (
-	"flag"
-
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}

@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	defaultTopP := 0.95
 	defaultTopK := 40
 	defaultTemp := 0.9
-	defaultMirostat := 2
+	// https://github.com/mudler/LocalAI/issues/2780
+	defaultMirostat := 0
 	defaultMirostatTAU := 5.0
 	defaultMirostatETA := 0.1
 	defaultTypicalP := 1.0

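For context on the change above: `SetDefaults` only fills in a value when the YAML config left the field unset, so model configs that set `mirostat:` explicitly are unaffected; only the implicit default drops from 2 (mirostat v2) to 0 (disabled). A minimal sketch of that nil-guard pattern; the type and field names here are illustrative, not LocalAI's exact ones:

```go
package main

import "fmt"

// Illustrative config type; pointer fields let the loader tell "unset"
// apart from an explicit zero in the model YAML.
type modelConfig struct {
	Mirostat *int `yaml:"mirostat"`
}

func setDefaults(cfg *modelConfig) {
	defaultMirostat := 0 // previously 2; mirostat sampling now defaults to off
	if cfg.Mirostat == nil {
		cfg.Mirostat = &defaultMirostat
	}
}

func main() {
	var cfg modelConfig // no `mirostat:` key in the YAML
	setDefaults(&cfg)
	fmt.Println(*cfg.Mirostat) // 0; an explicit `mirostat: 2` would be kept
}
```
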
@@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
 			Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 		})
 
-		It("runs openllama(llama-ggml backend)", Label("llama"), func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
-				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
-			})
-
-			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-			uuid := response["uuid"].(string)
-
-			Eventually(func() bool {
-				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-				return response["processed"].(bool)
-			}, "360s", "10s").Should(Equal(true))
-
-			By("testing completion")
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-
-			By("testing functions")
-			resp2, err := client.CreateChatCompletion(
-				context.TODO(),
-				openai.ChatCompletionRequest{
-					Model: "openllama_3b",
-					Messages: []openai.ChatCompletionMessage{
-						{
-							Role:    "user",
-							Content: "What is the weather like in San Francisco (celsius)?",
-						},
-					},
-					Functions: []openai.FunctionDefinition{
-						openai.FunctionDefinition{
-							Name:        "get_current_weather",
-							Description: "Get the current weather",
-							Parameters: jsonschema.Definition{
-								Type: jsonschema.Object,
-								Properties: map[string]jsonschema.Definition{
-									"location": {
-										Type:        jsonschema.String,
-										Description: "The city and state, e.g. San Francisco, CA",
-									},
-									"unit": {
-										Type: jsonschema.String,
-										Enum: []string{"celcius", "fahrenheit"},
-									},
-								},
-								Required: []string{"location"},
-							},
-						},
-					},
-				})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp2.Choices)).To(Equal(1))
-			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-			var res map[string]string
-			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
-		})
-
 		It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")

@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 
 {{% alert note %}}
 
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 
 {{% /alert %}}
 
@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
+  model: file.gguf
 ```
 
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
-```
-
 #### Reference
 
 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)
 
 
 ### exllama/2

@@ -553,6 +553,29 @@
     - filename: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
       sha256: d8d4874b837993546b750db3faf1c6e5d867883a6750f04f1f4986973d7c107b
       uri: huggingface://bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF/Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "nohobby_l3.3-prikol-70b-v0.5"
+  icon: https://files.catbox.moe/x9t3zo.png
+  urls:
+    - https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.5
+    - https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF
+  description: |
+    99% of mergekit addicts quit before they hit it big.
+
+    Gosh, I need to create an org for my test runs - my profile looks like a dumpster.
+
+    What was it again? Ah, the new model.
+
+    Exactly what I wanted. All I had to do was yank out the cursed official DeepSeek distill and here we are.
+
+    From the brief tests it gave me some unusual takes on the character cards I'm used to. Just this makes it worth it imo. Also the writing is kinda nice.
+  overrides:
+    parameters:
+      model: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
+  files:
+    - filename: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
+      sha256: 36f29015f1f420f51569603445a3ea5fe72e3651c2022ef064086f5617578fe6
+      uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF/Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
 - &rwkv
   url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
   name: "rwkv-6-world-7b"
@@ -6890,6 +6913,60 @@
     - filename: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
      sha256: 03aa6d1fb7ab70482a2242839b8d8e1c789aa90a8be415076ddf84bef65f06c7
       uri: huggingface://bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF/krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "cognitivecomputations_dolphin3.0-r1-mistral-24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/hdAvdwZiJaLbGmvSZ3wTT.png
+  urls:
+    - https://huggingface.co/cognitivecomputations/Dolphin3.0-R1-Mistral-24B
+    - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF
+  description: |
+    Dolphin 3.0 R1 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
+  overrides:
+    parameters:
+      model: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+  files:
+    - filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+      sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c
+      uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "cognitivecomputations_dolphin3.0-mistral-24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/cNCs1TBD3FelWCJGkZ3cd.png
+  urls:
+    - https://huggingface.co/cognitivecomputations/Dolphin3.0-Mistral-24B
+    - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF
+  description: |
+    Dolphin 3.0 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
+  overrides:
+    parameters:
+      model: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+  files:
+    - filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+      sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994
+      uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "sicariussicariistuff_redemption_wind_24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B/resolve/main/Images/Redemption_Wind_24B.png
+  urls:
+    - https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B
+    - https://huggingface.co/bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF
+  description: |
+    This is a lightly fine-tuned version of the Mistral 24B base model, designed as an accessible and adaptable foundation for further fine-tuning and merging fodder. Key modifications include:
+    ChatML-ified, with no additional tokens introduced.
+    High quality private instruct—not generated by ChatGPT or Claude, ensuring no slop and good markdown understanding.
+    No refusals—since it’s a base model, refusals should be minimal to non-existent, though, in early testing, occasional warnings still appear (I assume some were baked into the pre-train).
+    High-quality private creative writing dataset Mainly to dilute baked-in slop further, but it can actually write some stories, not bad for loss ~8.
+    Small, high-quality private RP dataset This was done so further tuning for RP will be easier. The dataset was kept small and contains ZERO SLOP, some entries are of 16k token length.
+    Exceptional adherence to character cards This was done to make it easier for further tunes intended for roleplay.
+  overrides:
+    parameters:
+      model: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
+  files:
+    - filename: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
+      sha256: 40025eb00d83c9e9393555962962a2dfc5251fe7bd70812835ff0bcc55ecc463
+      uri: huggingface://bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF/SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
 - &mudler
   url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
   name: "LocalAI-llama3-8b-function-call-v0.2"

@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 
 const (
-	LlamaGGML = "llama-ggml"
-
 	LLamaCPP = "llama-cpp"
 
 	LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
 
 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}
 
 	toTheEnd := []string{
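With the `LlamaGGML` constant gone, the priority list reduces to the main llama.cpp backend plus its shared-lib fallback. A minimal sketch of the ordering idea behind `orderBackends`, assuming plain backend-name slices as input; the real function also handles external backends, a trailing group, and error returns:

```go
package main

import (
	"fmt"
	"slices"
)

// Illustrative constants mirroring pkg/model/initializers.go after this change.
const (
	llamaCPP         = "llama-cpp"
	llamaCPPFallback = "llama-cpp-fallback"
)

// orderBackends puts priority backends first and keeps everything else in
// detection order; llama-ggml no longer appears in the priority list.
func orderBackends(detected []string) []string {
	priority := []string{llamaCPP, llamaCPPFallback}
	ordered := []string{}
	for _, p := range priority {
		if slices.Contains(detected, p) {
			ordered = append(ordered, p)
		}
	}
	for _, b := range detected {
		if !slices.Contains(ordered, b) {
			ordered = append(ordered, b)
		}
	}
	return ordered
}

func main() {
	fmt.Println(orderBackends([]string{"whisper", "llama-cpp-fallback", "llama-cpp"}))
	// Output: [llama-cpp llama-cpp-fallback whisper]
}
```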