Merge branch 'master' into feat-request-middleware

Dave authored 2025-02-08 17:10:12 -05:00; committed via GitHub (commit 5b01500a60)
8 changed files with 87 additions and 350 deletions


@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
+CPPLLAMA_VERSION?=e6e658319952f7ad269dc11275b9edddc721fc6d
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 export CXX=$(ROCM_HOME)/llvm/bin/clang++
 export CC=$(ROCM_HOME)/llvm/bin/clang
-# llama-ggml has no hipblas support, so override it here.
 export STABLE_BUILD_TYPE=
 export GGML_HIP=1
 GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif
 all: help
-## go-llama.cpp
-sources/go-llama.cpp:
-    mkdir -p sources/go-llama.cpp
-    cd sources/go-llama.cpp && \
-    git init && \
-    git remote add origin $(GOLLAMA_REPO) && \
-    git fetch origin && \
-    git checkout $(GOLLAMA_VERSION) && \
-    git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-    $(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
     git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
     cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
     $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
     $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
     $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-    $(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
     $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
     $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
     $(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-    $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
     $(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
     $(GOCMD) clean -cache
-    $(MAKE) -C sources/go-llama.cpp clean
     $(MAKE) -C sources/whisper.cpp clean
     $(MAKE) -C sources/go-piper clean
     $(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
     mkdir test-models
    mkdir test-dir
-    wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+    wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
     wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
     wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
     wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
     export GO_TAGS="tts debug"
     $(MAKE) prepare-test
     HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-    $(MAKE) test-llama
+    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
     $(MAKE) test-llama-gguf
     $(MAKE) test-tts
     $(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
     rm -rf $(TEST_DIR) || true
     docker stop $$(docker ps -q --filter ancestor=localai-tests)
-test-llama: prepare-test
-    TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-    $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
     TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
     $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
     mkdir -p backend-assets/util/
     cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-    CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-    $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-    $(UPX) backend-assets/grpc/llama-ggml
-endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
     CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
     $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/


@@ -1,204 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-    "fmt"
-
-    "github.com/go-skynet/go-llama.cpp"
-    "github.com/mudler/LocalAI/pkg/grpc/base"
-    pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type LLM struct {
-    base.SingleThread
-
-    llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-    ropeFreqBase := float32(10000)
-    ropeFreqScale := float32(1)
-
-    if opts.RopeFreqBase != 0 {
-        ropeFreqBase = opts.RopeFreqBase
-    }
-    if opts.RopeFreqScale != 0 {
-        ropeFreqScale = opts.RopeFreqScale
-    }
-
-    llamaOpts := []llama.ModelOption{
-        llama.WithRopeFreqBase(ropeFreqBase),
-        llama.WithRopeFreqScale(ropeFreqScale),
-    }
-
-    if opts.NGQA != 0 {
-        llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-    }
-    if opts.RMSNormEps != 0 {
-        llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-    }
-    if opts.ContextSize != 0 {
-        llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-    }
-    if opts.F16Memory {
-        llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-    }
-    if opts.Embeddings {
-        llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-    }
-    if opts.NGPULayers != 0 {
-        llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-    }
-
-    llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-    llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-    llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-    if opts.NBatch != 0 {
-        llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-    } else {
-        llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-    }
-
-    if opts.NUMA {
-        llamaOpts = append(llamaOpts, llama.EnableNUMA)
-    }
-    if opts.LowVRAM {
-        llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-    }
-
-    model, err := llama.New(opts.ModelFile, llamaOpts...)
-    llm.llama = model
-
-    return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-    ropeFreqBase := float32(10000)
-    ropeFreqScale := float32(1)
-
-    if opts.RopeFreqBase != 0 {
-        ropeFreqBase = opts.RopeFreqBase
-    }
-    if opts.RopeFreqScale != 0 {
-        ropeFreqScale = opts.RopeFreqScale
-    }
-    predictOptions := []llama.PredictOption{
-        llama.SetTemperature(opts.Temperature),
-        llama.SetTopP(opts.TopP),
-        llama.SetTopK(int(opts.TopK)),
-        llama.SetTokens(int(opts.Tokens)),
-        llama.SetThreads(int(opts.Threads)),
-        llama.WithGrammar(opts.Grammar),
-        llama.SetRopeFreqBase(ropeFreqBase),
-        llama.SetRopeFreqScale(ropeFreqScale),
-        llama.SetNegativePromptScale(opts.NegativePromptScale),
-        llama.SetNegativePrompt(opts.NegativePrompt),
-    }
-
-    if opts.PromptCacheAll {
-        predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-    }
-    if opts.PromptCacheRO {
-        predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-    }
-
-    // Expected absolute path
-    if opts.PromptCachePath != "" {
-        predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-    }
-
-    if opts.Mirostat != 0 {
-        predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-    }
-    if opts.MirostatETA != 0 {
-        predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
-    }
-    if opts.MirostatTAU != 0 {
-        predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
-    }
-    if opts.Debug {
-        predictOptions = append(predictOptions, llama.Debug)
-    }
-
-    predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-    if opts.PresencePenalty != 0 {
-        predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
-    }
-    if opts.NKeep != 0 {
-        predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-    }
-    if opts.Batch != 0 {
-        predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-    }
-    if opts.F16KV {
-        predictOptions = append(predictOptions, llama.EnableF16KV)
-    }
-    if opts.IgnoreEOS {
-        predictOptions = append(predictOptions, llama.IgnoreEOS)
-    }
-    if opts.Seed != 0 {
-        predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-    }
-
-    //predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-    predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
-    predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-    predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-    predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-    predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-    predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
-    predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
-    return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-    return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-    predictOptions := buildPredictOptions(opts)
-
-    predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-        results <- token
-        return true
-    }))
-    go func() {
-        _, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-        if err != nil {
-            fmt.Println("err: ", err)
-        }
-        close(results)
-    }()
-    return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-    predictOptions := buildPredictOptions(opts)
-
-    if len(opts.EmbeddingTokens) > 0 {
-        tokens := []int{}
-        for _, t := range opts.EmbeddingTokens {
-            tokens = append(tokens, int(t))
-        }
-        return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-    }
-
-    return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}


@@ -1,19 +0,0 @@
-package main
-
-import (
-    "flag"
-
-    grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-    addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-    flag.Parse()
-
-    if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-        panic(err)
-    }
-}


@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
     defaultTopP := 0.95
     defaultTopK := 40
     defaultTemp := 0.9
-    defaultMirostat := 2
+    // https://github.com/mudler/LocalAI/issues/2780
+    defaultMirostat := 0
     defaultMirostatTAU := 5.0
     defaultMirostatETA := 0.1
     defaultTypicalP := 1.0
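With the default flipped from 2 to 0, mirostat sampling becomes opt-in per model. A minimal sketch of a model YAML that re-enables it; the field names (mirostat, mirostat_tau, mirostat_eta) are assumptions inferred from the defaults shown above and are not confirmed by this diff:

name: my-model                  # illustrative name
backend: llama-cpp
parameters:
  model: my-model.Q4_K_M.gguf   # illustrative file
mirostat: 2        # assumed field name; re-enables mirostat v2 sampling
mirostat_tau: 5.0  # assumed field name; mirrors defaultMirostatTAU above
mirostat_eta: 0.1  # assumed field name; mirrors defaultMirostatETA above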


@@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
         Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
     })
-
-    It("runs openllama(llama-ggml backend)", Label("llama"), func() {
-        if runtime.GOOS != "linux" {
-            Skip("test supported only on linux")
-        }
-        response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-            URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
-            Name:      "openllama_3b",
-            Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
-        })
-        Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-        uuid := response["uuid"].(string)
-
-        Eventually(func() bool {
-            response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-            return response["processed"].(bool)
-        }, "360s", "10s").Should(Equal(true))
-
-        By("testing completion")
-        resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-        Expect(err).ToNot(HaveOccurred())
-        Expect(len(resp.Choices)).To(Equal(1))
-        Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-
-        By("testing functions")
-        resp2, err := client.CreateChatCompletion(
-            context.TODO(),
-            openai.ChatCompletionRequest{
-                Model: "openllama_3b",
-                Messages: []openai.ChatCompletionMessage{
-                    {
-                        Role:    "user",
-                        Content: "What is the weather like in San Francisco (celsius)?",
-                    },
-                },
-                Functions: []openai.FunctionDefinition{
-                    openai.FunctionDefinition{
-                        Name:        "get_current_weather",
-                        Description: "Get the current weather",
-                        Parameters: jsonschema.Definition{
-                            Type: jsonschema.Object,
-                            Properties: map[string]jsonschema.Definition{
-                                "location": {
-                                    Type:        jsonschema.String,
-                                    Description: "The city and state, e.g. San Francisco, CA",
-                                },
-                                "unit": {
-                                    Type: jsonschema.String,
-                                    Enum: []string{"celcius", "fahrenheit"},
-                                },
-                            },
-                            Required: []string{"location"},
-                        },
-                    },
-                },
-            })
-        Expect(err).ToNot(HaveOccurred())
-        Expect(len(resp2.Choices)).To(Equal(1))
-        Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-        Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-        var res map[string]string
-        err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-        Expect(err).ToNot(HaveOccurred())
-        Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-        Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-        Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-    })
-
     It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
         if runtime.GOOS != "linux" {
             Skip("test supported only on linux")


@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 {{% alert note %}}
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 {{% /alert %}}
@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
-```
-
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
+  model: file.gguf
 ```

 #### Reference

 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)

 ### exllama/2
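Complementing the explicit `backend: llama` example in the documentation hunk above, a minimal sketch of a `gguf` model config that omits the backend field and relies on automatic backend detection instead; the file name is illustrative and the auto-detection behaviour is assumed from the surrounding docs, not spelled out in this hunk:

name: my-gguf-model
parameters:
  # Relative to the models path
  model: my-model.Q4_K_M.gguf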


@@ -553,6 +553,29 @@
     - filename: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
       sha256: d8d4874b837993546b750db3faf1c6e5d867883a6750f04f1f4986973d7c107b
       uri: huggingface://bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF/Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
+- !!merge <<: *llama33
+  name: "nohobby_l3.3-prikol-70b-v0.5"
+  icon: https://files.catbox.moe/x9t3zo.png
+  urls:
+    - https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.5
+    - https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF
+  description: |
+    99% of mergekit addicts quit before they hit it big.
+    Gosh, I need to create an org for my test runs - my profile looks like a dumpster.
+    What was it again? Ah, the new model.
+    Exactly what I wanted. All I had to do was yank out the cursed official DeepSeek distill and here we are.
+    From the brief tests it gave me some unusual takes on the character cards I'm used to. Just this makes it worth it imo. Also the writing is kinda nice.
+  overrides:
+    parameters:
+      model: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
+  files:
+    - filename: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
+      sha256: 36f29015f1f420f51569603445a3ea5fe72e3651c2022ef064086f5617578fe6
+      uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF/Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
 - &rwkv
   url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
   name: "rwkv-6-world-7b"
@@ -6890,6 +6913,60 @@
     - filename: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
       sha256: 03aa6d1fb7ab70482a2242839b8d8e1c789aa90a8be415076ddf84bef65f06c7
       uri: huggingface://bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF/krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "cognitivecomputations_dolphin3.0-r1-mistral-24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/hdAvdwZiJaLbGmvSZ3wTT.png
+  urls:
+    - https://huggingface.co/cognitivecomputations/Dolphin3.0-R1-Mistral-24B
+    - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF
+  description: |
+    Dolphin 3.0 R1 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
+  overrides:
+    parameters:
+      model: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+  files:
+    - filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+      sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c
+      uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "cognitivecomputations_dolphin3.0-mistral-24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/cNCs1TBD3FelWCJGkZ3cd.png
+  urls:
+    - https://huggingface.co/cognitivecomputations/Dolphin3.0-Mistral-24B
+    - https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF
+  description: |
+    Dolphin 3.0 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
+  overrides:
+    parameters:
+      model: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+  files:
+    - filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+      sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994
+      uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
+- !!merge <<: *mistral03
+  name: "sicariussicariistuff_redemption_wind_24b"
+  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
+  icon: https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B/resolve/main/Images/Redemption_Wind_24B.png
+  urls:
+    - https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B
+    - https://huggingface.co/bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF
+  description: |
+    This is a lightly fine-tuned version of the Mistral 24B base model, designed as an accessible and adaptable foundation for further fine-tuning and merging fodder. Key modifications include:
+    ChatML-ified, with no additional tokens introduced.
+    High quality private instruct—not generated by ChatGPT or Claude, ensuring no slop and good markdown understanding.
+    No refusals—since its a base model, refusals should be minimal to non-existent, though, in early testing, occasional warnings still appear (I assume some were baked into the pre-train).
+    High-quality private creative writing dataset Mainly to dilute baked-in slop further, but it can actually write some stories, not bad for loss ~8.
+    Small, high-quality private RP dataset This was done so further tuning for RP will be easier. The dataset was kept small and contains ZERO SLOP, some entries are of 16k token length.
+    Exceptional adherence to character cards This was done to make it easier for further tunes intended for roleplay.
+  overrides:
+    parameters:
+      model: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
+  files:
+    - filename: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
+      sha256: 40025eb00d83c9e9393555962962a2dfc5251fe7bd70812835ff0bcc55ecc463
+      uri: huggingface://bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF/SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
 - &mudler
   url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
   name: "LocalAI-llama3-8b-function-call-v0.2"


@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"

 const (
-    LlamaGGML = "llama-ggml"
-
     LLamaCPP = "llama-cpp"

     LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
     // sets a priority list - first has more priority
     priorityList := []string{
-        // First llama.cpp(variants) and llama-ggml to follow.
+        // First llama.cpp(variants)
         // We keep the fallback to prevent that if the llama.cpp variants
        // that depends on shared libs if breaks have still a safety net.
-        LLamaCPP, LlamaGGML, LLamaCPPFallback,
+        LLamaCPP, LLamaCPPFallback,
     }
     toTheEnd := []string{