From 695935c1841560fc7b2b1989576f8dc7fc1cb0e4 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 6 Feb 2025 16:44:31 +0100
Subject: [PATCH] chore(llama-ggml): drop deprecated backend

The GGML (pre-GGUF) format is now dead. The next version of LocalAI
already introduces many breaking changes, so we take the occasion to
drop support for it as well.

Signed-off-by: Ettore Di Giacinto
---
 Makefile                                      |  38 +---
 backend/go/llm/llama-ggml/llama.go            | 204 ------------------
 backend/go/llm/llama-ggml/main.go             |  19 --
 core/http/app_test.go                         |  71 ------
 docs/content/docs/features/text-generation.md |  17 +-
 pkg/model/initializers.go                     |   6 +-
 6 files changed, 7 insertions(+), 348 deletions(-)
 delete mode 100644 backend/go/llm/llama-ggml/llama.go
 delete mode 100644 backend/go/llm/llama-ggml/main.go

diff --git a/Makefile b/Makefile
index 7edb6f6a..790c6e6d 100644
--- a/Makefile
+++ b/Makefile
@@ -6,8 +6,6 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
 
 # whisper.cpp version
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
-	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif
 
 all: help
 
-## go-llama.cpp
-sources/go-llama.cpp:
-	mkdir -p sources/go-llama.cpp
-	cd sources/go-llama.cpp && \
-	git init && \
-	git remote add origin $(GOLLAMA_REPO) && \
-	git fetch origin && \
-	git checkout $(GOLLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
-	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
-	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-	$(MAKE) test-llama
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 
-test-llama: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
-
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 
-backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/llama-ggml
-endif
-
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
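The `test` target above now filters out the dropped `llama` label and keeps only the GGUF path. A minimal sketch of how to exercise that path directly; the target and variable names are the ones from the Makefile above, and running from the repository root is assumed:

```sh
# Build the test prerequisites, then run only the GGUF-labelled suite once.
# `prepare-test`, `test-llama-gguf` and TEST_FLAKES come from the Makefile above.
make prepare-test
make test-llama-gguf TEST_FLAKES=1
```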
diff --git a/backend/go/llm/llama-ggml/llama.go b/backend/go/llm/llama-ggml/llama.go
deleted file mode 100644
index 1a7add69..00000000
--- a/backend/go/llm/llama-ggml/llama.go
+++ /dev/null
@@ -1,204 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/go-skynet/go-llama.cpp"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type LLM struct {
-	base.SingleThread
-
-	llama *llama.LLama
-}
-
-func (llm *LLM) Load(opts *pb.ModelOptions) error {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-
-	llamaOpts := []llama.ModelOption{
-		llama.WithRopeFreqBase(ropeFreqBase),
-		llama.WithRopeFreqScale(ropeFreqScale),
-	}
-
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
-	if opts.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
-	}
-	if opts.F16Memory {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if opts.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-	if opts.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
-	if opts.NBatch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	if opts.NUMA {
-		llamaOpts = append(llamaOpts, llama.EnableNUMA)
-	}
-
-	if opts.LowVRAM {
-		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
-	}
-
-	model, err := llama.New(opts.ModelFile, llamaOpts...)
-	llm.llama = model
-
-	return err
-}
-
-func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
-	ropeFreqBase := float32(10000)
-	ropeFreqScale := float32(1)
-
-	if opts.RopeFreqBase != 0 {
-		ropeFreqBase = opts.RopeFreqBase
-	}
-	if opts.RopeFreqScale != 0 {
-		ropeFreqScale = opts.RopeFreqScale
-	}
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(opts.Temperature),
-		llama.SetTopP(opts.TopP),
-		llama.SetTopK(int(opts.TopK)),
-		llama.SetTokens(int(opts.Tokens)),
-		llama.SetThreads(int(opts.Threads)),
-		llama.WithGrammar(opts.Grammar),
-		llama.SetRopeFreqBase(ropeFreqBase),
-		llama.SetRopeFreqScale(ropeFreqScale),
-		llama.SetNegativePromptScale(opts.NegativePromptScale),
-		llama.SetNegativePrompt(opts.NegativePrompt),
-	}
-
-	if opts.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if opts.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	// Expected absolute path
-	if opts.PromptCachePath != "" {
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
-	}
-
-	if opts.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
-	}
-
-	if opts.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
-	}
-
-	if opts.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
-	}
-
-	if opts.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
-
-	if opts.PresencePenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
-	}
-
-	if opts.NKeep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
-	}
-
-	if opts.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
-	}
-
-	if opts.F16KV {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if opts.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if opts.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
-	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
-	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
-	return predictOptions
-}
-
-func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
-	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
-}
-
-func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
-	predictOptions := buildPredictOptions(opts)
-
-	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
-		results <- token
-		return true
-	}))
-
-	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
-		if err != nil {
-			fmt.Println("err: ", err)
-		}
-		close(results)
-	}()
-
-	return nil
-}
-
-func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
-	predictOptions := buildPredictOptions(opts)
-
-	if len(opts.EmbeddingTokens) > 0 {
-		tokens := []int{}
-		for _, t := range opts.EmbeddingTokens {
-			tokens = append(tokens, int(t))
-		}
-		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
-	}
-
-	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
-}
diff --git a/backend/go/llm/llama-ggml/main.go b/backend/go/llm/llama-ggml/main.go
deleted file mode 100644
index 544771db..00000000
--- a/backend/go/llm/llama-ggml/main.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package main
-
-import (
-	"flag"
-
-	grpc "github.com/mudler/LocalAI/pkg/grpc"
-)
-
-var (
-	addr = flag.String("addr", "localhost:50051", "the address to connect to")
-)
-
-func main() {
-	flag.Parse()
-
-	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
-		panic(err)
-	}
-}
diff --git a/core/http/app_test.go b/core/http/app_test.go
index ca7a2eaa..ecaf6da3 100644
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
 			Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 		})
 
-		It("runs openllama(llama-ggml backend)", Label("llama"), func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
-				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
-			})
-
-			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-			uuid := response["uuid"].(string)
-
-			Eventually(func() bool {
-				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-				return response["processed"].(bool)
-			}, "360s", "10s").Should(Equal(true))
-
-			By("testing completion")
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-
-			By("testing functions")
-			resp2, err := client.CreateChatCompletion(
-				context.TODO(),
-				openai.ChatCompletionRequest{
-					Model: "openllama_3b",
-					Messages: []openai.ChatCompletionMessage{
-						{
-							Role:    "user",
-							Content: "What is the weather like in San Francisco (celsius)?",
-						},
-					},
-					Functions: []openai.FunctionDefinition{
-						openai.FunctionDefinition{
-							Name:        "get_current_weather",
-							Description: "Get the current weather",
-							Parameters: jsonschema.Definition{
-								Type: jsonschema.Object,
-								Properties: map[string]jsonschema.Definition{
-									"location": {
-										Type:        jsonschema.String,
-										Description: "The city and state, e.g. San Francisco, CA",
-									},
-									"unit": {
-										Type: jsonschema.String,
-										Enum: []string{"celcius", "fahrenheit"},
-									},
-								},
-								Required: []string{"location"},
-							},
-						},
-					},
-				})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp2.Choices)).To(Equal(1))
-			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
-			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
-
-			var res map[string]string
-			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
-			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
-			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
-
-		})
-
 		It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
diff --git a/docs/content/docs/features/text-generation.md b/docs/content/docs/features/text-generation.md
index 11ab3999..342b8e76 100644
--- a/docs/content/docs/features/text-generation.md
+++ b/docs/content/docs/features/text-generation.md
@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 
 {{% alert note %}}
 
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 
 {{% /alert %}}
 
@@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
   # Relative to the models path
-  model: file.gguf.bin
-```
-
-In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
-
-For instance, to use the `llama-ggml` backend for `ggml` models:
-
-```yaml
-name: llama
-backend: llama-ggml
-parameters:
-  # Relative to the models path
-  model: file.ggml.bin
+  model: file.gguf
 ```
 
 #### Reference
 
 - [llama](https://github.com/ggerganov/llama.cpp)
-- [binding](https://github.com/go-skynet/go-llama.cpp)
 
 ### exllama/2
 
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ace72fa3..5e465cf0 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 
 const (
-	LlamaGGML = "llama-ggml"
-
 	LLamaCPP = "llama-cpp"
 
 	LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
 
 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}
 
 	toTheEnd := []string{
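The `initializers.go` hunk above only removes the `llama-ggml` entry from the priority list that decides the order in which auto-detected backends are tried; the ordering logic itself is untouched. A minimal, illustrative sketch of that ordering idea (not the actual LocalAI implementation; the helper and the sample backend names below are made up for the example):

```go
package main

import "fmt"

// orderBackends returns the available backends with the entries from the
// priority list first, in priority order, followed by everything else in
// their original order.
func orderBackends(available, priority []string) []string {
	seen := map[string]bool{}
	ordered := []string{}
	for _, p := range priority {
		for _, b := range available {
			if b == p && !seen[b] {
				ordered = append(ordered, b)
				seen[b] = true
			}
		}
	}
	for _, b := range available {
		if !seen[b] {
			ordered = append(ordered, b)
			seen[b] = true
		}
	}
	return ordered
}

func main() {
	available := []string{"whisper", "llama-cpp-fallback", "llama-cpp", "bark-cpp"}
	// After this patch the priority list no longer contains "llama-ggml".
	priority := []string{"llama-cpp", "llama-cpp-fallback"}
	fmt.Println(orderBackends(available, priority))
	// Output: [llama-cpp llama-cpp-fallback whisper bark-cpp]
}
```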