chore(llama-ggml): drop deprecated backend

The GGML format is now dead, since in the next version of LocalAI we already bring many breaking compatibility changes, taking the occasion also to drop ggml support (pre-gguf). Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-06-27 05:04:59 +00:00 · 2025-02-06 16:44:31 +01:00 · 2025-02-06 16:44:31 +01:00 · 695935c184
commit 695935c184
parent 8d45670e41
6 changed files with 7 additions and 348 deletions
--- a/38
+++ b/38
@ -6,8 +6,6 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=d774ab3acc4fee41fbed6dbfc192b57d5f79f34b
 # whisper.cpp version
@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@ -222,19 +218,6 @@ endif
 all: help
 ## go-llama.cpp
 sources/go-llama.cpp:
 	mkdir -p sources/go-llama.cpp
 	cd sources/go-llama.cpp && \
 	git init && \
 	git remote add origin $(GOLLAMA_REPO) && \
 	git fetch origin && \
 	git checkout $(GOLLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@ -310,19 +293,17 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@ -330,7 +311,6 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@ -434,7 +414,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@ -479,10 +458,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/llama-ggml
 endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@ -1,204 +0,0 @@
 package main
 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/go-skynet/go-llama.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 type LLM struct {
 	base.SingleThread
 	llama *llama.LLama
 }
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)
 	if opts.RopeFreqBase != 0 {
 		ropeFreqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		ropeFreqScale = opts.RopeFreqScale
 	}
 	llamaOpts := []llama.ModelOption{
 		llama.WithRopeFreqBase(ropeFreqBase),
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}
 	if opts.NGQA != 0 {
 		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
 	}
 	if opts.RMSNormEps != 0 {
 		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
 	}
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
 	if opts.F16Memory {
 		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
 	}
 	if opts.Embeddings {
 		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
 	}
 	if opts.NGPULayers != 0 {
 		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
 	}
 	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
 	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
 	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
 	if opts.NBatch != 0 {
 		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
 	} else {
 		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
 	}
 	if opts.NUMA {
 		llamaOpts = append(llamaOpts, llama.EnableNUMA)
 	}
 	if opts.LowVRAM {
 		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
 	}
 	model, err := llama.New(opts.ModelFile, llamaOpts...)
 	llm.llama = model
 	return err
 }
 func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)
 	if opts.RopeFreqBase != 0 {
 		ropeFreqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		ropeFreqScale = opts.RopeFreqScale
 	}
 	predictOptions := []llama.PredictOption{
 		llama.SetTemperature(opts.Temperature),
 		llama.SetTopP(opts.TopP),
 		llama.SetTopK(int(opts.TopK)),
 		llama.SetTokens(int(opts.Tokens)),
 		llama.SetThreads(int(opts.Threads)),
 		llama.WithGrammar(opts.Grammar),
 		llama.SetRopeFreqBase(ropeFreqBase),
 		llama.SetRopeFreqScale(ropeFreqScale),
 		llama.SetNegativePromptScale(opts.NegativePromptScale),
 		llama.SetNegativePrompt(opts.NegativePrompt),
 	}
 	if opts.PromptCacheAll {
 		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
 	}
 	if opts.PromptCacheRO {
 		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
 	}
 	// Expected absolute path
 	if opts.PromptCachePath != "" {
 		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
 	}
 	if opts.Mirostat != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
 	}
 	if opts.MirostatETA != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
 	}
 	if opts.MirostatTAU != 0 {
 		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
 	}
 	if opts.Debug {
 		predictOptions = append(predictOptions, llama.Debug)
 	}
 	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
 	if opts.PresencePenalty != 0 {
 		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
 	}
 	if opts.NKeep != 0 {
 		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
 	}
 	if opts.Batch != 0 {
 		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
 	}
 	if opts.F16KV {
 		predictOptions = append(predictOptions, llama.EnableF16KV)
 	}
 	if opts.IgnoreEOS {
 		predictOptions = append(predictOptions, llama.IgnoreEOS)
 	}
 	if opts.Seed != 0 {
 		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
 	}
 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
 	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
 	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
 	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
 	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
 	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
 	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
 	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
 	return predictOptions
 }
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)
 	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
 		results <- token
 		return true
 	}))
 	go func() {
 		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
 		if err != nil {
 			fmt.Println("err: ", err)
 		}
 		close(results)
 	}()
 	return nil
 }
 func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	predictOptions := buildPredictOptions(opts)
 	if len(opts.EmbeddingTokens) > 0 {
 		tokens := []int{}
 		for _, t := range opts.EmbeddingTokens {
 			tokens = append(tokens, int(t))
 		}
 		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
 	}
 	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
 }
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@ -1,19 +0,0 @@
 package main
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 func main() {
 	flag.Parse()
 	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@ -526,77 +526,6 @@ var _ = Describe("API test", func() {
 				Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 			})
 			It("runs openllama(llama-ggml backend)", Label("llama"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
 					Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
 				})
 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
 				uuid := response["uuid"].(string)
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))
 				By("testing completion")
 				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp.Choices)).To(Equal(1))
 				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
 				By("testing functions")
 				resp2, err := client.CreateChatCompletion(
 					context.TODO(),
 					openai.ChatCompletionRequest{
 						Model: "openllama_3b",
 						Messages: []openai.ChatCompletionMessage{
 							{
 								Role:    "user",
 								Content: "What is the weather like in San Francisco (celsius)?",
 							},
 						},
 						Functions: []openai.FunctionDefinition{
 							openai.FunctionDefinition{
 								Name:        "get_current_weather",
 								Description: "Get the current weather",
 								Parameters: jsonschema.Definition{
 									Type: jsonschema.Object,
 									Properties: map[string]jsonschema.Definition{
 										"location": {
 											Type:        jsonschema.String,
 											Description: "The city and state, e.g. San Francisco, CA",
 										},
 										"unit": {
 											Type: jsonschema.String,
 											Enum: []string{"celcius", "fahrenheit"},
 										},
 									},
 									Required: []string{"location"},
 								},
 							},
 						},
 					})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp2.Choices)).To(Equal(1))
 				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
 				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
 				Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})
 			It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
--- a/docs/content/docs/features/text-generation.md
+++ b/docs/content/docs/features/text-generation.md
@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
 {{% alert note %}}
-The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
+The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
 {{% /alert %}}
@ -175,25 +175,12 @@ name: llama
 backend: llama
 parameters:
  # Relative to the models path
-  model: file.gguf.bin
+  model: file.gguf
 ```
 In the example above we specify `llama` as the backend to restrict loading `gguf` models only. 
 For instance, to use the `llama-ggml` backend for `ggml` models:
 ```yaml
 name: llama
 backend: llama-ggml
 parameters:
  # Relative to the models path
  model: file.ggml.bin
 ```
 #### Reference
 - [llama](https://github.com/ggerganov/llama.cpp)
 - [binding](https://github.com/go-skynet/go-llama.cpp)
 ### exllama/2
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
 var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
 const (
 	LlamaGGML = "llama-ggml"
 	LLamaCPP = "llama-cpp"
 	LLamaCPPAVX2     = "llama-cpp-avx2"
@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
 	// sets a priority list - first has more priority
 	priorityList := []string{
-		// First llama.cpp(variants) and llama-ggml to follow.
+		// First llama.cpp(variants)
 		// We keep the fallback to prevent that if the llama.cpp variants
 		// that depends on shared libs if breaks have still a safety net.
-		LLamaCPP, LlamaGGML, LLamaCPPFallback,
+		LLamaCPP, LLamaCPPFallback,
 	}
 	toTheEnd := []string{