From 1120847f7284c228b477feb36b99f9055984c2b4 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 24 Aug 2023 01:18:58 +0200
Subject: [PATCH] feat: bump llama.cpp, add gguf support (#943)

**Description**

This PR syncs up the `llama` backend to use `gguf`
(https://github.com/go-skynet/go-llama.cpp/pull/180). It also adds a
`llama-stable` backend to the build targets so we can still load ggml models.
It adapts the current tests to use the `llama-stable` backend for ggml and
uses a `gguf` model to run tests against the new backend.

To consume the new version of go-llama.cpp, it also bumps Go to 1.21
(images, pipelines, etc.).

---------

Signed-off-by: Ettore Di Giacinto
---
 .github/workflows/release.yaml |  6 +++
 .github/workflows/test.yml     |  4 +-
 Dockerfile                     |  2 +-
 Makefile                       | 11 ++++-
 api/api_test.go                | 72 +++++++++++++++++++++++++++++++++-
 go.mod                         |  2 +-
 pkg/backend/llm/llama/llama.go |  8 ----
 7 files changed, 89 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 3a5dcf23..4a0e83e0 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -22,6 +22,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Dependencies
         run: |
           sudo apt-get update
@@ -60,6 +63,9 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
       - name: Build
         id: build
         env:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7236c4b2..80bb7ab6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3
@@ -63,7 +63,7 @@ jobs:
     runs-on: macOS-latest
     strategy:
       matrix:
-        go-version: ['1.20.x', 'stable']
+        go-version: ['1.21.x']
     steps:
       - name: Clone
         uses: actions/checkout@v3
diff --git a/Dockerfile b/Dockerfile
index cc4c066c..ed54d1c9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG GO_VERSION=1.20-bullseye
+ARG GO_VERSION=1.21-bullseye
 
 FROM golang:$GO_VERSION as requirements
diff --git a/Makefile b/Makefile
index 97fcb3c4..91b10d74 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
 
 # llama.cpp versions
-GOLLAMA_VERSION?=f03869d188b72c8a617bea3a36cf8eb43f73445c
+GOLLAMA_VERSION?=0ef04cde78e5da41de234832d73bb768ced709e7
 
 GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
 
@@ -103,7 +103,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 	OPTIONAL_GRPC+=backend-assets/grpc/piper
 endif
 
-GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/falcon backend-assets/grpc/bloomz backend-assets/grpc/llama backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
 
 .PHONY: all test build vendor
 
@@ -302,9 +302,10 @@ test: prepare test-models/testmodel grpcs
 	export GO_TAGS="tts stablediffusion"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/huggingface.py TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama" --flake-attempts 5 -v -r ./api ./pkg
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
 
@@ -316,6 +317,10 @@ test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
 
+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
diff --git a/api/api_test.go b/api/api_test.go
index ef1e980b..c36b692b 100644
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -296,7 +296,7 @@ var _ = Describe("API test", func() {
 			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 				URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 				Name:      "openllama_3b",
-				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
 			})
 
 			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -359,6 +359,76 @@ var _ = Describe("API test", func() {
 			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 		})
 
+		It("runs openllama gguf", Label("llama-gguf"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				URL:       "github:go-skynet/model-gallery/openllama-3b-gguf.yaml",
+				Name:      "openllama_3b_gguf",
+				Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+			})
+
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				return response["processed"].(bool)
+			}, "360s", "10s").Should(Equal(true))
+
+			By("testing completion")
+			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b_gguf", Prompt: "Count up to five: one, two, three, four, "})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+			By("testing functions")
+			resp2, err := client.CreateChatCompletion(
+				context.TODO(),
+				openai.ChatCompletionRequest{
+					Model: "openllama_3b_gguf",
+					Messages: []openai.ChatCompletionMessage{
+						{
+							Role:    "user",
+							Content: "What is the weather like in San Francisco (celsius)?",
+						},
+					},
+					Functions: []openai.FunctionDefinition{
+						openai.FunctionDefinition{
+							Name:        "get_current_weather",
+							Description: "Get the current weather",
+							Parameters: jsonschema.Definition{
+								Type: jsonschema.Object,
+								Properties: map[string]jsonschema.Definition{
+									"location": {
+										Type:        jsonschema.String,
+										Description: "The city and state, e.g. San Francisco, CA",
+									},
+									"unit": {
+										Type: jsonschema.String,
+										Enum: []string{"celcius", "fahrenheit"},
+									},
+								},
+								Required: []string{"location"},
+							},
+						},
+					},
+				})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp2.Choices)).To(Equal(1))
+			Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+			Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+			var res map[string]string
+			err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(res["location"]).To(Equal("San Francisco, California"), fmt.Sprint(res))
+			Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+			Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+		})
+
 		It("runs gpt4all", Label("gpt4all"), func() {
 			if runtime.GOOS != "linux" {
 				Skip("test supported only on linux")
diff --git a/go.mod b/go.mod
index ab41e6a2..ab9de4d7 100644
--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/go-skynet/LocalAI
 
-go 1.20
+go 1.21
 
 require (
 	github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df
diff --git a/pkg/backend/llm/llama/llama.go b/pkg/backend/llm/llama/llama.go
index 62040233..c619f90e 100644
--- a/pkg/backend/llm/llama/llama.go
+++ b/pkg/backend/llm/llama/llama.go
@@ -32,14 +32,6 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llama.WithRopeFreqScale(ropeFreqScale),
 	}
 
-	if opts.NGQA != 0 {
-		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
-	}
-
-	if opts.RMSNormEps != 0 {
-		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
-	}
-
	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
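
Note for anyone trying the two backends locally: below is a minimal, illustrative sketch of how model configurations could be split across them after this change. The `backend`, `mmap`, `f16`, and `context_size` keys mirror the overrides used in the tests above; the model names, file names, and the list layout (as in the aggregate file passed via `CONFIG_FILE`) are assumptions for the example, not artifacts shipped by this patch.

```yaml
# Illustrative only: model and file names are placeholders.
- name: openllama-3b-gguf
  backend: llama            # llama backend now built against gguf-era go-llama.cpp
  context_size: 128
  f16: true
  mmap: true
  parameters:
    model: open-llama-3b.Q4_0.gguf
- name: openllama-3b-ggml
  backend: llama-stable     # pinned pre-gguf llama.cpp, keeps loading ggml files
  context_size: 128
  f16: true
  mmap: true
  parameters:
    model: open-llama-3b-q4_0.bin
```

With a split like this, existing ggml setups keep working by pointing `backend` at `llama-stable`, while new gguf models go through the updated `llama` backend; this is the same division that the `llama` and `llama-gguf` test labels exercise.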