diff --git a/Makefile b/Makefile
index 88a2b283..1b1dc248 100644
--- a/Makefile
+++ b/Makefile
@@ -310,7 +310,7 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 
-get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
 
 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
@@ -767,28 +767,28 @@ else
 endif
 
 # This target is for manually building a variant with-auto detected flags
-backend-assets/grpc/llama-cpp: backend-assets/grpc
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cpp
 	$(MAKE) -C backend/cpp/llama-cpp purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
 
-backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
+backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
-backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
+backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 
-backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
+backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
@@ -799,35 +799,35 @@ ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
 
-backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
+backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
-backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
+backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
-backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
 	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
 	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
 
-backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
 	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
 	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
 
-backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index aa9a9497..1cff6b8a 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2108,6 +2108,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
     data["grammar"] = predict->grammar();
     data["prompt"] = predict->prompt();
     data["ignore_eos"] = predict->ignoreeos();
+    data["embeddings"] = predict->embeddings();
 
     // for each image in the request, add the image data
     //
@@ -2385,6 +2386,31 @@ public:
         return grpc::Status::OK;
     }
 
+    /// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969
+    grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
+        json data = parse_options(false, request, llama);
+        const int task_id = llama.queue_tasks.get_new_id();
+        llama.queue_results.add_waiting_task_id(task_id);
+        llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
+        // get the result
+        task_result result = llama.queue_results.recv(task_id);
+        //std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
+        llama.queue_results.remove_waiting_task_id(task_id);
+        if (!result.error && result.stop) {
+            std::vector<float> embeddings = result.result_json.value("embedding", std::vector<float>());
+            // loop the vector and set the embeddings results
+            for (int i = 0; i < embeddings.size(); i++) {
+                embeddingResult->add_embeddings(embeddings[i]);
+            }
+        }
+        else
+        {
+            return grpc::Status::OK;
+        }
+
+        return grpc::Status::OK;
+    }
 };
 
 void RunServer(const std::string& server_address) {
diff --git a/core/backend/options.go b/core/backend/options.go
index e6ce87eb..d986b8e6 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -91,7 +91,7 @@ func gRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		Type:          c.ModelType,
 		RopeFreqScale: c.RopeFreqScale,
 		NUMA:          c.NUMA,
-		Embeddings:    c.Embeddings,
+		Embeddings:    *c.Embeddings,
 		LowVRAM:       *c.LowVRAM,
 		NGPULayers:    int32(*c.NGPULayers),
 		MMap:          *c.MMap,
diff --git a/core/config/backend_config.go b/core/config/backend_config.go
index 1e647ceb..383686cd 100644
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -32,7 +32,7 @@ type BackendConfig struct {
 	Threads        *int              `yaml:"threads"`
 	Debug          *bool             `yaml:"debug"`
 	Roles          map[string]string `yaml:"roles"`
-	Embeddings     bool              `yaml:"embeddings"`
+	Embeddings     *bool             `yaml:"embeddings"`
 	Backend        string            `yaml:"backend"`
 	TemplateConfig TemplateConfig    `yaml:"template"`
 
@@ -338,6 +338,10 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.LowVRAM = &falseV
 	}
 
+	if cfg.Embeddings == nil {
+		cfg.Embeddings = &falseV
+	}
+
 	// Value passed by the top level are treated as default (no implicit defaults)
 	// defaults are set by the user
 	if ctx == 0 {
diff --git a/docs/content/docs/advanced/advanced-usage.md b/docs/content/docs/advanced/advanced-usage.md
index 4bbd8a30..a5f47865 100644
--- a/docs/content/docs/advanced/advanced-usage.md
+++ b/docs/content/docs/advanced/advanced-usage.md
@@ -112,6 +112,8 @@ name: "" # Model name, used to identify the model in API calls.
 # Precision settings for the model, reducing precision can enhance performance on some hardware.
 f16: null # Whether to use 16-bit floating-point precision.
 
+embeddings: true # Enable embeddings for the model.
+
 # Concurrency settings for the application.
 threads: null # Number of threads to use for processing.
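
Taken together, these changes plumb the `embeddings` flag from the YAML model configuration (now a `*bool` on `BackendConfig`, defaulted to `false` in `SetDefaults`) through `gRPCModelOpts` and into the llama.cpp gRPC backend's new `Embedding` handler. As a minimal sketch, a model config that opts into the feature could look like the following; the model name and file are illustrative placeholders, not part of this patch:

```yaml
# Hypothetical model config, for illustration only: `name` and `parameters.model`
# are placeholders, while `embeddings` is the option documented in this patch.
name: my-embedding-model   # placeholder model name
backend: llama-cpp         # one of the llama-cpp variants built above
embeddings: true           # enable embeddings for this model
parameters:
  model: my-model.gguf     # placeholder model file
```

When `embeddings` is omitted, `SetDefaults` pins the pointer to `false`, so the dereference in `gRPCModelOpts` (`*c.Embeddings`) remains safe.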