feat(llama.cpp): support embeddings endpoints (#2871)

* feat(llama.cpp): add embeddings

Also enable embeddings by default for llama.cpp models

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(Makefile): prepare llama.cpp sources only once

Otherwise we keep cloning llama.cpp for each of the variants

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* do not set embeddings to false

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs: add embeddings to the YAML config reference

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-07-15 22:54:16 +02:00 committed by GitHub
parent 6564e7ea01
commit 35561edb6e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 44 additions and 12 deletions

View file

@ -2108,6 +2108,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["grammar"] = predict->grammar();
data["prompt"] = predict->prompt();
data["ignore_eos"] = predict->ignoreeos();
data["embeddings"] = predict->embeddings();
// for each image in the request, add the image data
//
@ -2385,6 +2386,31 @@ public:
return grpc::Status::OK;
}
/// https://github.com/ggerganov/llama.cpp/blob/aa2341298924ac89778252015efcb792f2df1e20/examples/server/server.cpp#L2969
grpc::Status Embedding(ServerContext* context, const backend::PredictOptions* request, backend::EmbeddingResult* embeddingResult) {
json data = parse_options(false, request, llama);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
// get the result
task_result result = llama.queue_results.recv(task_id);
//std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
llama.queue_results.remove_waiting_task_id(task_id);
if (!result.error && result.stop) {
std::vector<float> embeddings = result.result_json.value("embedding", std::vector<float>());
// loop the vector and set the embeddings results
for (int i = 0; i < embeddings.size(); i++) {
embeddingResult->add_embeddings(embeddings[i]);
}
}
else
{
return grpc::Status::OK;
}
return grpc::Status::OK;
}
};
void RunServer(const std::string& server_address) {