wip

parent cf0d23828e
commit 8b3c083c97

1 changed file with 188 additions and 185 deletions
@@ -21,6 +21,10 @@
 #include "backend.grpc.pb.h"
 
 // include std::regex
+#include <cstddef>
+#include <thread>
+#include <mutex>
+#include <chrono>
 #include <regex>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
 #include <grpcpp/grpcpp.h>
@@ -1779,22 +1783,6 @@ static json format_detokenized_response(std::string content)
 }
 
 
-static void log_server_request(const httplib::Request &req, const httplib::Response &res)
-{
-    LOG_INFO("request", {
-                            {"remote_addr", req.remote_addr},
-                            {"remote_port", req.remote_port},
-                            {"status", res.status},
-                            {"method", req.method},
-                            {"path", req.path},
-                            {"params", req.params},
-                        });
-
-    LOG_VERBOSE("request", {
-                               {"request", req.body},
-                               {"response", res.body},
-                           });
-}
 
 struct token_translator
 {
@@ -1823,72 +1811,131 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 ////////////////////////////////
 //////// LOCALAI
 
-static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
+json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
 {
-    // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
-    gpt_params default_params;
-
-    llama.stream = streaming;
-    llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
-    llama.params.sparams.top_k = predict->topk();
-    llama.params.sparams.top_p = predict->topp();
-    llama.params.sparams.tfs_z = predict->tailfreesamplingz();
-    llama.params.sparams.typical_p = predict->typicalp();
-    llama.params.sparams.penalty_last_n = predict->repeat();
-    llama.params.sparams.temp = predict->temperature();
-    llama.params.sparams.penalty_repeat = predict->penalty();
-    llama.params.sparams.penalty_present = predict->presencepenalty();
-    llama.params.sparams.penalty_freq = predict->frequencypenalty();
-    llama.params.sparams.mirostat = predict->mirostat();
-    llama.params.sparams.mirostat_tau = predict->mirostattau();
-    llama.params.sparams.mirostat_eta = predict->mirostateta();
-    llama.params.sparams.penalize_nl = predict->penalizenl();
-    llama.params.n_keep = predict->nkeep();
-    llama.params.seed = predict->seed();
-    llama.params.sparams.grammar = predict->grammar();
-    // llama.params.n_probs = predict->
-    llama.params.prompt = predict->prompt();
-
-    llama.params.sparams.logit_bias.clear();
-
-    if (predict->ignoreeos())
-    {
-        llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
-    }
-
-    // const auto &logit_bias = body.find("logit_bias");
-    // if (logit_bias != body.end() && logit_bias->is_array())
-    // {
-    //     const int n_vocab = llama_n_vocab(llama.model);
-    //     for (const auto &el : *logit_bias)
-    //     {
-    //         if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
-    //         {
-    //             llama_token tok = el[0].get<llama_token>();
-    //             if (tok >= 0 && tok < n_vocab)
-    //             {
-    //                 if (el[1].is_number())
-    //                 {
-    //                     llama.params.logit_bias[tok] = el[1].get<float>();
-    //                 }
-    //                 else if (el[1].is_boolean() && !el[1].get<bool>())
-    //                 {
-    //                     llama.params.logit_bias[tok] = -INFINITY;
-    //                 }
-    //             }
-    //         }
-    //     }
-    // }
-
-    llama.params.antiprompt.clear();
-    for (const std::string& stopPrompt : predict->stopprompts()) {
-        if (!stopPrompt.empty())
-        {
-            llama.params.antiprompt.push_back(stopPrompt);
-        }
-    }
+    // This is for example a slot data from the json data
+    // slot->params.stream = json_value(data, "stream", false);
+    // slot->params.cache_prompt = json_value(data, "cache_prompt", false);
+    // slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+    // slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
+    // slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
+    // slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
+    // slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
+    // slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
+    // slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
+    // slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
+    // slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
+    // slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
+    // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
+    // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
+    // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
+    // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
+    // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
+    // slot->params.seed = json_value(data, "seed", default_params.seed);
+    // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+    // slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+
+    // Create now a json data from the prediction options instead
+    //
+    json data;
+    data["stream"] = streaming;
+    data["cache_prompt"] = predict->promptcacheall();
+    data["n_predict"] = predict->tokens() == 0 ? -1 : predict->tokens();
+    data["top_k"] = predict->topk();
+    data["top_p"] = predict->topp();
+    data["tfs_z"] = predict->tailfreesamplingz();
+    data["typical_p"] = predict->typicalp();
+    data["temperature"] = predict->temperature();
+    data["repeat_last_n"] = predict->repeat();
+    data["repeat_penalty"] = predict->penalty();
+    data["frequency_penalty"] = predict->frequencypenalty();
+    data["presence_penalty"] = predict->presencepenalty();
+    data["mirostat"] = predict->mirostat();
+    data["mirostat_tau"] = predict->mirostattau();
+    data["mirostat_eta"] = predict->mirostateta();
+    data["penalize_nl"] = predict->penalizenl();
+    data["n_keep"] = predict->nkeep();
+    data["seed"] = predict->seed();
+    data["grammar"] = predict->grammar();
+    data["prompt"] = predict->prompt();
+    data["ignore_eos"] = predict->ignoreeos();
+
+    data["stop"] = predict->stopprompts();
+    // data["n_probs"] = predict->nprobs();
+    //TODO: images,
+
+    return data;
 }
 
+// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
+// {
+//     // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
+//     gpt_params default_params;
+
+//     llama.stream = streaming;
+//     llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
+//     llama.params.sparams.top_k = predict->topk();
+//     llama.params.sparams.top_p = predict->topp();
+//     llama.params.sparams.tfs_z = predict->tailfreesamplingz();
+//     llama.params.sparams.typical_p = predict->typicalp();
+//     llama.params.sparams.penalty_last_n = predict->repeat();
+//     llama.params.sparams.temp = predict->temperature();
+//     llama.params.sparams.penalty_repeat = predict->penalty();
+//     llama.params.sparams.penalty_present = predict->presencepenalty();
+//     llama.params.sparams.penalty_freq = predict->frequencypenalty();
+//     llama.params.sparams.mirostat = predict->mirostat();
+//     llama.params.sparams.mirostat_tau = predict->mirostattau();
+//     llama.params.sparams.mirostat_eta = predict->mirostateta();
+//     llama.params.sparams.penalize_nl = predict->penalizenl();
+//     llama.params.n_keep = predict->nkeep();
+//     llama.params.seed = predict->seed();
+//     llama.params.sparams.grammar = predict->grammar();
+//     // llama.params.n_probs = predict->
+//     llama.params.prompt = predict->prompt();
+
+//     llama.params.sparams.logit_bias.clear();
+
+//     if (predict->ignoreeos())
+//     {
+//         llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
+//     }
+
+//     // const auto &logit_bias = body.find("logit_bias");
+//     // if (logit_bias != body.end() && logit_bias->is_array())
+//     // {
+//     //     const int n_vocab = llama_n_vocab(llama.model);
+//     //     for (const auto &el : *logit_bias)
+//     //     {
+//     //         if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
+//     //         {
+//     //             llama_token tok = el[0].get<llama_token>();
+//     //             if (tok >= 0 && tok < n_vocab)
+//     //             {
+//     //                 if (el[1].is_number())
+//     //                 {
+//     //                     llama.params.logit_bias[tok] = el[1].get<float>();
+//     //                 }
+//     //                 else if (el[1].is_boolean() && !el[1].get<bool>())
+//     //                 {
+//     //                     llama.params.logit_bias[tok] = -INFINITY;
+//     //                 }
+//     //             }
+//     //         }
+//     //     }
+//     // }
+
+//     llama.params.antiprompt.clear();
+//     for (const std::string& stopPrompt : predict->stopprompts()) {
+//         if (!stopPrompt.empty())
+//         {
+//             llama.params.antiprompt.push_back(stopPrompt);
+//         }
+//     }
+// }
+
 
 static void params_parse(const backend::ModelOptions* request,
@@ -1904,6 +1951,7 @@ static void params_parse(const backend::ModelOptions* request,
     params.n_threads = request->threads();
     params.n_gpu_layers = request->ngpulayers();
     params.n_batch = request->nbatch();
+    params.n_parallel = 1;
     // TODO: Add yarn
 
     if (!request->tensorsplit().empty()) {
@@ -1937,12 +1985,11 @@ static void params_parse(const backend::ModelOptions* request,
     params.embedding = request->embeddings();
 }
 
+// The class has a llama instance that is shared across all RPCs
+llama_server_context llama;
 
 // GRPC Server start
 class BackendServiceImpl final : public backend::Backend::Service {
-    // The class has a llama instance that is shared across all RPCs
-    llama_server_context llama;
-
 public:
     grpc::Status Health(ServerContext* context, const backend::HealthMessage* request, backend::Reply* reply) {
         // Implement Health RPC
@@ -1970,126 +2017,61 @@ public:
         return Status::OK;
     }
     grpc::Status PredictStream(grpc::ServerContext* context, const backend::PredictOptions* request, grpc::ServerWriter<backend::Reply>* writer) override {
-        // Implement the streaming logic here based on the request options
-        // You can use writer->Write(response) to send a reply to the client
-        // and return grpc::Status::OK when the operation is complete.
-        auto lock = llama.lock();
-
-        llama.rewind();
-
-        llama_reset_timings(llama.ctx);
-
-        parse_options_completion(false, request, llama);
-
-        llama.initSampling();
-        llama.loadPrompt(request->prompt());
-        llama.beginCompletion();
-        size_t sent_count = 0;
-        size_t sent_token_probs_index = 0;
-
-        while (llama.has_next_token) {
-            const completion_token_output token_with_probs = llama.doCompletion();
-            if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
-                continue;
-            }
-            const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
-
-            size_t pos = std::min(sent_count, llama.generated_text.size());
-
-            const std::string str_test = llama.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos =
-                llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
-            if (stop_pos != std::string::npos) {
-                is_stop_full = true;
-                llama.generated_text.erase(
-                    llama.generated_text.begin() + pos + stop_pos,
-                    llama.generated_text.end());
-                pos = std::min(sent_count, llama.generated_text.size());
-            } else {
-                is_stop_full = false;
-                stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
-                                                     STOP_PARTIAL);
-            }
-
-            if (
-                stop_pos == std::string::npos ||
-                // Send rest of the text if we are at the end of the generation
-                (!llama.has_next_token && !is_stop_full && stop_pos > 0)
-            ) {
-                const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
-
-                sent_count += to_send.size();
-
-                std::vector<completion_token_output> probs_output = {};
-
-                if (llama.params.sparams.n_probs > 0) {
-                    const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
-                    size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
-                    size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
-                    if (probs_pos < probs_stop_pos) {
-                        probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
-                    }
-                    sent_token_probs_index = probs_stop_pos;
-                }
+        json data = parse_options(true, request, llama);
+        const int task_id = llama.request_completion(data, false, false);
+        while (true)
+        {
+            task_result result = llama.next_result(task_id);
+            if (!result.error) {
+                const std::string str =
+                    "data: " +
+                    result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                    "\n\n";
+                LOG_VERBOSE("data stream", {
+                    { "to_send", str }
+                });
                 backend::Reply reply;
-                reply.set_message(to_send);
+                reply.set_message(str.c_str());
 
                 // Send the reply
                 writer->Write(reply);
 
+                if (result.stop) {
+                    break;
+                }
+            } else {
+                break;
             }
         }
-
-        llama_print_timings(llama.ctx);
-
-        llama.mutex.unlock();
-        lock.release();
+        return grpc::Status::OK;
+
+        // auto on_complete = [task_id, &llama] (bool)
+        // {
+        //     // cancel
+        //     llama.request_cancel(task_id);
+        // };
+
         return grpc::Status::OK;
     }
 
 
     grpc::Status Predict(ServerContext* context, const backend::PredictOptions* request, backend::Reply* reply) {
-        auto lock = llama.lock();
-        llama.rewind();
-        llama_reset_timings(llama.ctx);
-        parse_options_completion(false, request, llama);
-
-        llama.initSampling();
-        llama.loadPrompt(request->prompt());
-        llama.beginCompletion();
-
-        if (llama.params.n_beams) {
-            // Fill llama.generated_token_probs vector with final beam.
-            llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
-                              llama.n_past, llama.n_remain);
-            // Translate llama.generated_token_probs to llama.generated_text.
-            append_to_generated_text_from_generated_token_probs(llama);
-        } else {
-            size_t stop_pos = std::string::npos;
-
-            while (llama.has_next_token) {
-                const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);
-
-                stop_pos = llama.findStoppingStrings(llama.generated_text,
-                                                     token_text.size(), STOP_FULL);
+        json data = parse_options(true, request, llama);
+        const int task_id = llama.request_completion(data, false, false);
+        std::string completion_text;
+        task_result result = llama.next_result(task_id);
+        if (!result.error && result.stop) {
+            reply->set_message(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace));
+        }
+        else
+        {
+            return grpc::Status::OK;
         }
-
-            if (stop_pos == std::string::npos) {
-                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
-            }
-            if (stop_pos != std::string::npos) {
-                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
-                                           llama.generated_text.end());
-            }
-        }
-
-        auto probs = llama.generated_token_probs;
-        if (llama.params.sparams.n_probs > 0 && llama.stopped_word) {
-            const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
-            probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
-        }
-        reply->set_message(llama.generated_text);
-
         return grpc::Status::OK;
     }
 };
@@ -2129,6 +2111,27 @@ int main(int argc, char** argv) {
         }
     }
 
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]()
+            {
     RunServer(server_address);
     return 0;
+            });
+
+    {
+        bool running = true;
+        while (running)
+        {
+            running = llama.update_slots();
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+            // print state
+            std::cout << running << std::endl;
+        }
+    }
+    //);
+
+    t.join();
+
+    llama_backend_free();
+    return 0;
 }