mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-20 10:35:01 +00:00
WIP
This commit is contained in:
parent
029f97c2a2
commit
7437d0c9ca
6 changed files with 552 additions and 385 deletions
2
Makefile
2
Makefile
|
@ -6,7 +6,7 @@ BINARY_NAME=local-ai
|
|||
DETECT_LIBS?=true
|
||||
|
||||
# llama.cpp versions
|
||||
CPPLLAMA_VERSION?=de4c07f93783a1a96456a44dc16b9db538ee1618
|
||||
CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
|
||||
## This is a hack for now, but it should be fixed in the future.
|
||||
set(TARGET myclip)
|
||||
add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
|
||||
install(TARGETS ${TARGET} LIBRARY)
|
||||
target_include_directories(myclip PUBLIC .)
|
||||
target_include_directories(myclip PUBLIC ../..)
|
||||
target_include_directories(myclip PUBLIC ../../common)
|
||||
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if (NOT MSVC)
|
||||
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
# set(TARGET myclip)
|
||||
# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
|
||||
# install(TARGETS ${TARGET} LIBRARY)
|
||||
# target_include_directories(myclip PUBLIC .)
|
||||
# target_include_directories(myclip PUBLIC ../..)
|
||||
# target_include_directories(myclip PUBLIC ../../common)
|
||||
# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
# if (NOT MSVC)
|
||||
# target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
# endif()
|
||||
# END CLIP hack
|
||||
|
||||
|
||||
|
@ -75,7 +75,11 @@ add_library(hw_grpc_proto
|
|||
${hw_proto_hdrs} )
|
||||
|
||||
add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
|
||||
absl::flags_parse
|
||||
gRPC::${_REFLECTION}
|
||||
gRPC::${_GRPC_GRPCPP}
|
||||
|
|
|
@ -11,8 +11,7 @@
|
|||
#include <memory>
|
||||
#include <string>
|
||||
#include <getopt.h>
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "mtmd.h"
|
||||
#include "log.h"
|
||||
#include "stb_image.h"
|
||||
#include "common.h"
|
||||
|
@ -210,6 +209,8 @@ struct llama_client_slot
|
|||
int32_t num_prompt_tokens_processed = 0;
|
||||
|
||||
json prompt;
|
||||
json data;
|
||||
|
||||
std::string generated_text;
|
||||
llama_token sampled;
|
||||
std::vector<llama_token> cache_tokens;
|
||||
|
@ -239,7 +240,7 @@ struct llama_client_slot
|
|||
int32_t n_past_se = 0; // self-extend
|
||||
|
||||
// multimodal
|
||||
std::vector<slot_image> images;
|
||||
mtmd_context * mctx = nullptr;
|
||||
|
||||
// stats
|
||||
size_t sent_count = 0;
|
||||
|
@ -270,17 +271,6 @@ struct llama_client_slot
|
|||
n_past_se = 0;
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image & img : images)
|
||||
{
|
||||
free(img.image_embedding);
|
||||
if (img.img_data) {
|
||||
clip_image_u8_free(img.img_data);
|
||||
}
|
||||
img.prefix_prompt = "";
|
||||
}
|
||||
|
||||
images.clear();
|
||||
}
|
||||
|
||||
bool has_budget(common_params &global_params) {
|
||||
|
@ -456,6 +446,9 @@ struct llama_server_context
|
|||
llama_context *ctx = nullptr;
|
||||
const llama_vocab * vocab = nullptr;
|
||||
|
||||
// multimodal
|
||||
mtmd_context * mctx = nullptr;
|
||||
|
||||
clip_ctx *clp_ctx = nullptr;
|
||||
|
||||
common_params params;
|
||||
|
@ -494,6 +487,10 @@ struct llama_server_context
|
|||
|
||||
~llama_server_context()
|
||||
{
|
||||
if (mctx) {
|
||||
mtmd_free(mctx);
|
||||
mctx = nullptr;
|
||||
}
|
||||
if (ctx)
|
||||
{
|
||||
llama_free(ctx);
|
||||
|
@ -512,12 +509,14 @@ struct llama_server_context
|
|||
if (!params.mmproj.path.empty()) {
|
||||
multimodal = true;
|
||||
LOG_INFO("Multi Modal Mode Enabled", {});
|
||||
clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
|
||||
/* use_gpu */ has_gpu,
|
||||
/*verbosity=*/ GGML_LOG_LEVEL_INFO,
|
||||
});
|
||||
if(clp_ctx == nullptr) {
|
||||
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
mparams.use_gpu = has_gpu;
|
||||
mparams.print_timings = false;
|
||||
mparams.n_threads = params.cpuparams.n_threads;
|
||||
mparams.verbosity = GGML_LOG_LEVEL_INFO;
|
||||
mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
|
||||
if (mctx == nullptr) {
|
||||
LOG_ERR("failed to load multimodal model, '%s'\n", params.mmproj.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -579,6 +578,8 @@ struct llama_server_context
|
|||
slot.id = i;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
slot.mctx = mctx;
|
||||
//slot.cache_tokens.has_mtmd = mctx != nullptr;
|
||||
|
||||
LOG_INFO("new slot", {
|
||||
{"slot_id", slot.id},
|
||||
|
@ -616,54 +617,61 @@ struct llama_server_context
|
|||
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
std::vector<server_tokens> tokenize(json &data, const json & json_prompt, bool add_bos) const
|
||||
{
|
||||
// TODO: currently, we tokenize using special tokens by default
|
||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
||||
const bool TMP_FORCE_SPECIAL = true;
|
||||
mtmd::bitmaps bitmaps;
|
||||
std::vector<server_tokens> inputs;
|
||||
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
|
||||
if (json_prompt.is_array())
|
||||
if (mctx != nullptr)
|
||||
{
|
||||
bool first = true;
|
||||
for (const auto& p : json_prompt)
|
||||
const auto &images_data = data.find("image_data");
|
||||
if (images_data != data.end() && images_data->is_array())
|
||||
{
|
||||
if (p.is_string())
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
auto s = p.template get<std::string>();
|
||||
std::vector<llama_token> p;
|
||||
if (first)
|
||||
{
|
||||
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
first = false;
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_buffer.data(), image_buffer.size()));
|
||||
if (!bmp.ptr) {
|
||||
throw std::runtime_error("Failed to load image");
|
||||
}
|
||||
else
|
||||
{
|
||||
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
||||
}
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
prompt_tokens.push_back(p.template get<llama_token>());
|
||||
// calculate bitmap hash (for KV caching)
|
||||
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
|
||||
bmp.set_id(hash.c_str());
|
||||
bitmaps.entries.push_back(std::move(bmp));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
|
||||
|
||||
// multimodal
|
||||
std::string prompt_str = json_prompt.template get<std::string>();
|
||||
mtmd_input_text inp_txt = {
|
||||
prompt_str.c_str(),
|
||||
/* add_special */ true,
|
||||
/* parse_special */ true,
|
||||
};
|
||||
mtmd::input_chunks chunks(mtmd_input_chunks_init());
|
||||
auto bitmaps_c_ptr = bitmaps.c_ptr();
|
||||
int32_t tokenized = mtmd_tokenize(mctx,
|
||||
chunks.ptr.get(),
|
||||
&inp_txt,
|
||||
bitmaps_c_ptr.data(),
|
||||
bitmaps_c_ptr.size());
|
||||
if (tokenized != 0) {
|
||||
throw std::runtime_error("Failed to tokenize prompt");
|
||||
}
|
||||
|
||||
server_tokens tmp(chunks, true);
|
||||
inputs.push_back(std::move(tmp));
|
||||
} else {
|
||||
// non-multimodal version
|
||||
auto tokenized_prompts = tokenize_input_prompts(vocab, json_prompt, true, true);
|
||||
for (auto & p : tokenized_prompts) {
|
||||
auto tmp = server_tokens(p, mctx != nullptr);
|
||||
inputs.push_back(std::move(tmp));
|
||||
}
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
return inputs;
|
||||
}
|
||||
|
||||
llama_client_slot* get_slot(int id) {
|
||||
|
@ -716,6 +724,8 @@ struct llama_server_context
|
|||
slot->sparams.grammar_triggers = grammar_triggers;
|
||||
slot->sparams.grammar_lazy = grammar_lazy;
|
||||
|
||||
slot->data = data;
|
||||
|
||||
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
|
||||
// Might be better to reject the request with a 400 ?
|
||||
LOG_WARNING("Max tokens to predict exceeds server configuration", {
|
||||
|
@ -757,43 +767,7 @@ struct llama_server_context
|
|||
if (json_value(data, "ignore_eos", false) && has_eos_token) {
|
||||
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
|
||||
}
|
||||
/*
|
||||
slot->sparams.penalty_prompt_tokens.clear();
|
||||
slot->sparams.use_penalty_prompt_tokens = false;
|
||||
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||
if (penalty_prompt != data.end())
|
||||
{
|
||||
if (penalty_prompt->is_string())
|
||||
{
|
||||
const auto penalty_prompt_string = penalty_prompt->get<std::string>();
|
||||
auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
|
||||
slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
|
||||
if (slot->params.n_predict > 0)
|
||||
{
|
||||
slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
|
||||
}
|
||||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
else if (penalty_prompt->is_array())
|
||||
{
|
||||
const auto n_tokens = penalty_prompt->size();
|
||||
slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
|
||||
const int n_vocab = llama_n_vocab(model);
|
||||
for (const auto &penalty_token : *penalty_prompt)
|
||||
{
|
||||
if (penalty_token.is_number_integer())
|
||||
{
|
||||
const auto tok = penalty_token.get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab)
|
||||
{
|
||||
slot->sparams.penalty_prompt_tokens.push_back(tok);
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
slot->sparams.logit_bias.clear();
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
|
@ -869,79 +843,6 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
|
||||
if (multimodal)
|
||||
{
|
||||
const auto &images_data = data.find("image_data");
|
||||
if (images_data != data.end() && images_data->is_array())
|
||||
{
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
slot_image img_sl;
|
||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
|
||||
__func__,
|
||||
slot->id,
|
||||
img_sl.id
|
||||
);
|
||||
return false;
|
||||
}
|
||||
LOG_VERBOSE("image loaded", {
|
||||
{"slot_id", slot->id},
|
||||
{"img_sl_id", img_sl.id}
|
||||
});
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
// process prompt
|
||||
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
|
||||
if (slot->images.size() > 0 && !slot->prompt.is_array())
|
||||
{
|
||||
std::string prompt = slot->prompt.get<std::string>();
|
||||
size_t pos = 0, begin_prefix = 0;
|
||||
std::string pattern = "[img-";
|
||||
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
|
||||
size_t end_prefix = pos;
|
||||
pos += pattern.length();
|
||||
size_t end_pos = prompt.find(']', pos);
|
||||
if (end_pos != std::string::npos)
|
||||
{
|
||||
std::string image_id = prompt.substr(pos, end_pos - pos);
|
||||
try
|
||||
{
|
||||
int img_id = std::stoi(image_id);
|
||||
bool found = false;
|
||||
for (slot_image &img : slot->images)
|
||||
{
|
||||
if (img.id == img_id) {
|
||||
found = true;
|
||||
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
|
||||
begin_prefix = end_pos + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
LOG("ERROR: Image with id: %i, not found.\n", img_id);
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
} catch (const std::invalid_argument& e) {
|
||||
LOG("Invalid image number id in prompt\n");
|
||||
slot->images.clear();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->prompt = "";
|
||||
slot->params.input_suffix = prompt.substr(begin_prefix);
|
||||
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
|
@ -1189,26 +1090,6 @@ struct llama_server_context
|
|||
return slot.has_next_token; // continue
|
||||
}
|
||||
|
||||
bool process_images(llama_client_slot &slot) const
|
||||
{
|
||||
for (slot_image &img : slot.images)
|
||||
{
|
||||
if (!img.request_encode_image)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||
LOG("Error processing the given image");
|
||||
return false;
|
||||
}
|
||||
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
||||
return slot.images.size() > 0;
|
||||
}
|
||||
|
||||
void send_error(task_server& task, const std::string &error)
|
||||
{
|
||||
LOG("task %i - error: %s\n", task.id, error.c_str());
|
||||
|
@ -1451,74 +1332,6 @@ struct llama_server_context
|
|||
}
|
||||
}
|
||||
|
||||
// for multiple images processing
|
||||
bool ingest_images(llama_client_slot &slot, int n_batch)
|
||||
{
|
||||
int image_idx = 0;
|
||||
|
||||
while (image_idx < (int) slot.images.size())
|
||||
{
|
||||
slot_image &img = slot.images[image_idx];
|
||||
|
||||
// process prefix prompt
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
|
||||
{
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
if (llama_decode(ctx, batch_view))
|
||||
{
|
||||
LOG("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// process image with llm
|
||||
for (int i = 0; i < img.image_tokens; i += n_batch)
|
||||
{
|
||||
int n_eval = img.image_tokens - i;
|
||||
if (n_eval > n_batch)
|
||||
{
|
||||
n_eval = n_batch;
|
||||
}
|
||||
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
float * embd = img.image_embedding + i * n_embd;
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
|
||||
if (llama_decode(ctx, llava_batch.batch))
|
||||
{
|
||||
LOG("%s : failed to eval image\n", __func__);
|
||||
return false;
|
||||
}
|
||||
slot.n_past += n_eval;
|
||||
}
|
||||
image_idx++;
|
||||
|
||||
common_batch_clear(batch);
|
||||
|
||||
// append prefix of next image
|
||||
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
|
||||
slot.params.input_suffix : // no more images, then process suffix prompt
|
||||
(json)(slot.images[image_idx].prefix_prompt);
|
||||
|
||||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void request_cancel(int task_id)
|
||||
{
|
||||
task_server task;
|
||||
|
@ -1733,7 +1546,7 @@ struct llama_server_context
|
|||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
// note: infill mode allows empty prompt
|
||||
|
@ -1750,7 +1563,7 @@ struct llama_server_context
|
|||
{
|
||||
slot.state = PROCESSING;
|
||||
slot.command = NONE;
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
std::vector<server_tokens> prompt_tokens;
|
||||
slot.t_start_process_prompt = ggml_time_us();
|
||||
slot.t_start_genereration = 0;
|
||||
|
||||
|
@ -1762,8 +1575,8 @@ struct llama_server_context
|
|||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
auto prefix_tokens = tokenize(slot.params.input_prefix, false);
|
||||
auto suffix_tokens = tokenize(slot.params.input_suffix, false);
|
||||
auto prefix_tokens = tokenize(slot.data, slot.params.input_prefix, false);
|
||||
auto suffix_tokens = tokenize(slot.data, slot.params.input_suffix, false);
|
||||
|
||||
const int space_token = 29871; // TODO: this should not be hardcoded
|
||||
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
|
||||
|
@ -1779,7 +1592,7 @@ struct llama_server_context
|
|||
}
|
||||
else
|
||||
{
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
prompt_tokens = tokenize(slot.data, slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
|
@ -1892,18 +1705,36 @@ struct llama_server_context
|
|||
});
|
||||
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
|
||||
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<server_tokens> prefix_tokens = prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
|
||||
// check if we should process the image
|
||||
if (slot.n_past < slot.n_prompt_tokens
|
||||
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
|
||||
// process the image
|
||||
int32_t new_n_past;
|
||||
int32_t res = prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
|
||||
int32_t n_pos = new_n_past - slot.n_past;
|
||||
if (res != 0) {
|
||||
slot.release();
|
||||
LOG_ERR("failed to process image, res = %d\n", res);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
slot.n_past += n_pos;
|
||||
// slot.n_prompt_tokens_processed += n_pos;
|
||||
}
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", slot.n_past},
|
||||
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
|
||||
{"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
|
||||
});
|
||||
|
||||
const bool has_images = process_images(slot);
|
||||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
|
||||
int32_t ga_i = slot.ga_i;
|
||||
int32_t ga_n = slot.ga_n;
|
||||
|
@ -1923,19 +1754,6 @@ struct llama_server_context
|
|||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
{
|
||||
LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d",
|
||||
__func__,
|
||||
slot.id,
|
||||
slot.task_id
|
||||
);
|
||||
// FIXME @phymbert: to be properly tested
|
||||
// early returning without changing the slot state will block the slot for ever
|
||||
// no one at the moment is checking the return value
|
||||
return false;
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
if (batch.n_tokens > 0)
|
||||
{
|
||||
|
@ -2164,26 +1982,6 @@ static void start_llama_server() {
|
|||
json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
|
||||
{
|
||||
|
||||
// This is for example a slot data from the json data
|
||||
// slot->params.stream = json_value(data, "stream", false);
|
||||
// slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
// slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
// slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
// slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
// slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
// slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
// slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
// slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||
// slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||
// slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
||||
// slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
// slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
// slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
// slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
// slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
// slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
// slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
|
||||
// Create now a json data from the prediction options instead
|
||||
//
|
||||
json data;
|
||||
|
@ -2228,69 +2026,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
|
|||
return data;
|
||||
}
|
||||
|
||||
// static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
|
||||
// {
|
||||
// // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
|
||||
// gpt_params default_params;
|
||||
|
||||
// llama.stream = streaming;
|
||||
// llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
|
||||
// llama.params.sparams.top_k = predict->topk();
|
||||
// llama.params.sparams.top_p = predict->topp();
|
||||
// llama.params.sparams.typical_p = predict->typicalp();
|
||||
// llama.params.sparams.penalty_last_n = predict->repeat();
|
||||
// llama.params.sparams.temp = predict->temperature();
|
||||
// llama.params.sparams.penalty_repeat = predict->penalty();
|
||||
// llama.params.sparams.penalty_present = predict->presencepenalty();
|
||||
// llama.params.sparams.penalty_freq = predict->frequencypenalty();
|
||||
// llama.params.sparams.mirostat = predict->mirostat();
|
||||
// llama.params.sparams.mirostat_tau = predict->mirostattau();
|
||||
// llama.params.sparams.mirostat_eta = predict->mirostateta();
|
||||
// llama.params.n_keep = predict->nkeep();
|
||||
// llama.params.seed = predict->seed();
|
||||
// llama.params.sparams.grammar = predict->grammar();
|
||||
// // llama.params.n_probs = predict->
|
||||
// llama.params.prompt = predict->prompt();
|
||||
|
||||
// llama.params.sparams.logit_bias.clear();
|
||||
|
||||
// if (predict->ignoreeos())
|
||||
// {
|
||||
// llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
|
||||
// }
|
||||
|
||||
// // const auto &logit_bias = body.find("logit_bias");
|
||||
// // if (logit_bias != body.end() && logit_bias->is_array())
|
||||
// // {
|
||||
// // const int n_vocab = llama_n_vocab(llama.model);
|
||||
// // for (const auto &el : *logit_bias)
|
||||
// // {
|
||||
// // if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
||||
// // {
|
||||
// // llama_token tok = el[0].get<llama_token>();
|
||||
// // if (tok >= 0 && tok < n_vocab)
|
||||
// // {
|
||||
// // if (el[1].is_number())
|
||||
// // {
|
||||
// // llama.params.logit_bias[tok] = el[1].get<float>();
|
||||
// // }
|
||||
// // else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||
// // {
|
||||
// // llama.params.logit_bias[tok] = -INFINITY;
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
// // }
|
||||
|
||||
// llama.params.antiprompt.clear();
|
||||
// for (const std::string& stopPrompt : predict->stopprompts()) {
|
||||
// if (!stopPrompt.empty())
|
||||
// {
|
||||
// llama.params.antiprompt.push_back(stopPrompt);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
const std::vector<ggml_type> kv_cache_types = {
|
||||
GGML_TYPE_F32,
|
||||
|
@ -2603,10 +2338,10 @@ public:
|
|||
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
|
||||
json data = parse_options(false, request, llama);
|
||||
|
||||
std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
|
||||
std::vector<server_tokens> tokens = llama.tokenize(data, data["prompt"],false);
|
||||
|
||||
for (int i=0 ; i< tokens.size(); i++){
|
||||
response->add_tokens(tokens[i]);
|
||||
response->add_tokens(tokens[i].llama_token);
|
||||
}
|
||||
|
||||
return grpc::Status::OK;
|
||||
|
|
|
@ -20,9 +20,9 @@ fi
|
|||
|
||||
## XXX: In some versions of CMake clip wasn't being built before llama.
|
||||
## This is a hack for now, but it should be fixed in the future.
|
||||
cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
|
||||
cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
|
||||
cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
|
||||
echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
|
||||
cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
|
||||
cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
|
||||
# echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
|
||||
# cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
|
||||
# cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
|
427
backend/cpp/llama/utils.hpp
vendored
427
backend/cpp/llama/utils.hpp
vendored
|
@ -480,4 +480,431 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
|||
}
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// tokenizer and input processing utils
|
||||
//
|
||||
|
||||
static bool json_is_array_of_numbers(const json & data) {
|
||||
if (data.is_array()) {
|
||||
for (const auto & e : data) {
|
||||
if (!e.is_number_integer()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// is array having BOTH numbers & strings?
|
||||
static bool json_is_array_of_mixed_numbers_strings(const json & data) {
|
||||
bool seen_string = false;
|
||||
bool seen_number = false;
|
||||
if (data.is_array()) {
|
||||
for (const auto & e : data) {
|
||||
seen_string |= e.is_string();
|
||||
seen_number |= e.is_number_integer();
|
||||
if (seen_number && seen_string) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// get value by path(key1 / key2)
|
||||
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
|
||||
json result = json::object();
|
||||
|
||||
for (const std::string & path : paths) {
|
||||
json current = js;
|
||||
const auto keys = string_split<std::string>(path, /*separator*/ '/');
|
||||
bool valid_path = true;
|
||||
for (const std::string & k : keys) {
|
||||
if (valid_path && current.is_object() && current.contains(k)) {
|
||||
current = current[k];
|
||||
} else {
|
||||
valid_path = false;
|
||||
}
|
||||
}
|
||||
if (valid_path) {
|
||||
result[path] = current;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* this handles 2 cases:
|
||||
* - only string, example: "string"
|
||||
* - mixed string and tokens, example: [12, 34, "string", 56, 78]
|
||||
*/
|
||||
static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
llama_tokens prompt_tokens;
|
||||
|
||||
if (json_prompt.is_array()) {
|
||||
bool first = true;
|
||||
for (const auto & p : json_prompt) {
|
||||
if (p.is_string()) {
|
||||
auto s = p.template get<std::string>();
|
||||
|
||||
llama_tokens p;
|
||||
if (first) {
|
||||
p = common_tokenize(vocab, s, add_special, parse_special);
|
||||
first = false;
|
||||
} else {
|
||||
p = common_tokenize(vocab, s, false, parse_special);
|
||||
}
|
||||
|
||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||
} else {
|
||||
if (first) {
|
||||
first = false;
|
||||
}
|
||||
|
||||
prompt_tokens.push_back(p.template get<llama_token>());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto s = json_prompt.template get<std::string>();
|
||||
prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
|
||||
}
|
||||
|
||||
return prompt_tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* break the input "prompt" object into multiple prompt if needed, then tokenize them
|
||||
* this supports these cases:
|
||||
* - "prompt": "string"
|
||||
* - "prompt": [12, 34, 56]
|
||||
* - "prompt": [12, 34, "string", 56, 78]
|
||||
* and multiple prompts (multi-tasks):
|
||||
* - "prompt": ["string1", "string2"]
|
||||
* - "prompt": ["string1", [12, 34, 56]]
|
||||
* - "prompt": [[12, 34, 56], [78, 90, 12]]
|
||||
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
|
||||
*/
|
||||
static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
|
||||
std::vector<llama_tokens> result;
|
||||
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
|
||||
// string or mixed
|
||||
result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
|
||||
} else if (json_is_array_of_numbers(json_prompt)) {
|
||||
// array of tokens
|
||||
result.push_back(json_prompt.get<llama_tokens>());
|
||||
} else if (json_prompt.is_array()) {
|
||||
// array of prompts
|
||||
result.reserve(json_prompt.size());
|
||||
for (const auto & p : json_prompt) {
|
||||
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
|
||||
result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
|
||||
} else if (json_is_array_of_numbers(p)) {
|
||||
// array of tokens
|
||||
result.push_back(p.get<llama_tokens>());
|
||||
} else {
|
||||
throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
|
||||
}
|
||||
if (result.empty()) {
|
||||
throw std::runtime_error("\"prompt\" must not be empty");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// utils for interacting with libmtmd
|
||||
// (may need to refactor in the near future)
|
||||
//
|
||||
|
||||
/**
|
||||
* server_tokens is a helper to manage the input tokens and image for the server.
|
||||
* it is made this way to simplify the logic of KV cache management.
|
||||
*/
|
||||
struct server_tokens {
|
||||
bool has_mtmd = false;
|
||||
|
||||
private: // disallow accessing these members directly, risking out-of-sync
|
||||
|
||||
// map a **start** position in tokens to the image chunk
|
||||
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
|
||||
|
||||
// list of tokens
|
||||
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
|
||||
// a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
|
||||
// important: for models using mrope, an image can contain multiple tokens but will use only one **position**
|
||||
llama_tokens tokens;
|
||||
|
||||
// for ex. with input of 5 text tokens and 2 images:
|
||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||
// pos 0 1 2 3 4 5 6 7 8 9
|
||||
// map_pos_to_image will contain: {5, img0}, {8, img1}
|
||||
|
||||
public:
|
||||
server_tokens() = default;
|
||||
~server_tokens() = default;
|
||||
|
||||
// Prevent copying
|
||||
server_tokens(const server_tokens&) = delete;
|
||||
server_tokens& operator=(const server_tokens&) = delete;
|
||||
|
||||
// Allow moving (usually implicitly generated if members are movable)
|
||||
server_tokens(server_tokens&&) = default;
|
||||
server_tokens& operator=(server_tokens&&) = default;
|
||||
|
||||
// Allow accessing elements using [] operator
|
||||
llama_token operator[](size_t index) { return tokens[index]; }
|
||||
const llama_token& operator[](size_t index) const { return tokens[index]; }
|
||||
|
||||
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
|
||||
for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
|
||||
push_back(mtmd_chunks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
|
||||
|
||||
// for debugging
|
||||
std::string str() const {
|
||||
std::ostringstream oss;
|
||||
oss << "tokens: ";
|
||||
for (const auto & t : tokens) {
|
||||
if (t == LLAMA_TOKEN_NULL) {
|
||||
oss << "<embd> ";
|
||||
} else {
|
||||
oss << t << " ";
|
||||
}
|
||||
}
|
||||
oss << "\n";
|
||||
oss << "image pos: ";
|
||||
for (const auto & it : map_pos_to_image) {
|
||||
oss << it.first << ", ";
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
|
||||
auto it = map_pos_to_image.find(pos);
|
||||
if (it != map_pos_to_image.end()) {
|
||||
return it->second;
|
||||
} else {
|
||||
throw std::runtime_error("Chunk not found");
|
||||
}
|
||||
}
|
||||
|
||||
void push_back(llama_token tok) {
|
||||
if (tok == LLAMA_TOKEN_NULL) {
|
||||
throw std::runtime_error("Invalid token");
|
||||
}
|
||||
tokens.emplace_back(tok);
|
||||
}
|
||||
|
||||
// will create a copy of the chunk if it contains non-text data
|
||||
void push_back(const mtmd_input_chunk * chunk) {
|
||||
auto type = mtmd_input_chunk_get_type(chunk);
|
||||
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
GGML_ASSERT(has_mtmd);
|
||||
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
||||
llama_pos start_pos = tokens.size();
|
||||
for (int i = 0; i < n_pos; ++i) {
|
||||
tokens.emplace_back(LLAMA_TOKEN_NULL);
|
||||
}
|
||||
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
|
||||
map_pos_to_image[start_pos] = std::move(new_chunk);
|
||||
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
size_t n_tokens;
|
||||
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
|
||||
for (size_t i = 0; i < n_tokens; ++i) {
|
||||
push_back(text_tokens[i]);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("Invalid chunk type");
|
||||
}
|
||||
}
|
||||
|
||||
// for compatibility with context shift and prompt truncation
|
||||
void insert(const llama_tokens & inp_tokens) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
|
||||
}
|
||||
|
||||
// for compatibility with speculative decoding, ctx shift, slot save/load
|
||||
const llama_tokens & get_text_tokens() const {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// for compatibility with speculative decoding
|
||||
void set_token(llama_pos pos, llama_token id) {
|
||||
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
|
||||
tokens[pos] = id;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
return tokens.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return tokens.empty();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
tokens.clear();
|
||||
}
|
||||
|
||||
void resize(size_t n) {
|
||||
GGML_ASSERT(n <= tokens.size());
|
||||
if (has_mtmd) {
|
||||
// we throw an error if we try to remove a token in the middle of an image
|
||||
// for ex. with input of 5 text tokens and 2 images:
|
||||
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
|
||||
// n 1 2 3 4 5 6 7 8 9 10
|
||||
// allowed to resize ^ ^
|
||||
// disallowed to resize ^ ^ ^
|
||||
if (n > 0) {
|
||||
llama_token last_token = tokens[n - 1];
|
||||
// make sure we never remove tokens in the middle of an image
|
||||
if (last_token == LLAMA_TOKEN_NULL) {
|
||||
find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
|
||||
}
|
||||
}
|
||||
// remove all image chunks that are not used anymore
|
||||
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
|
||||
llama_pos pos = it->first;
|
||||
if (pos >= (llama_pos)n) {
|
||||
it = map_pos_to_image.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
tokens.resize(n);
|
||||
}
|
||||
|
||||
std::string detokenize(const llama_context * ctx, bool special) const {
|
||||
llama_tokens text_tokens;
|
||||
text_tokens.reserve(tokens.size());
|
||||
for (const auto & t : tokens) {
|
||||
if (t != LLAMA_TOKEN_NULL) {
|
||||
text_tokens.push_back(t);
|
||||
}
|
||||
}
|
||||
return common_detokenize(ctx, text_tokens, special);
|
||||
}
|
||||
|
||||
size_t get_common_prefix(const server_tokens & b) const {
|
||||
size_t max_idx = std::min(tokens.size(), b.tokens.size());
|
||||
for (size_t i = 0; i < max_idx; ++i) {
|
||||
auto & ai = tokens[i];
|
||||
auto & bi = b.tokens[i];
|
||||
|
||||
if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
|
||||
GGML_ASSERT(has_mtmd);
|
||||
const auto & a_chunk = find_chunk(i);
|
||||
const auto & b_chunk = b.find_chunk(i);
|
||||
GGML_ASSERT(a_chunk && b_chunk);
|
||||
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
|
||||
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
|
||||
std::string ai_id = mtmd_image_tokens_get_id(a_img);
|
||||
std::string bi_id = mtmd_image_tokens_get_id(b_img);
|
||||
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
|
||||
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
|
||||
if (ai_id == bi_id && a_pos == b_pos) {
|
||||
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
|
||||
i += a_pos - 1; // will be +1 by the for loop
|
||||
continue;
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
} else if (ai == bi) {
|
||||
continue;
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return max_idx; // all tokens are equal
|
||||
}
|
||||
|
||||
// make sure all text tokens are within the vocab range
|
||||
bool validate(const struct llama_context * ctx) const {
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
auto & t = tokens[i];
|
||||
if (t == LLAMA_TOKEN_NULL) {
|
||||
try {
|
||||
const auto & chunk = find_chunk(i);
|
||||
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
|
||||
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
|
||||
i += n_pos - 1; // will be +1 by the for loop
|
||||
} catch (const std::exception & e) {
|
||||
return false;
|
||||
}
|
||||
} else if (t < 0 || t >= n_vocab) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// encode and decode the image chunk
|
||||
int32_t process_chunk(
|
||||
llama_context * ctx,
|
||||
mtmd_context * mctx,
|
||||
llama_pos n_past,
|
||||
int32_t seq_id,
|
||||
llama_pos & n_pos_out) {
|
||||
auto it = map_pos_to_image.find(n_past);
|
||||
if (it == map_pos_to_image.end()) {
|
||||
throw std::runtime_error("Chunk not found");
|
||||
}
|
||||
// SRV_INF("%s\n", "processing image...");
|
||||
int32_t n_batch = llama_n_batch(ctx);
|
||||
int64_t t0 = ggml_time_ms();
|
||||
llama_pos new_n_past = n_past;
|
||||
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
|
||||
it->second.get(), // chunk
|
||||
n_past,
|
||||
seq_id,
|
||||
n_batch,
|
||||
true, // logits last
|
||||
&new_n_past);
|
||||
//SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
|
||||
if (result != 0) {
|
||||
LOG_ERR("mtmd_helper_eval failed with status %d", result);
|
||||
n_pos_out = n_past;
|
||||
return result;
|
||||
}
|
||||
n_pos_out = new_n_past;
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
// 64-bit FNV-1a over the `len` bytes starting at `data`.
// The digest is returned as its decimal string representation.
static std::string fnv_hash(const uint8_t * data, size_t len) {
    const uint64_t offset_basis = 0xcbf29ce484222325ULL;
    const uint64_t prime        = 0x100000001b3ULL;

    uint64_t digest = offset_basis;
    size_t   idx    = 0;
    while (idx != len) {
        // FNV-1a: xor the byte in first, then multiply by the prime
        digest = (digest ^ data[idx]) * prime;
        ++idx;
    }
    return std::to_string(digest);
}
|
|
@ -21,7 +21,8 @@ type MultimodalContent struct {
|
|||
ID int
|
||||
}
|
||||
|
||||
const DefaultMultiModalTemplate = "{{ range .Audio }}[audio-{{.ID}}]{{end}}{{ range .Images }}[img-{{.ID}}]{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
|
||||
// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42
|
||||
const DefaultMultiModalTemplate = "{{ range .Audio }}[audio-{{.ID}}]{{end}}{{ range .Images }}<__image__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
|
||||
|
||||
func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
|
||||
if templateString == "" {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue