mirror of
https://github.com/mudler/LocalAI.git
synced 2025-06-30 06:30:43 +00:00
Merge branch 'master' into fix-pr-folder-tasks
Signed-off-by: Dave Lee <dave@gray101.com>
This commit is contained in:
commit
505fe7da32
34 changed files with 344 additions and 366 deletions
2
.github/workflows/bump_deps.yaml
vendored
2
.github/workflows/bump_deps.yaml
vendored
|
@ -56,7 +56,7 @@ jobs:
|
|||
rm -rfv ${{ matrix.variable }}_message.txt
|
||||
rm -rfv ${{ matrix.variable }}_commit.txt
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v6
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
|
|
2
.github/workflows/bump_docs.yaml
vendored
2
.github/workflows/bump_docs.yaml
vendored
|
@ -17,7 +17,7 @@ jobs:
|
|||
run: |
|
||||
bash .github/bump_docs.sh ${{ matrix.repository }}
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v6
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
|
|
2
.github/workflows/checksum_checker.yaml
vendored
2
.github/workflows/checksum_checker.yaml
vendored
|
@ -36,7 +36,7 @@ jobs:
|
|||
sudo chmod 777 /hf_cache
|
||||
bash .github/checksum_checker.sh gallery/index.yaml
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v6
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
|
|
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
|
@ -18,7 +18,7 @@ jobs:
|
|||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
- name: Run Gosec Security Scanner
|
||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||
uses: securego/gosec@master
|
||||
uses: securego/gosec@v2.21.0
|
||||
with:
|
||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||
|
|
2
.github/workflows/update_swagger.yaml
vendored
2
.github/workflows/update_swagger.yaml
vendored
|
@ -25,7 +25,7 @@ jobs:
|
|||
run: |
|
||||
make protogen-go swagger
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v6
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
|
|
30
Dockerfile
30
Dockerfile
|
@ -13,7 +13,7 @@ ARG TARGETARCH
|
|||
ARG TARGETVARIANT
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
|
||||
|
||||
RUN apt-get update && \
|
||||
|
@ -263,14 +263,20 @@ EOT
|
|||
# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
|
||||
FROM builder-base AS builder-sd
|
||||
|
||||
COPY . .
|
||||
COPY .git .
|
||||
# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
|
||||
COPY Makefile .
|
||||
COPY go.mod .
|
||||
COPY go.sum .
|
||||
COPY backend/backend.proto ./backend/backend.proto
|
||||
COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
|
||||
COPY pkg/grpc ./pkg/grpc
|
||||
COPY pkg/stablediffusion ./pkg/stablediffusion
|
||||
RUN git init
|
||||
RUN make sources/go-stable-diffusion
|
||||
RUN touch prepare-sources
|
||||
|
||||
RUN make prepare
|
||||
|
||||
|
||||
# stablediffusion does not tolerate a newer version of abseil, build it first
|
||||
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
|
||||
# Actually build the backend
|
||||
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
|
||||
|
||||
###################################
|
||||
###################################
|
||||
|
@ -285,6 +291,11 @@ COPY --from=grpc /opt/grpc /usr/local
|
|||
# Rebuild with defaults backends
|
||||
WORKDIR /build
|
||||
|
||||
COPY . .
|
||||
COPY .git .
|
||||
|
||||
RUN make prepare
|
||||
|
||||
## Build the binary
|
||||
## If it's CUDA, we want to skip some of the llama-compat backends to save space
|
||||
## We only leave the most CPU-optimized variant and the fallback for the cublas build
|
||||
|
@ -407,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
|
|||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/transformers-musicgen \
|
||||
; fi && \
|
||||
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
make -C backend/python/exllama \
|
||||
; fi
|
||||
|
||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||
|
|
17
Makefile
17
Makefile
|
@ -8,7 +8,7 @@ DETECT_LIBS?=true
|
|||
# llama.cpp versions
|
||||
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
|
||||
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
|
||||
CPPLLAMA_VERSION?=4db04784f96757d74f74c8c110c2a00d55e33514
|
||||
CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf
|
||||
|
||||
# go-rwkv version
|
||||
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
|
||||
|
@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
|
|||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=5236f0278420ab776d1787c4330678d80219b4b6
|
||||
WHISPER_CPP_VERSION?=a551933542d956ae84634937acd2942eb40efaaf
|
||||
|
||||
# bert.cpp version
|
||||
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
|
||||
|
@ -534,10 +534,10 @@ protogen-go-clean:
|
|||
$(RM) bin/*
|
||||
|
||||
.PHONY: protogen-python
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||
|
||||
.PHONY: protogen-python-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||
|
||||
.PHONY: autogptq-protogen
|
||||
autogptq-protogen:
|
||||
|
@ -571,14 +571,6 @@ diffusers-protogen:
|
|||
diffusers-protogen-clean:
|
||||
$(MAKE) -C backend/python/diffusers protogen-clean
|
||||
|
||||
.PHONY: exllama-protogen
|
||||
exllama-protogen:
|
||||
$(MAKE) -C backend/python/exllama protogen
|
||||
|
||||
.PHONY: exllama-protogen-clean
|
||||
exllama-protogen-clean:
|
||||
$(MAKE) -C backend/python/exllama protogen-clean
|
||||
|
||||
.PHONY: exllama2-protogen
|
||||
exllama2-protogen:
|
||||
$(MAKE) -C backend/python/exllama2 protogen
|
||||
|
@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python
|
|||
$(MAKE) -C backend/python/parler-tts
|
||||
$(MAKE) -C backend/python/vall-e-x
|
||||
$(MAKE) -C backend/python/openvoice
|
||||
$(MAKE) -C backend/python/exllama
|
||||
$(MAKE) -C backend/python/exllama2
|
||||
|
||||
prepare-test-extra: protogen-python
|
||||
|
|
|
@ -17,11 +17,10 @@
|
|||
#include "common.h"
|
||||
#include "json.hpp"
|
||||
#include "llama.h"
|
||||
#include "grammar-parser.h"
|
||||
#include "backend.pb.h"
|
||||
#include "backend.grpc.pb.h"
|
||||
#include "utils.hpp"
|
||||
|
||||
#include "sampling.h"
|
||||
// include std::regex
|
||||
#include <cstddef>
|
||||
#include <thread>
|
||||
|
@ -203,8 +202,8 @@ struct llama_client_slot
|
|||
std::string stopping_word;
|
||||
|
||||
// sampling
|
||||
struct llama_sampling_params sparams;
|
||||
llama_sampling_context *ctx_sampling = nullptr;
|
||||
struct gpt_sampler_params sparams;
|
||||
gpt_sampler *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
|
@ -619,7 +618,7 @@ struct llama_server_context
|
|||
|
||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
gpt_sampler_params default_sparams;
|
||||
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
|
@ -628,7 +627,7 @@ struct llama_server_context
|
|||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
||||
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
||||
|
@ -641,7 +640,7 @@ struct llama_server_context
|
|||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||
|
@ -665,6 +664,7 @@ struct llama_server_context
|
|||
slot->params.input_prefix = "";
|
||||
}
|
||||
|
||||
|
||||
if (data.count("input_suffix") != 0)
|
||||
{
|
||||
slot->params.input_suffix = data["input_suffix"];
|
||||
|
@ -683,6 +683,10 @@ struct llama_server_context
|
|||
slot->prompt = "";
|
||||
}
|
||||
|
||||
if (json_value(data, "ignore_eos", false)) {
|
||||
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
|
||||
}
|
||||
/*
|
||||
slot->sparams.penalty_prompt_tokens.clear();
|
||||
slot->sparams.use_penalty_prompt_tokens = false;
|
||||
const auto &penalty_prompt = data.find("penalty_prompt");
|
||||
|
@ -718,14 +722,10 @@ struct llama_server_context
|
|||
slot->sparams.use_penalty_prompt_tokens = true;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
slot->sparams.logit_bias.clear();
|
||||
|
||||
if (json_value(data, "ignore_eos", false))
|
||||
{
|
||||
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
|
||||
}
|
||||
|
||||
const auto &logit_bias = data.find("logit_bias");
|
||||
if (logit_bias != data.end() && logit_bias->is_array())
|
||||
{
|
||||
|
@ -753,7 +753,7 @@ struct llama_server_context
|
|||
llama_token tok = el[0].get<llama_token>();
|
||||
if (tok >= 0 && tok < n_vocab)
|
||||
{
|
||||
slot->sparams.logit_bias[tok] = bias;
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
else if (el[0].is_string())
|
||||
|
@ -761,13 +761,13 @@ struct llama_server_context
|
|||
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
|
||||
for (auto tok : toks)
|
||||
{
|
||||
slot->sparams.logit_bias[tok] = bias;
|
||||
slot->sparams.logit_bias.push_back({tok, bias});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
|
@ -781,24 +781,22 @@ struct llama_server_context
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
const auto &samplers_sequence = data.find("samplers");
|
||||
if (samplers_sequence != data.end() && samplers_sequence->is_array())
|
||||
{
|
||||
|
||||
const auto & samplers = data.find("samplers");
|
||||
if (samplers != data.end() && samplers->is_array()) {
|
||||
std::vector<std::string> sampler_names;
|
||||
for (const auto &sampler_name : *samplers_sequence)
|
||||
{
|
||||
if (sampler_name.is_string())
|
||||
{
|
||||
sampler_names.emplace_back(sampler_name);
|
||||
for (const auto & name : *samplers) {
|
||||
if (name.is_string()) {
|
||||
sampler_names.emplace_back(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||
slot->sparams.samplers = default_sparams.samplers;
|
||||
}
|
||||
|
||||
|
||||
if (multimodal)
|
||||
{
|
||||
|
@ -875,10 +873,10 @@ struct llama_server_context
|
|||
|
||||
if (slot->ctx_sampling != nullptr)
|
||||
{
|
||||
llama_sampling_free(slot->ctx_sampling);
|
||||
gpt_sampler_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = llama_sampling_init(slot->sparams);
|
||||
llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
|
||||
//llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
@ -888,7 +886,7 @@ struct llama_server_context
|
|||
{"task_id", slot->task_id},
|
||||
});
|
||||
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
// LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -1006,11 +1004,13 @@ struct llama_server_context
|
|||
slot.generated_text += token_str;
|
||||
slot.has_next_token = true;
|
||||
|
||||
/*
|
||||
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
|
||||
{
|
||||
// we can change penalty_prompt_tokens because it is always created from scratch each request
|
||||
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
|
||||
}
|
||||
*/
|
||||
|
||||
// check if there is incomplete UTF-8 character at the end
|
||||
bool incomplete = false;
|
||||
|
@ -1144,13 +1144,11 @@ struct llama_server_context
|
|||
|
||||
json get_formated_generation(llama_client_slot &slot)
|
||||
{
|
||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
std::vector<std::string> samplers_sequence;
|
||||
for (const auto &sampler_type : slot.sparams.samplers_sequence)
|
||||
std::vector<std::string> samplers;
|
||||
samplers.reserve(slot.sparams.samplers.size());
|
||||
for (const auto & sampler : slot.sparams.samplers)
|
||||
{
|
||||
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
|
||||
}
|
||||
|
||||
return json {
|
||||
|
@ -1165,13 +1163,11 @@ struct llama_server_context
|
|||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typical_p},
|
||||
{"typical_p", slot.sparams.typ_p},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
{"presence_penalty", slot.sparams.penalty_present},
|
||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
|
||||
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
|
@ -1179,13 +1175,13 @@ struct llama_server_context
|
|||
{"stop", slot.params.antiprompt},
|
||||
{"n_predict", slot.params.n_predict},
|
||||
{"n_keep", params.n_keep},
|
||||
{"ignore_eos", ignore_eos},
|
||||
{"ignore_eos", slot.sparams.ignore_eos},
|
||||
{"stream", slot.params.stream},
|
||||
{"logit_bias", slot.sparams.logit_bias},
|
||||
// {"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"min_keep", slot.sparams.min_keep},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
{"samplers", samplers_sequence}
|
||||
{"samplers", samplers}
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -1714,7 +1710,7 @@ struct llama_server_context
|
|||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
gpt_sampler_reset(slot.ctx_sampling);
|
||||
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
|
@ -1726,7 +1722,7 @@ struct llama_server_context
|
|||
// push the prompt into the sampling context (do not apply grammar)
|
||||
for (auto &token : prompt_tokens)
|
||||
{
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
|
||||
gpt_sampler_accept(slot.ctx_sampling, token, false);
|
||||
}
|
||||
|
||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
||||
|
@ -1934,9 +1930,9 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
completion_token_output result;
|
||||
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
|
||||
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
|
||||
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(slot.ctx_sampling, id, true);
|
||||
|
||||
slot.n_decoded += 1;
|
||||
if (slot.n_decoded == 1)
|
||||
|
@ -1946,19 +1942,14 @@ struct llama_server_context
|
|||
metrics.on_prompt_eval(slot);
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
|
||||
result.tok = id;
|
||||
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
|
||||
|
||||
const int32_t n_probs = slot.sparams.n_probs;
|
||||
if (slot.sparams.temp <= 0 && n_probs > 0)
|
||||
{
|
||||
// for llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
|
||||
{
|
||||
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
|
||||
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
|
||||
result.probs.push_back({
|
||||
cur_p->data[i].id,
|
||||
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
|
||||
});
|
||||
}
|
||||
|
||||
if (!process_token(result, slot))
|
||||
|
|
13
backend/cpp/llama/patches/01-llava.patch
Normal file
13
backend/cpp/llama/patches/01-llava.patch
Normal file
|
@ -0,0 +1,13 @@
|
|||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index 342042ff..224db9b5 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
- patches_data[i] = i + 1;
|
||||
+ patches_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
|
@ -1,5 +1,12 @@
|
|||
#!/bin/bash
|
||||
|
||||
## Patches
|
||||
## Apply patches from the `patches` directory
|
||||
for patch in $(ls patches); do
|
||||
echo "Applying patch $patch"
|
||||
patch -d llama.cpp/ -p1 < patches/$patch
|
||||
done
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
|
||||
cp -rfv json.hpp llama.cpp/examples/grpc-server/
|
||||
|
|
|
@ -480,31 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
|
|||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
//
|
||||
// random string / id
|
||||
//
|
||||
|
||||
static std::string random_string()
|
||||
{
|
||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
|
||||
std::string result(32, ' ');
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
result[i] = str[generator() % str.size()];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string gen_chatcmplid()
|
||||
{
|
||||
std::stringstream chatcmplid;
|
||||
chatcmplid << "chatcmpl-" << random_string();
|
||||
return chatcmplid.str();
|
||||
}
|
1
backend/python/exllama/.gitignore
vendored
1
backend/python/exllama/.gitignore
vendored
|
@ -1 +0,0 @@
|
|||
source
|
|
@ -1,25 +0,0 @@
|
|||
export CONDA_ENV_PATH = "exllama.yml"
|
||||
|
||||
.PHONY: exllama
|
||||
exllama: protogen
|
||||
bash install.sh ${CONDA_ENV_PATH}
|
||||
|
||||
.PHONY: run
|
||||
run: protogen
|
||||
@echo "Running exllama..."
|
||||
bash run.sh
|
||||
@echo "exllama run."
|
||||
|
||||
.PHONY: protogen
|
||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
.PHONY: protogen-clean
|
||||
protogen-clean:
|
||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
||||
|
||||
backend_pb2_grpc.py backend_pb2.py:
|
||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
||||
|
||||
.PHONY: clean
|
||||
clean: protogen-clean
|
||||
$(RM) -r venv source __pycache__
|
|
@ -1,5 +0,0 @@
|
|||
# Creating a separate environment for the exllama project
|
||||
|
||||
```
|
||||
make exllama
|
||||
```
|
|
@ -1,159 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
import grpc
|
||||
from concurrent import futures
|
||||
import time
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os, glob
|
||||
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import version as torch_version
|
||||
|
||||
from source.tokenizer import ExLlamaTokenizer
|
||||
from source.generator import ExLlamaGenerator
|
||||
from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
|
||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
# Implement the BackendServicer class with the service methods
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
def generate(self,prompt, max_new_tokens):
|
||||
self.generator.end_beam_search()
|
||||
|
||||
# Tokenizing the input
|
||||
ids = self.generator.tokenizer.encode(prompt)
|
||||
|
||||
self.generator.gen_begin_reuse(ids)
|
||||
initial_len = self.generator.sequence[0].shape[0]
|
||||
has_leading_space = False
|
||||
decoded_text = ''
|
||||
for i in range(max_new_tokens):
|
||||
token = self.generator.gen_single_token()
|
||||
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
|
||||
has_leading_space = True
|
||||
|
||||
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
|
||||
if has_leading_space:
|
||||
decoded_text = ' ' + decoded_text
|
||||
|
||||
if token.item() == self.generator.tokenizer.eos_token_id:
|
||||
break
|
||||
return decoded_text
|
||||
def Health(self, request, context):
|
||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
||||
def LoadModel(self, request, context):
|
||||
try:
|
||||
# https://github.com/turboderp/exllama/blob/master/example_cfg.py
|
||||
model_directory = request.ModelFile
|
||||
|
||||
# Locate files we need within that directory
|
||||
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
|
||||
model_config_path = os.path.join(model_directory, "config.json")
|
||||
st_pattern = os.path.join(model_directory, "*.safetensors")
|
||||
model_path = glob.glob(st_pattern)[0]
|
||||
|
||||
# Create config, model, tokenizer and generator
|
||||
|
||||
config = ExLlamaConfig(model_config_path) # create config from config.json
|
||||
config.model_path = model_path # supply path to model weights file
|
||||
if (request.ContextSize):
|
||||
config.max_seq_len = request.ContextSize # override max sequence length
|
||||
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
|
||||
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
|
||||
|
||||
# Set Rope scaling.
|
||||
if (request.RopeFreqScale):
|
||||
# Alpha value for Rope scaling.
|
||||
# Higher value increases context but adds perplexity.
|
||||
# alpha_value and compress_pos_emb are mutually exclusive.
|
||||
# https://github.com/turboderp/exllama/issues/115
|
||||
config.alpha_value = request.RopeFreqScale
|
||||
config.calculate_rotary_embedding_base()
|
||||
|
||||
model = ExLlama(config) # create ExLlama instance and load the weights
|
||||
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
|
||||
|
||||
cache = ExLlamaCache(model, batch_size = 2) # create cache for inference
|
||||
generator = ExLlamaGenerator(model, tokenizer, cache) # create generator
|
||||
|
||||
self.generator= generator
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
self.cache = cache
|
||||
except Exception as err:
|
||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
||||
|
||||
def Predict(self, request, context):
|
||||
penalty = 1.15
|
||||
if request.Penalty != 0.0:
|
||||
penalty = request.Penalty
|
||||
self.generator.settings.token_repetition_penalty_max = penalty
|
||||
self.generator.settings.temperature = request.Temperature
|
||||
self.generator.settings.top_k = request.TopK
|
||||
self.generator.settings.top_p = request.TopP
|
||||
|
||||
tokens = 512
|
||||
if request.Tokens != 0:
|
||||
tokens = request.Tokens
|
||||
|
||||
if self.cache.batch_size == 1:
|
||||
del self.cache
|
||||
self.cache = ExLlamaCache(self.model, batch_size=2)
|
||||
self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
|
||||
|
||||
t = self.generate(request.Prompt, tokens)
|
||||
|
||||
# Remove prompt from response if present
|
||||
if request.Prompt in t:
|
||||
t = t.replace(request.Prompt, "")
|
||||
|
||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
||||
|
||||
def PredictStream(self, request, context):
|
||||
# Implement PredictStream RPC
|
||||
#for reply in some_data_generator():
|
||||
# yield reply
|
||||
# Not implemented yet
|
||||
return self.Predict(request, context)
|
||||
|
||||
|
||||
def serve(address):
|
||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
||||
server.add_insecure_port(address)
|
||||
server.start()
|
||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
||||
|
||||
# Define the signal handler function
|
||||
def signal_handler(sig, frame):
|
||||
print("Received termination signal. Shutting down...")
|
||||
server.stop(0)
|
||||
sys.exit(0)
|
||||
|
||||
# Set the signal handlers for SIGINT and SIGTERM
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
try:
|
||||
while True:
|
||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
||||
except KeyboardInterrupt:
|
||||
server.stop(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
||||
parser.add_argument(
|
||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
serve(args.addr)
|
|
@ -1,13 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
LIMIT_TARGETS="cublas"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
installRequirements
|
||||
|
||||
git clone https://github.com/turboderp/exllama $MY_DIR/source
|
||||
uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
|
||||
|
||||
cp -v ./*py $MY_DIR/source/
|
|
@ -1,3 +0,0 @@
|
|||
transformers
|
||||
accelerate
|
||||
torch
|
|
@ -1,4 +0,0 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch
|
||||
transformers
|
||||
accelerate
|
|
@ -1,3 +0,0 @@
|
|||
torch
|
||||
transformers
|
||||
accelerate
|
|
@ -1,4 +0,0 @@
|
|||
grpcio==1.66.1
|
||||
protobuf
|
||||
certifi
|
||||
setuptools
|
|
@ -1,7 +0,0 @@
|
|||
#!/bin/bash
|
||||
LIMIT_TARGETS="cublas"
|
||||
BACKEND_FILE="${MY_DIR}/source/backend.py"
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
|
@ -1,6 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
runUnittests
|
13
core/backend/backend_suite_test.go
Normal file
13
core/backend/backend_suite_test.go
Normal file
|
@ -0,0 +1,13 @@
|
|||
package backend_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func TestBackend(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "Backend test suite")
|
||||
}
|
|
@ -9,6 +9,8 @@ import (
|
|||
"sync"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
|
@ -181,13 +183,37 @@ func Finetune(config config.BackendConfig, input, prediction string) string {
|
|||
mu.Lock()
|
||||
reg, ok := cutstrings[c]
|
||||
if !ok {
|
||||
cutstrings[c] = regexp.MustCompile(c)
|
||||
r, err := regexp.Compile(c)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to compile regex")
|
||||
}
|
||||
cutstrings[c] = r
|
||||
reg = cutstrings[c]
|
||||
}
|
||||
mu.Unlock()
|
||||
prediction = reg.ReplaceAllString(prediction, "")
|
||||
}
|
||||
|
||||
// extract results from the response which can be for instance inside XML tags
|
||||
var predResult string
|
||||
for _, r := range config.ExtractRegex {
|
||||
mu.Lock()
|
||||
reg, ok := cutstrings[r]
|
||||
if !ok {
|
||||
regex, err := regexp.Compile(r)
|
||||
if err != nil {
|
||||
log.Fatal().Err(err).Msg("failed to compile regex")
|
||||
}
|
||||
cutstrings[r] = regex
|
||||
reg = regex
|
||||
}
|
||||
mu.Unlock()
|
||||
predResult += reg.FindString(prediction)
|
||||
}
|
||||
if predResult != "" {
|
||||
prediction = predResult
|
||||
}
|
||||
|
||||
for _, c := range config.TrimSpace {
|
||||
prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
|
||||
}
|
||||
|
|
109
core/backend/llm_test.go
Normal file
109
core/backend/llm_test.go
Normal file
|
@ -0,0 +1,109 @@
|
|||
package backend_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("LLM tests", func() {
|
||||
Context("Finetune LLM output", func() {
|
||||
var (
|
||||
testConfig config.BackendConfig
|
||||
input string
|
||||
prediction string
|
||||
result string
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
testConfig = config.BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Echo: false,
|
||||
},
|
||||
LLMConfig: config.LLMConfig{
|
||||
Cutstrings: []string{`<.*?>`}, // Example regex for removing XML tags
|
||||
ExtractRegex: []string{`<result>(.*?)</result>`}, // Example regex to extract from tags
|
||||
TrimSpace: []string{" ", "\n"},
|
||||
TrimSuffix: []string{".", "!"},
|
||||
},
|
||||
}
|
||||
})
|
||||
|
||||
Context("when echo is enabled", func() {
|
||||
BeforeEach(func() {
|
||||
testConfig.Echo = true
|
||||
input = "Hello"
|
||||
prediction = "World"
|
||||
})
|
||||
|
||||
It("should prepend input to prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("HelloWorld"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when echo is disabled", func() {
|
||||
BeforeEach(func() {
|
||||
testConfig.Echo = false
|
||||
input = "Hello"
|
||||
prediction = "World"
|
||||
})
|
||||
|
||||
It("should not modify the prediction with input", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when cutstrings regex is applied", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "<div>Hello</div> World"
|
||||
})
|
||||
|
||||
It("should remove substrings matching cutstrings regex", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when extract regex is applied", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "<response><result>42</result></response>"
|
||||
})
|
||||
|
||||
It("should extract substrings matching the extract regex", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("42"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when trimming spaces", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = " Hello World "
|
||||
})
|
||||
|
||||
It("should trim spaces from the prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("when trimming suffixes", func() {
|
||||
BeforeEach(func() {
|
||||
input = ""
|
||||
prediction = "Hello World."
|
||||
})
|
||||
|
||||
It("should trim suffixes from the prediction", func() {
|
||||
result = Finetune(testConfig, input, prediction)
|
||||
Expect(result).To(Equal("Hello World"))
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
|
@ -9,7 +9,7 @@ import (
|
|||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
|
||||
|
@ -22,16 +22,16 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
|
|||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
})
|
||||
|
||||
whisperModel, err := ml.BackendLoader(opts...)
|
||||
transcriptionModel, err := ml.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if whisperModel == nil {
|
||||
return nil, fmt.Errorf("could not load whisper model")
|
||||
if transcriptionModel == nil {
|
||||
return nil, fmt.Errorf("could not load transcription model")
|
||||
}
|
||||
|
||||
r, err := whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
|
||||
r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
|
||||
Dst: audio,
|
||||
Language: language,
|
||||
Translate: translate,
|
||||
|
|
|
@ -126,6 +126,7 @@ type LLMConfig struct {
|
|||
Grammar string `yaml:"grammar"`
|
||||
StopWords []string `yaml:"stopwords"`
|
||||
Cutstrings []string `yaml:"cutstrings"`
|
||||
ExtractRegex []string `yaml:"extract_regex"`
|
||||
TrimSpace []string `yaml:"trimspace"`
|
||||
TrimSuffix []string `yaml:"trimsuffix"`
|
||||
|
||||
|
|
|
@ -68,9 +68,9 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
|||
|
||||
textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
|
||||
result = functions.CleanupLLMResult(result, config.FunctionsConfig)
|
||||
results := functions.ParseFunctionCall(result, config.FunctionsConfig)
|
||||
functionResults := functions.ParseFunctionCall(result, config.FunctionsConfig)
|
||||
log.Debug().Msgf("Text content to return: %s", textContentToReturn)
|
||||
noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0
|
||||
noActionToRun := len(functionResults) > 0 && functionResults[0].Name == noAction || len(functionResults) == 0
|
||||
|
||||
switch {
|
||||
case noActionToRun:
|
||||
|
@ -83,7 +83,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
|||
}
|
||||
responses <- initialMessage
|
||||
|
||||
result, err := handleQuestion(config, req, ml, startupOptions, results, result, prompt)
|
||||
result, err := handleQuestion(config, req, ml, startupOptions, functionResults, result, prompt)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("error handling question")
|
||||
return
|
||||
|
@ -105,7 +105,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
|||
responses <- resp
|
||||
|
||||
default:
|
||||
for i, ss := range results {
|
||||
for i, ss := range functionResults {
|
||||
name, args := ss.Name, ss.Arguments
|
||||
|
||||
initialMessage := schema.OpenAIResponse{
|
||||
|
|
2
docs/themes/hugo-theme-relearn
vendored
2
docs/themes/hugo-theme-relearn
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 550a6eeb9252da5ca729f25dc91df6dd3ee9d5ce
|
||||
Subproject commit f696f60f4e44e18a34512b895a7b65a72c801bd8
|
|
@ -1,4 +1,4 @@
|
|||
llama_index==0.11.4
|
||||
llama_index==0.11.7
|
||||
requests==2.32.3
|
||||
weaviate_client==4.6.7
|
||||
transformers
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
langchain==0.2.15
|
||||
openai==1.43.0
|
||||
langchain==0.2.16
|
||||
openai==1.44.0
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
langchain==0.2.15
|
||||
openai==1.43.0
|
||||
langchain==0.2.16
|
||||
openai==1.44.1
|
||||
chromadb==0.5.5
|
||||
llama-index==0.11.4
|
||||
llama-index==0.11.7
|
|
@ -10,15 +10,15 @@ debugpy==1.8.2
|
|||
frozenlist==1.4.1
|
||||
greenlet==3.0.3
|
||||
idna==3.8
|
||||
langchain==0.2.15
|
||||
langchain-community==0.2.15
|
||||
langchain==0.2.16
|
||||
langchain-community==0.2.16
|
||||
marshmallow==3.22.0
|
||||
marshmallow-enum==1.5.1
|
||||
multidict==6.0.5
|
||||
mypy-extensions==1.0.0
|
||||
numexpr==2.10.1
|
||||
numpy==2.1.0
|
||||
openai==1.43.0
|
||||
numpy==2.1.1
|
||||
openai==1.44.0
|
||||
openapi-schema-pydantic==1.2.4
|
||||
packaging>=23.2
|
||||
pydantic==2.8.2
|
||||
|
@ -30,4 +30,4 @@ tqdm==4.66.5
|
|||
typing-inspect==0.9.0
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.2
|
||||
yarl==1.9.7
|
||||
yarl==1.11.0
|
||||
|
|
|
@ -658,6 +658,23 @@
|
|||
- filename: Mahou-1.3-llama3.1-8B.Q4_K_M.gguf
|
||||
sha256: 88bfdca2f6077d789d3e0f161d19711aa208a6d9a02cce96a2276c69413b3594
|
||||
uri: huggingface://mradermacher/Mahou-1.3-llama3.1-8B-GGUF/Mahou-1.3-llama3.1-8B.Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "azure_dusk-v0.2-iq-imatrix"
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/n3-g_YTk3FY-DBzxXd28E.png
|
||||
urls:
|
||||
- https://huggingface.co/Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix
|
||||
description: |
|
||||
"Following up on Crimson_Dawn-v0.2 we have Azure_Dusk-v0.2! Training on Mistral-Nemo-Base-2407 this time I've added significantly more data, as well as trained using RSLoRA as opposed to regular LoRA. Another key change is training on ChatML as opposed to Mistral Formatting."
|
||||
by Author.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Azure_Dusk-v0.2-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: Azure_Dusk-v0.2-Q4_K_M-imat.gguf
|
||||
sha256: c03a670c00976d14c267a0322374ed488b2a5f4790eb509136ca4e75cbc10cf4
|
||||
uri: huggingface://Lewdiculous/Azure_Dusk-v0.2-GGUF-IQ-Imatrix/Azure_Dusk-v0.2-Q4_K_M-imat.gguf
|
||||
- &deepseek
|
||||
## Deepseek
|
||||
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
|
||||
|
@ -1195,6 +1212,23 @@
|
|||
- filename: Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf
|
||||
sha256: cf3465c183bf4ecbccd1b6b480f687e0160475b04c87e2f1e5ebc8baa0f4c7aa
|
||||
uri: huggingface://bartowski/Pantheon-RP-1.6-12b-Nemo-GGUF/Pantheon-RP-1.6-12b-Nemo-Q4_K_M.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "mn-12b-lyra-v4-iq-imatrix"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/dVoru83WOpwVjMlgZ_xhA.png
|
||||
#chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix
|
||||
description: |
|
||||
A finetune of Mistral Nemo by Sao10K.
|
||||
Uses the ChatML prompt format.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3
|
||||
uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
- &mudler
|
||||
### START mudler's LocalAI specific-models
|
||||
url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
|
||||
|
@ -1708,6 +1742,48 @@
|
|||
- filename: shieldgemma-9b.i1-Q4_K_M.gguf
|
||||
sha256: ffa7eaadcc0c7d0544fda5b0d86bba3ffa3431b673e5b2135f421cfe65bd8732
|
||||
uri: huggingface://mradermacher/shieldgemma-9b-i1-GGUF/shieldgemma-9b.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *gemma
|
||||
name: "athena-codegemma-2-2b-it"
|
||||
urls:
|
||||
- https://huggingface.co/EpistemeAI/Athena-codegemma-2-2b-it
|
||||
- https://huggingface.co/mradermacher/Athena-codegemma-2-2b-it-GGUF
|
||||
description: |
|
||||
Supervised fine tuned (sft unsloth) for coding with EpistemeAI coding dataset.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Athena-codegemma-2-2b-it.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Athena-codegemma-2-2b-it.Q4_K_M.gguf
|
||||
sha256: 59ce17023438b0da603dd211c7d39f78e7acac4108258ac0818a97a4ca7d64e3
|
||||
uri: huggingface://mradermacher/Athena-codegemma-2-2b-it-GGUF/Athena-codegemma-2-2b-it.Q4_K_M.gguf
|
||||
- !!merge <<: *gemma
|
||||
name: "datagemma-rag-27b-it"
|
||||
urls:
|
||||
- https://huggingface.co/google/datagemma-rag-27b-it
|
||||
- https://huggingface.co/bartowski/datagemma-rag-27b-it-GGUF
|
||||
description: |
|
||||
DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RAG is used with Retrieval Augmented Generation, where it is trained to take a user query and generate natural language queries that can be understood by Data Commons' existing natural language interface. More information can be found in this research paper.
|
||||
overrides:
|
||||
parameters:
|
||||
model: datagemma-rag-27b-it-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: datagemma-rag-27b-it-Q4_K_M.gguf
|
||||
sha256: 3dfcf51b05e3f0ab0979ad194de350edea71cb14444efa0a9f2ef5bfc80753f8
|
||||
uri: huggingface://bartowski/datagemma-rag-27b-it-GGUF/datagemma-rag-27b-it-Q4_K_M.gguf
|
||||
- !!merge <<: *gemma
|
||||
name: "datagemma-rig-27b-it"
|
||||
urls:
|
||||
- https://huggingface.co/google/datagemma-rig-27b-it
|
||||
- https://huggingface.co/bartowski/datagemma-rig-27b-it-GGUF
|
||||
description: |
|
||||
DataGemma is a series of fine-tuned Gemma 2 models used to help LLMs access and incorporate reliable public statistical data from Data Commons into their responses. DataGemma RIG is used in the retrieval interleaved generation approach (based off of tool-use approaches), where it is trained to annotate a response with natural language queries to Data Commons’ existing natural language interface wherever there are statistics. More information can be found in this research paper.
|
||||
overrides:
|
||||
parameters:
|
||||
model: datagemma-rig-27b-it-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: datagemma-rig-27b-it-Q4_K_M.gguf
|
||||
sha256: a6738ffbb49b6c46d220e2793df85c0538e9ac72398e32a0914ee5e55c3096ad
|
||||
uri: huggingface://bartowski/datagemma-rig-27b-it-GGUF/datagemma-rig-27b-it-Q4_K_M.gguf
|
||||
- &llama3
|
||||
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue