feat(llama.cpp): Add support to grammar triggers (#4733)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

parent d79f02ea09
commit 1d6afbd65d

4 changed files with 46 additions and 1 deletions
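This plumbs grammar triggers end to end: a new GrammarTrigger message and a repeated GrammarTriggers field on ModelOptions in the backend proto, YAML-configurable triggers on the Go side under the functions grammar config, and handling in the llama.cpp gRPC server, which switches the sampler into lazy-grammar mode and registers each trigger word.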
@@ -163,6 +163,11 @@ message Reply {
     double timing_token_generation = 5;
 }
 
+message GrammarTrigger {
+    string word = 1;
+    bool at_start = 2;
+}
+
 message ModelOptions {
     string Model = 1;
     int32 ContextSize = 2;
@@ -247,6 +252,8 @@ message ModelOptions {
     string CacheTypeKey = 63;
     string CacheTypeValue = 64;
 
+    repeated GrammarTrigger GrammarTriggers = 65;
+
 }
 
 message Result {
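For illustration, a minimal sketch of a caller populating the new field through the generated Go bindings. The import path is assumed from the LocalAI tree, and "<tool_call>" is only a placeholder trigger word:

package main

import (
    "fmt"

    // Assumed import path of the bindings generated from backend.proto.
    pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)

func main() {
    // ModelOptions carrying one grammar trigger, mirroring what
    // grpcModelOpts (further below) builds from the YAML config.
    opts := &pb.ModelOptions{
        Model: "model.gguf",
        GrammarTriggers: []*pb.GrammarTrigger{
            {Word: "<tool_call>", AtStart: true},
        },
    }
    fmt.Println(opts.GetGrammarTriggers()[0].GetWord())
}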
@@ -468,6 +468,9 @@ struct llama_server_context
     bool add_bos_token = true;
     bool has_eos_token = true;
 
+    bool grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_trigger_words;
+
     int32_t n_ctx; // total context for all clients / slots
 
     // system prompt
@@ -706,6 +709,8 @@ struct llama_server_context
     slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
     slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
     slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
+    slot->sparams.grammar_trigger_words = grammar_trigger_words;
+    slot->sparams.grammar_lazy = grammar_lazy;
 
     if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
         // Might be better to reject the request with a 400 ?
@@ -2374,6 +2379,21 @@ static void params_parse(const backend::ModelOptions* request,
     if ( request->ropefreqscale() != 0.0f ) {
         params.rope_freq_scale = request->ropefreqscale();
     }
+
+    if (request->grammartriggers_size() > 0) {
+        LOG_INFO("configuring grammar triggers", {});
+        llama.grammar_lazy = true;
+        for (int i = 0; i < request->grammartriggers_size(); i++) {
+            common_grammar_trigger trigger;
+            trigger.word = request->grammartriggers(i).word();
+            trigger.at_start = request->grammartriggers(i).at_start();
+            llama.grammar_trigger_words.push_back(trigger);
+            LOG_INFO("grammar trigger", {
+                { "word", trigger.word },
+                { "at_start", trigger.at_start }
+            });
+        }
+    }
 }
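A note on semantics: with grammar_lazy set, the grammar does not constrain sampling from the first token; it is enforced only once one of the registered trigger words shows up in the output, and at_start restricts a trigger to matching at the very beginning of the response. That behavior is what makes triggers useful for tool calling, where free-form text should flow until the model opens a structured call.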
@@ -118,9 +118,19 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
     nGPULayers = *c.NGPULayers
 }
 
+triggers := make([]*pb.GrammarTrigger, 0)
+for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
+    triggers = append(triggers, &pb.GrammarTrigger{
+        Word:    t.Word,
+        AtStart: t.AtStart,
+    })
+
+}
+
 return &pb.ModelOptions{
     CUDA:          c.CUDA || c.Diffusers.CUDA,
     SchedulerType: c.Diffusers.SchedulerType,
+    GrammarTriggers: triggers,
     PipelineType:  c.Diffusers.PipelineType,
     CFGScale:      c.CFGScale,
     LoraAdapter:   c.LoraAdapter,
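Note the design choice here: grpcModelOpts always sends the (possibly empty) triggers slice, and it is the C++ side above that interprets an empty list as "no lazy grammar", since grammar_lazy is flipped on only when grammartriggers_size() > 0.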
@@ -47,6 +47,14 @@ type GrammarConfig struct {
     // SchemaType can be configured to use a specific schema type to force the grammar
     // available : json, llama3.1
     SchemaType string `yaml:"schema_type"`
+
+    GrammarTriggers []GrammarTrigger `yaml:"triggers"`
+}
+
+type GrammarTrigger struct {
+    // Trigger is the string that triggers the grammar
+    Word string `yaml:"word"`
+    AtStart bool `yaml:"at_start"`
 }
 
 // FunctionsConfig is the configuration for the tool/function call.
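To make the YAML shape these tags imply concrete, a small standalone sketch; the structs are local mirrors of the ones added above, "<tool_call>" is a placeholder trigger word, and the enclosing keys of a full LocalAI model config are omitted:

package main

import (
    "fmt"

    "gopkg.in/yaml.v3"
)

// Local mirrors of the structs added above, so the sketch runs standalone.
type GrammarTrigger struct {
    Word    string `yaml:"word"`
    AtStart bool   `yaml:"at_start"`
}

type GrammarConfig struct {
    SchemaType      string           `yaml:"schema_type"`
    GrammarTriggers []GrammarTrigger `yaml:"triggers"`
}

func main() {
    // A grammar-config fragment as a user would write it.
    doc := []byte(`
schema_type: json
triggers:
  - word: "<tool_call>"
    at_start: true
`)
    var cfg GrammarConfig
    if err := yaml.Unmarshal(doc, &cfg); err != nil {
        panic(err)
    }
    fmt.Printf("parsed: %+v\n", cfg)
}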
@@ -361,6 +369,6 @@ func ParseFunctionCallArgs(functionArguments string, functionConfig FunctionsCon
     }
 
     jsonBytes, _ := json.Marshal(args)
 
     return string(jsonBytes)
 }