* Revert "fix(fncall): fix regression introduced in #1963 (#2048)"

This reverts commit 6b06d4e0af.

* Revert "fix: action-tmate back to upstream, dead code removal (#2038)"

This reverts commit fdec8a9d00.

* Revert "feat(grpc): return consumed token count and update response accordingly (#2035)"

This reverts commit e843d7df0e.

* Revert "refactor: backend/service split, channel-based llm flow (#1963)"

This reverts commit eed5706994.

* feat(grpc): return consumed token count and update response accordingly

Fixes: #1920

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-04-17 23:33:49 +02:00 committed by GitHub
parent af8c705ecd
commit af9e5a2d05
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
52 changed files with 2295 additions and 3065 deletions

View file

@ -11,22 +11,17 @@ import (
"github.com/go-skynet/LocalAI/core/config"
"github.com/go-skynet/LocalAI/core/schema"
"github.com/rs/zerolog/log"
"github.com/go-skynet/LocalAI/pkg/concurrency"
"github.com/go-skynet/LocalAI/pkg/gallery"
"github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/LocalAI/pkg/model"
model "github.com/go-skynet/LocalAI/pkg/model"
"github.com/go-skynet/LocalAI/pkg/utils"
)
type LLMRequest struct {
Id int // TODO Remove if not used.
Text string
Images []string
RawMessages []schema.Message
// TODO: Other Modalities?
type LLMResponse struct {
Response string // should this be []byte?
Usage TokenUsage
}
type TokenUsage struct {
@ -34,94 +29,57 @@ type TokenUsage struct {
Completion int
}
type LLMResponse struct {
Request *LLMRequest
Response string // should this be []byte?
Usage TokenUsage
}
// TODO: Does this belong here or in core/services/openai.go?
type LLMResponseBundle struct {
Request *schema.OpenAIRequest
Response []schema.Choice
Usage TokenUsage
}
type LLMBackendService struct {
bcl *config.BackendConfigLoader
ml *model.ModelLoader
appConfig *config.ApplicationConfig
ftMutex sync.Mutex
cutstrings map[string]*regexp.Regexp
}
func NewLLMBackendService(ml *model.ModelLoader, bcl *config.BackendConfigLoader, appConfig *config.ApplicationConfig) *LLMBackendService {
return &LLMBackendService{
bcl: bcl,
ml: ml,
appConfig: appConfig,
ftMutex: sync.Mutex{},
cutstrings: make(map[string]*regexp.Regexp),
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
threads = &o.Threads
}
}
// TODO: Should ctx param be removed and replaced with hardcoded req.Context?
func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest, bc *config.BackendConfig, enableTokenChannel bool) (
resultChannel <-chan concurrency.ErrorOr[*LLMResponse], tokenChannel <-chan concurrency.ErrorOr[*LLMResponse], err error) {
threads := bc.Threads
if (threads == nil || *threads == 0) && llmbs.appConfig.Threads != 0 {
threads = &llmbs.appConfig.Threads
}
grpcOpts := gRPCModelOpts(bc)
grpcOpts := gRPCModelOpts(c)
var inferenceModel grpc.Backend
var err error
opts := modelOpts(bc, llmbs.appConfig, []model.Option{
opts := modelOpts(c, o, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
model.WithAssetDir(llmbs.appConfig.AssetsDestination),
model.WithModel(bc.Model),
model.WithContext(llmbs.appConfig.Context),
model.WithAssetDir(o.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),
})
if bc.Backend != "" {
opts = append(opts, model.WithBackendString(bc.Backend))
if c.Backend != "" {
opts = append(opts, model.WithBackendString(c.Backend))
}
// Check if bc.Model exists, if it doesn't try to load it from the gallery
if llmbs.appConfig.AutoloadGalleries { // experimental
if _, err := os.Stat(bc.Model); os.IsNotExist(err) {
// Check if the modelFile exists, if it doesn't try to load it from the gallery
if o.AutoloadGalleries { // experimental
if _, err := os.Stat(modelFile); os.IsNotExist(err) {
utils.ResetDownloadTimers()
// if we failed to load the model, we try to download it
err := gallery.InstallModelFromGalleryByName(llmbs.appConfig.Galleries, bc.Model, llmbs.appConfig.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
if err != nil {
return nil, nil, err
return nil, err
}
}
}
if bc.Backend == "" {
log.Debug().Msgf("backend not known for %q, falling back to greedy loader to find it", bc.Model)
inferenceModel, err = llmbs.ml.GreedyLoader(opts...)
if c.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
inferenceModel, err = llmbs.ml.BackendLoader(opts...)
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
log.Error().Err(err).Msg("[llmbs.Inference] failed to load a backend")
return
return nil, err
}
grpcPredOpts := gRPCPredictOpts(bc, llmbs.appConfig.ModelPath)
grpcPredOpts.Prompt = req.Text
grpcPredOpts.Images = req.Images
if bc.TemplateConfig.UseTokenizerTemplate && req.Text == "" {
grpcPredOpts.UseTokenizerTemplate = true
protoMessages := make([]*proto.Message, len(req.RawMessages), len(req.RawMessages))
for i, message := range req.RawMessages {
var protoMessages []*proto.Message
// if we are using the tokenizer template, we need to convert the messages to proto messages
// unless the prompt has already been tokenized (non-chat endpoints + functions)
if c.TemplateConfig.UseTokenizerTemplate && s == "" {
protoMessages = make([]*proto.Message, len(messages), len(messages))
for i, message := range messages {
protoMessages[i] = &proto.Message{
Role: message.Role,
}
@ -129,32 +87,47 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
case string:
protoMessages[i].Content = ct
default:
err = fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
return
return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
}
}
}
tokenUsage := TokenUsage{}
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
fn := func() (LLMResponse, error) {
opts := gRPCPredictOpts(c, loader.ModelPath)
opts.Prompt = s
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
opts.Images = images
promptInfo, pErr := inferenceModel.TokenizeString(ctx, grpcPredOpts)
if pErr == nil && promptInfo.Length > 0 {
tokenUsage.Prompt = int(promptInfo.Length)
}
tokenUsage := TokenUsage{}
rawResultChannel := make(chan concurrency.ErrorOr[*LLMResponse])
// TODO this next line is the biggest argument for taking named return values _back_ out!!!
var rawTokenChannel chan concurrency.ErrorOr[*LLMResponse]
// check the per-model feature flag for usage, since tokenCallback may have a cost.
// Defaults to off as for now it is still experimental
if c.FeatureFlag.Enabled("usage") {
userTokenCallback := tokenCallback
if userTokenCallback == nil {
userTokenCallback = func(token string, usage TokenUsage) bool {
return true
}
}
if enableTokenChannel {
rawTokenChannel = make(chan concurrency.ErrorOr[*LLMResponse])
promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
if pErr == nil && promptInfo.Length > 0 {
tokenUsage.Prompt = int(promptInfo.Length)
}
// TODO Needs better name
ss := ""
tokenCallback = func(token string, usage TokenUsage) bool {
tokenUsage.Completion++
return userTokenCallback(token, tokenUsage)
}
}
if tokenCallback != nil {
ss := ""
go func() {
var partialRune []byte
err := inferenceModel.PredictStream(ctx, grpcPredOpts, func(chars []byte) {
err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
partialRune = append(partialRune, chars...)
for len(partialRune) > 0 {
@ -164,126 +137,54 @@ func (llmbs *LLMBackendService) Inference(ctx context.Context, req *LLMRequest,
break
}
tokenUsage.Completion++
rawTokenChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: string(r),
Usage: tokenUsage,
}}
tokenCallback(string(r), tokenUsage)
ss += string(r)
partialRune = partialRune[size:]
}
})
close(rawTokenChannel)
return LLMResponse{
Response: ss,
Usage: tokenUsage,
}, err
} else {
// TODO: Is the chicken bit the only way to get here? is that acceptable?
reply, err := inferenceModel.Predict(ctx, opts)
if err != nil {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
} else {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: ss,
Usage: tokenUsage,
}}
return LLMResponse{}, err
}
close(rawResultChannel)
}()
} else {
go func() {
reply, err := inferenceModel.Predict(ctx, grpcPredOpts)
if tokenUsage.Prompt == 0 {
tokenUsage.Prompt = int(reply.PromptTokens)
}
if tokenUsage.Completion == 0 {
tokenUsage.Completion = int(reply.Tokens)
}
if err != nil {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Error: err}
close(rawResultChannel)
} else {
rawResultChannel <- concurrency.ErrorOr[*LLMResponse]{Value: &LLMResponse{
Response: string(reply.Message),
Usage: tokenUsage,
}}
close(rawResultChannel)
}
}()
return LLMResponse{
Response: string(reply.Message),
Usage: tokenUsage,
}, err
}
}
resultChannel = rawResultChannel
tokenChannel = rawTokenChannel
return
return fn, nil
}
// TODO: Should predInput be a seperate param still, or should this fn handle extracting it from request??
func (llmbs *LLMBackendService) GenerateText(predInput string, request *schema.OpenAIRequest, bc *config.BackendConfig,
mappingFn func(*LLMResponse) schema.Choice, enableCompletionChannels bool, enableTokenChannels bool) (
// Returns:
resultChannel <-chan concurrency.ErrorOr[*LLMResponseBundle], completionChannels []<-chan concurrency.ErrorOr[*LLMResponse], tokenChannels []<-chan concurrency.ErrorOr[*LLMResponse], err error) {
var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
var mu sync.Mutex = sync.Mutex{}
rawChannel := make(chan concurrency.ErrorOr[*LLMResponseBundle])
resultChannel = rawChannel
if request.N == 0 { // number of completions to return
request.N = 1
}
images := []string{}
for _, m := range request.Messages {
images = append(images, m.StringImages...)
}
for i := 0; i < request.N; i++ {
individualResultChannel, tokenChannel, infErr := llmbs.Inference(request.Context, &LLMRequest{
Text: predInput,
Images: images,
RawMessages: request.Messages,
}, bc, enableTokenChannels)
if infErr != nil {
err = infErr // Avoids complaints about redeclaring err but looks dumb
return
}
completionChannels = append(completionChannels, individualResultChannel)
tokenChannels = append(tokenChannels, tokenChannel)
}
go func() {
initialBundle := LLMResponseBundle{
Request: request,
Response: []schema.Choice{},
Usage: TokenUsage{},
}
wg := concurrency.SliceOfChannelsReducer(completionChannels, rawChannel, func(iv concurrency.ErrorOr[*LLMResponse], ov concurrency.ErrorOr[*LLMResponseBundle]) concurrency.ErrorOr[*LLMResponseBundle] {
if iv.Error != nil {
ov.Error = iv.Error
// TODO: Decide if we should wipe partials or not?
return ov
}
ov.Value.Usage.Prompt += iv.Value.Usage.Prompt
ov.Value.Usage.Completion += iv.Value.Usage.Completion
ov.Value.Response = append(ov.Value.Response, mappingFn(iv.Value))
return ov
}, concurrency.ErrorOr[*LLMResponseBundle]{Value: &initialBundle}, true)
wg.Wait()
}()
return
}
func (llmbs *LLMBackendService) Finetune(config config.BackendConfig, input, prediction string) string {
func Finetune(config config.BackendConfig, input, prediction string) string {
if config.Echo {
prediction = input + prediction
}
for _, c := range config.Cutstrings {
llmbs.ftMutex.Lock()
reg, ok := llmbs.cutstrings[c]
mu.Lock()
reg, ok := cutstrings[c]
if !ok {
llmbs.cutstrings[c] = regexp.MustCompile(c)
reg = llmbs.cutstrings[c]
cutstrings[c] = regexp.MustCompile(c)
reg = cutstrings[c]
}
llmbs.ftMutex.Unlock()
mu.Unlock()
prediction = reg.ReplaceAllString(prediction, "")
}