mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-28 14:35:00 +00:00
feat(api): allow to pass videos to backends (#3601)
This prepares the API to also receive videos, for video understanding. It works the same way as for images: the message content entry should have the form { "type": "video_url", "video_url": { "url": "url or base64 data" } }. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
parent
c6a819e92f
commit
fbb9facda4
8 changed files with 47 additions and 27 deletions
|
@ -31,7 +31,7 @@ type TokenUsage struct {
|
|||
Completion int
|
||||
}
|
||||
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
|
||||
modelFile := c.Model
|
||||
threads := c.Threads
|
||||
if *threads == 0 && o.Threads != 0 {
|
||||
|
@ -101,6 +101,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
|||
opts.Messages = protoMessages
|
||||
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
|
||||
opts.Images = images
|
||||
opts.Videos = videos
|
||||
|
||||
tokenUsage := TokenUsage{}
|
||||
|
||||
|
|
|
@ -640,8 +640,12 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
|
|||
for _, m := range input.Messages {
|
||||
images = append(images, m.StringImages...)
|
||||
}
|
||||
videos := []string{}
|
||||
for _, m := range input.Messages {
|
||||
videos = append(videos, m.StringVideos...)
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
|
||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, ml, *config, o, nil)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("model inference failed")
|
||||
return "", err
|
||||
|
|
|
@ -27,9 +27,13 @@ func ComputeChoices(
|
|||
for _, m := range req.Messages {
|
||||
images = append(images, m.StringImages...)
|
||||
}
|
||||
videos := []string{}
|
||||
for _, m := range req.Messages {
|
||||
videos = append(videos, m.StringVideos...)
|
||||
}
|
||||
|
||||
// get the model function to call for the result
|
||||
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
|
||||
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, loader, *config, o, tokenCallback)
|
||||
if err != nil {
|
||||
return result, backend.TokenUsage{}, err
|
||||
}
|
||||
|
|
|
@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
|||
}
|
||||
|
||||
// Decode each request's message content
|
||||
index := 0
|
||||
imgIndex, vidIndex := 0, 0
|
||||
for i, m := range input.Messages {
|
||||
switch content := m.Content.(type) {
|
||||
case string:
|
||||
|
@ -144,20 +144,34 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
|||
dat, _ := json.Marshal(content)
|
||||
c := []schema.Content{}
|
||||
json.Unmarshal(dat, &c)
|
||||
CONTENT:
|
||||
for _, pp := range c {
|
||||
if pp.Type == "text" {
|
||||
switch pp.Type {
|
||||
case "text":
|
||||
input.Messages[i].StringContent = pp.Text
|
||||
} else if pp.Type == "image_url" {
|
||||
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
|
||||
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
|
||||
if err == nil {
|
||||
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
|
||||
index++
|
||||
} else {
|
||||
log.Error().Msgf("Failed encoding image: %s", err)
|
||||
case "video", "video_url":
|
||||
// Decode content as base64 either if it's an URL or base64 text
|
||||
base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
|
||||
if err != nil {
|
||||
log.Error().Msgf("Failed encoding video: %s", err)
|
||||
continue CONTENT
|
||||
}
|
||||
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each video
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
|
||||
vidIndex++
|
||||
case "image_url", "image":
|
||||
// Decode content as base64 either if it's an URL or base64 text
|
||||
|
||||
base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
|
||||
if err != nil {
|
||||
log.Error().Msgf("Failed encoding image: %s", err)
|
||||
continue CONTENT
|
||||
}
|
||||
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
|
||||
imgIndex++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,6 +58,7 @@ type Content struct {
|
|||
Type string `json:"type" yaml:"type"`
|
||||
Text string `json:"text" yaml:"text"`
|
||||
ImageURL ContentURL `json:"image_url" yaml:"image_url"`
|
||||
VideoURL ContentURL `json:"video_url" yaml:"video_url"`
|
||||
}
|
||||
|
||||
type ContentURL struct {
|
||||
|
@ -76,6 +77,7 @@ type Message struct {
|
|||
|
||||
StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"`
|
||||
StringImages []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
|
||||
StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
|
||||
|
||||
// A result of a function call
|
||||
FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue