feat(api): allow to pass videos to backends (#3601)

This prepares the API to receive videos as well for video understanding.

It works similarly to images, where the request should be in the form:

{
 "type": "video_url",
 "video_url": { "url": "url or base64 data" }
}

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto 2024-09-19 11:21:59 +02:00 committed by GitHub
parent c6a819e92f
commit fbb9facda4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 47 additions and 27 deletions

View file

@ -640,8 +640,12 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
for _, m := range input.Messages {
images = append(images, m.StringImages...)
}
videos := []string{}
for _, m := range input.Messages {
videos = append(videos, m.StringVideos...)
}
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, ml, *config, o, nil)
if err != nil {
log.Error().Err(err).Msg("model inference failed")
return "", err

View file

@ -27,9 +27,13 @@ func ComputeChoices(
for _, m := range req.Messages {
images = append(images, m.StringImages...)
}
videos := []string{}
for _, m := range req.Messages {
videos = append(videos, m.StringVideos...)
}
// get the model function to call for the result
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, loader, *config, o, tokenCallback)
if err != nil {
return result, backend.TokenUsage{}, err
}

View file

@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
}
// Decode each request's message content
index := 0
imgIndex, vidIndex := 0, 0
for i, m := range input.Messages {
switch content := m.Content.(type) {
case string:
@ -144,20 +144,34 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
dat, _ := json.Marshal(content)
c := []schema.Content{}
json.Unmarshal(dat, &c)
CONTENT:
for _, pp := range c {
if pp.Type == "text" {
switch pp.Type {
case "text":
input.Messages[i].StringContent = pp.Text
} else if pp.Type == "image_url" {
// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
if err == nil {
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
index++
} else {
log.Error().Msgf("Failed encoding image: %s", err)
case "video", "video_url":
// Decode content as base64 either if it's an URL or base64 text
base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding video: %s", err)
continue CONTENT
}
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
vidIndex++
case "image_url", "image":
// Decode content as base64 either if it's an URL or base64 text
base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
if err != nil {
log.Error().Msgf("Failed encoding image: %s", err)
continue CONTENT
}
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
// set a placeholder for each image
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
imgIndex++
}
}
}