mirror of
https://github.com/mudler/LocalAI.git
synced 2025-05-28 06:25:00 +00:00
feat: elevenlabs sound-generation
api (#3355)
* initial version of elevenlabs compatible soundgeneration api and cli command Signed-off-by: Dave Lee <dave@gray101.com> * minor cleanup Signed-off-by: Dave Lee <dave@gray101.com> * restore TTS, add test Signed-off-by: Dave Lee <dave@gray101.com> * remove stray s Signed-off-by: Dave Lee <dave@gray101.com> * fix Signed-off-by: Dave Lee <dave@gray101.com> --------- Signed-off-by: Dave Lee <dave@gray101.com> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
parent
84d6e5a987
commit
81ae92f017
20 changed files with 450 additions and 37 deletions
|
@ -87,7 +87,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
|
|||
case string:
|
||||
protoMessages[i].Content = ct
|
||||
default:
|
||||
return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
|
||||
return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
74
core/backend/soundgeneration.go
Normal file
74
core/backend/soundgeneration.go
Normal file
|
@ -0,0 +1,74 @@
|
|||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
func SoundGeneration(
|
||||
backend string,
|
||||
modelFile string,
|
||||
text string,
|
||||
duration *float32,
|
||||
temperature *float32,
|
||||
doSample *bool,
|
||||
sourceFile *string,
|
||||
sourceDivisor *int32,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
backendConfig config.BackendConfig,
|
||||
) (string, *proto.Result, error) {
|
||||
if backend == "" {
|
||||
return "", nil, fmt.Errorf("backend is a required parameter")
|
||||
}
|
||||
|
||||
grpcOpts := gRPCModelOpts(backendConfig)
|
||||
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
|
||||
model.WithBackendString(backend),
|
||||
model.WithModel(modelFile),
|
||||
model.WithContext(appConfig.Context),
|
||||
model.WithAssetDir(appConfig.AssetsDestination),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
})
|
||||
|
||||
soundGenModel, err := loader.BackendLoader(opts...)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
if soundGenModel == nil {
|
||||
return "", nil, fmt.Errorf("could not load sound generation model")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
|
||||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
|
||||
filePath := filepath.Join(appConfig.AudioDir, fileName)
|
||||
|
||||
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
|
||||
Text: text,
|
||||
Model: modelFile,
|
||||
Dst: filePath,
|
||||
Sample: doSample,
|
||||
Duration: duration,
|
||||
Temperature: temperature,
|
||||
Src: sourceFile,
|
||||
SrcDivisor: sourceDivisor,
|
||||
})
|
||||
|
||||
// return RPC error if any
|
||||
if !res.Success {
|
||||
return "", nil, fmt.Errorf(res.Message)
|
||||
}
|
||||
|
||||
return filePath, res, err
|
||||
}
|
|
@ -9,31 +9,15 @@ import (
|
|||
"github.com/mudler/LocalAI/core/config"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
)
|
||||
|
||||
// generateUniqueFileName returns a file name built from baseName and ext that
// is not currently present in dir. The first candidate is baseName+ext; if a
// file with that name exists, the candidates "baseName_2ext", "baseName_3ext",
// ... are tried in order.
//
// Only the file name is returned, not the full path; nothing is created on
// disk, so the name is not reserved for the caller.
func generateUniqueFileName(dir, baseName, ext string) string {
	fileName := baseName + ext

	for counter := 2; ; counter++ {
		// Any Stat failure ends the search. A "does not exist" error means the
		// name is free. Any other error (invalid path, a path component that
		// is not a directory, permission denied, ...) would be reported again
		// for every later candidate, so retrying would loop forever; return
		// the candidate and let the caller's file creation report the real
		// problem.
		if _, err := os.Stat(filepath.Join(dir, fileName)); err != nil {
			return fileName
		}

		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
	}
}
|
||||
|
||||
func ModelTTS(
|
||||
backend,
|
||||
text,
|
||||
modelFile,
|
||||
voice ,
|
||||
voice,
|
||||
language string,
|
||||
loader *model.ModelLoader,
|
||||
appConfig *config.ApplicationConfig,
|
||||
|
@ -66,7 +50,7 @@ func ModelTTS(
|
|||
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
|
||||
}
|
||||
|
||||
fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
|
||||
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
|
||||
filePath := filepath.Join(appConfig.AudioDir, fileName)
|
||||
|
||||
// If the model file is not empty, we pass it joined with the model path
|
||||
|
@ -88,10 +72,10 @@ func ModelTTS(
|
|||
}
|
||||
|
||||
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
|
||||
Text: text,
|
||||
Model: modelPath,
|
||||
Voice: voice,
|
||||
Dst: filePath,
|
||||
Text: text,
|
||||
Model: modelPath,
|
||||
Voice: voice,
|
||||
Dst: filePath,
|
||||
Language: &language,
|
||||
})
|
||||
|
||||
|
|
|
@ -8,12 +8,13 @@ import (
|
|||
var CLI struct {
|
||||
cliContext.Context `embed:""`
|
||||
|
||||
Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
|
||||
Federated FederatedCLI `cmd:"" help:"Run LocalAI in federated mode"`
|
||||
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
|
||||
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
|
||||
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
|
||||
Worker worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
|
||||
Util UtilCMD `cmd:"" help:"Utility commands"`
|
||||
Explorer ExplorerCMD `cmd:"" help:"Run p2p explorer"`
|
||||
Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
|
||||
Federated FederatedCLI `cmd:"" help:"Run LocalAI in federated mode"`
|
||||
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
|
||||
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
|
||||
SoundGeneration SoundGenerationCMD `cmd:"" help:"Generates audio files from text or audio"`
|
||||
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
|
||||
Worker worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
|
||||
Util UtilCMD `cmd:"" help:"Utility commands"`
|
||||
Explorer ExplorerCMD `cmd:"" help:"Run p2p explorer"`
|
||||
}
|
||||
|
|
110
core/cli/soundgeneration.go
Normal file
110
core/cli/soundgeneration.go
Normal file
|
@ -0,0 +1,110 @@
|
|||
package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
cliContext "github.com/mudler/LocalAI/core/cli/context"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// SoundGenerationCMD holds the arguments and flags of the sound-generation
// CLI command. The struct tags describe how each field is exposed on the
// command line and, where given, which environment variables feed it.
type SoundGenerationCMD struct {
	// Text is the positional prompt; Run joins the words with single spaces.
	Text []string `arg:""`

	// Backend and Model select what is loaded to produce the audio.
	Backend string `short:"b" required:"" help:"Backend to run the SoundGeneration model"`
	Model   string `short:"m" required:"" help:"Model name to run the SoundGeneration"`

	// Duration, Temperature and InputFileSampleDivisor are kept as strings so
	// that "not given" (empty) can be told apart from a numeric value; Run
	// converts them with parseToFloat32Ptr / parseToInt32Ptr.
	Duration               string `short:"d" help:"If specified, the length of audio to generate in seconds"`
	Temperature            string `short:"t" help:"If specified, the temperature of the generation"`
	InputFile              string `short:"i" help:"If specified, the input file to condition generation upon"`
	InputFileSampleDivisor string `short:"f" help:"If InputFile and this divisor is specified, the first portion of the sample file will be used"`
	DoSample               bool   `short:"s" default:"true" help:"Enables sampling from the model. Better quality at the cost of speed. Defaults to enabled."`

	// OutputFile is the final destination of the generated wav file; when it
	// is empty the file stays in BackendAssetsPath.
	OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`

	// Storage and backend wiring shared with the other CLI commands.
	ModelsPath           string   `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
	BackendAssetsPath    string   `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
	ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
}
|
||||
|
||||
// parseToFloat32Ptr converts input to a float32 and returns a pointer to the
// value. It returns nil whenever input cannot be parsed as a 32-bit float,
// which includes the empty string and values outside the float32 range.
func parseToFloat32Ptr(input string) *float32 {
	parsed, parseErr := strconv.ParseFloat(input, 32)
	if parseErr == nil {
		value := float32(parsed)
		return &value
	}
	return nil
}
|
||||
|
||||
// parseToInt32Ptr converts a base-10 input to an int32 and returns a pointer
// to the value. It returns nil whenever input cannot be parsed as a 32-bit
// integer, which includes the empty string and values outside the int32 range.
func parseToInt32Ptr(input string) *int32 {
	parsed, parseErr := strconv.ParseInt(input, 10, 32)
	if parseErr == nil {
		value := int32(parsed)
		return &value
	}
	return nil
}
|
||||
|
||||
func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
|
||||
outputFile := t.OutputFile
|
||||
outputDir := t.BackendAssetsPath
|
||||
if outputFile != "" {
|
||||
outputDir = filepath.Dir(outputFile)
|
||||
}
|
||||
|
||||
text := strings.Join(t.Text, " ")
|
||||
|
||||
externalBackends := make(map[string]string)
|
||||
// split ":" to get backend name and the uri
|
||||
for _, v := range t.ExternalGRPCBackends {
|
||||
backend := v[:strings.IndexByte(v, ':')]
|
||||
uri := v[strings.IndexByte(v, ':')+1:]
|
||||
externalBackends[backend] = uri
|
||||
fmt.Printf("TMP externalBackends[%q]=%q\n\n", backend, uri)
|
||||
}
|
||||
|
||||
opts := &config.ApplicationConfig{
|
||||
ModelPath: t.ModelsPath,
|
||||
Context: context.Background(),
|
||||
AudioDir: outputDir,
|
||||
AssetsDestination: t.BackendAssetsPath,
|
||||
ExternalGRPCBackends: externalBackends,
|
||||
}
|
||||
ml := model.NewModelLoader(opts.ModelPath)
|
||||
|
||||
defer func() {
|
||||
err := ml.StopAllGRPC()
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("unable to stop all grpc processes")
|
||||
}
|
||||
}()
|
||||
|
||||
options := config.BackendConfig{}
|
||||
options.SetDefaults()
|
||||
|
||||
var inputFile *string
|
||||
if t.InputFile != "" {
|
||||
inputFile = &t.InputFile
|
||||
}
|
||||
|
||||
filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
|
||||
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
|
||||
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if outputFile != "" {
|
||||
if err := os.Rename(filePath, outputFile); err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Printf("Generate file %s\n", outputFile)
|
||||
} else {
|
||||
fmt.Printf("Generate file %s\n", filePath)
|
||||
}
|
||||
return nil
|
||||
}
|
65
core/http/endpoints/elevenlabs/soundgeneration.go
Normal file
65
core/http/endpoints/elevenlabs/soundgeneration.go
Normal file
|
@ -0,0 +1,65 @@
|
|||
package elevenlabs
|
||||
|
||||
import (
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// SoundGenerationEndpoint is the ElevenLabs SoundGeneration endpoint https://elevenlabs.io/docs/api-reference/sound-generation
|
||||
// @Summary Generates audio from the input text.
|
||||
// @Param request body schema.ElevenLabsSoundGenerationRequest true "query params"
|
||||
// @Success 200 {string} binary "Response"
|
||||
// @Router /v1/sound-generation [post]
|
||||
func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
input := new(schema.ElevenLabsSoundGenerationRequest)
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false)
|
||||
if err != nil {
|
||||
modelFile = input.ModelID
|
||||
log.Warn().Str("ModelID", input.ModelID).Msg("Model not found in context")
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
if err != nil {
|
||||
modelFile = input.ModelID
|
||||
log.Warn().Str("Request ModelID", input.ModelID).Err(err).Msg("error during LoadBackendConfigFileByName, using request ModelID")
|
||||
} else {
|
||||
if input.ModelID != "" {
|
||||
modelFile = input.ModelID
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
}
|
||||
log.Debug().Str("modelFile", "modelFile").Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend")
|
||||
|
||||
if input.Duration != nil {
|
||||
log.Debug().Float32("duration", *input.Duration).Msg("duration set")
|
||||
}
|
||||
if input.Temperature != nil {
|
||||
log.Debug().Float32("temperature", *input.Temperature).Msg("temperature set")
|
||||
}
|
||||
|
||||
// TODO: Support uploading files?
|
||||
filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.Download(filePath)
|
||||
|
||||
}
|
||||
}
|
|
@ -16,4 +16,6 @@ func RegisterElevenLabsRoutes(app *fiber.App,
|
|||
// Elevenlabs
|
||||
app.Post("/v1/text-to-speech/:voice-id", auth, elevenlabs.TTSEndpoint(cl, ml, appConfig))
|
||||
|
||||
app.Post("/v1/sound-generation", auth, elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
|
||||
|
||||
}
|
||||
|
|
|
@ -4,3 +4,11 @@ type ElevenLabsTTSRequest struct {
|
|||
Text string `json:"text" yaml:"text"`
|
||||
ModelID string `json:"model_id" yaml:"model_id"`
|
||||
}
|
||||
|
||||
// ElevenLabsSoundGenerationRequest is the request body accepted by the
// ElevenLabs-compatible sound-generation endpoint. It can be decoded from
// JSON or YAML using the keys given in the field tags.
type ElevenLabsSoundGenerationRequest struct {
	// Text is the prompt describing the audio to generate.
	Text string `json:"text" yaml:"text"`
	// ModelID names the model to use.
	ModelID string `json:"model_id" yaml:"model_id"`

	// The remaining fields are optional; a nil pointer means the key was not
	// present in the request.

	// Duration is read from the "duration_seconds" key.
	Duration *float32 `json:"duration_seconds,omitempty" yaml:"duration_seconds,omitempty"`
	// Temperature is read from the "prompt_influence" key.
	Temperature *float32 `json:"prompt_influence,omitempty" yaml:"prompt_influence,omitempty"`
	// DoSample is read from the "do_sample" key.
	DoSample *bool `json:"do_sample,omitempty" yaml:"do_sample,omitempty"`
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue