feat: elevenlabs sound-generation api (#3355)

* initial version of elevenlabs compatible soundgeneration api and cli command

Signed-off-by: Dave Lee <dave@gray101.com>

* minor cleanup

Signed-off-by: Dave Lee <dave@gray101.com>

* restore TTS, add test

Signed-off-by: Dave Lee <dave@gray101.com>

* remove stray s

Signed-off-by: Dave Lee <dave@gray101.com>

* fix

Signed-off-by: Dave Lee <dave@gray101.com>

---------

Signed-off-by: Dave Lee <dave@gray101.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Dave 2024-08-23 20:20:28 -04:00 committed by GitHub
parent 84d6e5a987
commit 81ae92f017
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 450 additions and 37 deletions

View file

@ -87,7 +87,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
case string:
protoMessages[i].Content = ct
default:
return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
}
}
}

View file

@ -0,0 +1,74 @@
package backend
import (
"context"
"fmt"
"os"
"path/filepath"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/utils"
)
func SoundGeneration(
backend string,
modelFile string,
text string,
duration *float32,
temperature *float32,
doSample *bool,
sourceFile *string,
sourceDivisor *int32,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
if backend == "" {
return "", nil, fmt.Errorf("backend is a required parameter")
}
grpcOpts := gRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(backend),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
soundGenModel, err := loader.BackendLoader(opts...)
if err != nil {
return "", nil, err
}
if soundGenModel == nil {
return "", nil, fmt.Errorf("could not load sound generation model")
}
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
filePath := filepath.Join(appConfig.AudioDir, fileName)
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
Text: text,
Model: modelFile,
Dst: filePath,
Sample: doSample,
Duration: duration,
Temperature: temperature,
Src: sourceFile,
SrcDivisor: sourceDivisor,
})
// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)
}
return filePath, res, err
}

View file

@ -9,31 +9,15 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/utils"
)
func generateUniqueFileName(dir, baseName, ext string) string {
counter := 1
fileName := baseName + ext
for {
filePath := filepath.Join(dir, fileName)
_, err := os.Stat(filePath)
if os.IsNotExist(err) {
return fileName
}
counter++
fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
}
}
func ModelTTS(
backend,
text,
modelFile,
voice ,
voice,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
@ -66,7 +50,7 @@ func ModelTTS(
return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
}
fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(appConfig.AudioDir, fileName)
// If the model file is not empty, we pass it joined with the model path
@ -88,10 +72,10 @@ func ModelTTS(
}
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})