feat: elevenlabs sound-generation api (#3355)

* initial version of elevenlabs compatible soundgeneration api and cli command Signed-off-by: Dave Lee <dave@gray101.com> * minor cleanup Signed-off-by: Dave Lee <dave@gray101.com> * restore TTS, add test Signed-off-by: Dave Lee <dave@gray101.com> * remove stray s Signed-off-by: Dave Lee <dave@gray101.com> * fix Signed-off-by: Dave Lee <dave@gray101.com> --------- Signed-off-by: Dave Lee <dave@gray101.com> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-20 10:35:01 +00:00 · 2024-08-23 20:20:28 -04:00 · 2024-08-23 20:20:28 -04:00 · 81ae92f017
commit 81ae92f017
parent 84d6e5a987
20 changed files with 450 additions and 37 deletions
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@ -87,7 +87,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 			case string:
 				protoMessages[i].Content = ct
 			default:
-				return nil, fmt.Errorf("Unsupported type for schema.Message.Content for inference: %T", ct)
+				return nil, fmt.Errorf("unsupported type for schema.Message.Content for inference: %T", ct)
 			}
 		}
 	}
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@ -0,0 +1,74 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/utils"
+)
+
+func SoundGeneration(
+	backend string,
+	modelFile string,
+	text string,
+	duration *float32,
+	temperature *float32,
+	doSample *bool,
+	sourceFile *string,
+	sourceDivisor *int32,
+	loader *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	backendConfig config.BackendConfig,
+) (string, *proto.Result, error) {
+	if backend == "" {
+		return "", nil, fmt.Errorf("backend is a required parameter")
+	}
+
+	grpcOpts := gRPCModelOpts(backendConfig)
+	opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
+		model.WithBackendString(backend),
+		model.WithModel(modelFile),
+		model.WithContext(appConfig.Context),
+		model.WithAssetDir(appConfig.AssetsDestination),
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
+	})
+
+	soundGenModel, err := loader.BackendLoader(opts...)
+	if err != nil {
+		return "", nil, err
+	}
+
+	if soundGenModel == nil {
+		return "", nil, fmt.Errorf("could not load sound generation model")
+	}
+
+	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
+		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
+	}
+
+	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
+	filePath := filepath.Join(appConfig.AudioDir, fileName)
+
+	res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
+		Text:        text,
+		Model:       modelFile,
+		Dst:         filePath,
+		Sample:      doSample,
+		Duration:    duration,
+		Temperature: temperature,
+		Src:         sourceFile,
+		SrcDivisor:  sourceDivisor,
+	})
+
+	// return RPC error if any
+	if !res.Success {
+		return "", nil, fmt.Errorf(res.Message)
+	}
+
+	return filePath, res, err
+}
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@ -9,31 +9,15 @@ import (
 	"github.com/mudler/LocalAI/core/config"

 	"github.com/mudler/LocalAI/pkg/grpc/proto"
-	model "github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 )

-func generateUniqueFileName(dir, baseName, ext string) string {
-	counter := 1
-	fileName := baseName + ext
-
-	for {
-		filePath := filepath.Join(dir, fileName)
-		_, err := os.Stat(filePath)
-		if os.IsNotExist(err) {
-			return fileName
-		}
-
-		counter++
-		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
-	}
-}
-
 func ModelTTS(
 	backend,
 	text,
 	modelFile,
-	voice ,
+	voice,
 	language string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
@ -66,7 +50,7 @@ func ModelTTS(
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}

-	fileName := generateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
+	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
 	filePath := filepath.Join(appConfig.AudioDir, fileName)

 	// If the model file is not empty, we pass it joined with the model path
@ -88,10 +72,10 @@ func ModelTTS(
 	}

 	res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
-		Text:  text,
-		Model: modelPath,
-		Voice: voice,
-		Dst:   filePath,
+		Text:     text,
+		Model:    modelPath,
+		Voice:    voice,
+		Dst:      filePath,
 		Language: &language,
 	})