TTS API improvements (#2308)

* update doc on COQUI_LANGUAGE env variable Signed-off-by: blob42 <contact@blob42.xyz> * return errors from tts gRPC backend Signed-off-by: blob42 <contact@blob42.xyz> * handle speaker_id and language in coqui TTS backend Signed-off-by: blob42 <contact@blob42.xyz> * TTS endpoint: add optional language paramter Signed-off-by: blob42 <contact@blob42.xyz> * tts fix: empty language string breaks non-multilingual models Signed-off-by: blob42 <contact@blob42.xyz> * allow tts param definition in config file - consolidate TTS options under `tts` config entry Signed-off-by: blob42 <contact@blob42.xyz> * tts: update doc Signed-off-by: blob42 <contact@blob42.xyz> --------- Signed-off-by: blob42 <contact@blob42.xyz> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-05-28 06:25:00 +00:00 · 2024-06-01 20:26:27 +02:00 · 2024-06-01 20:26:27 +02:00 · b99182c8d4
commit b99182c8d4
parent 95c65d67f5
10 changed files with 166 additions and 78 deletions
--- a/core/http/endpoints/elevenlabs/tts.go
+++ b/core/http/endpoints/elevenlabs/tts.go
@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)

-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@ -12,10 +12,13 @@ import (
 )

 // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
-// @Summary Generates audio from the input text.
-// @Param request body schema.TTSRequest true "query params"
-// @Success 200 {string} binary	 "Response"
-// @Router /v1/audio/speech [post]
+//	@Summary	Generates audio from the input text.
+//  @Accept json
+//  @Produce audio/x-wav
+//	@Param		request	body		schema.TTSRequest	true	"query params"
+//	@Success	200		{string}	binary				"generated audio/wav file"
+//	@Router		/v1/audio/speech [post]
+//	@Router		/tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {

@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		)

 		if err != nil {
+			log.Err(err)
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 			cfg.Backend = input.Backend
 		}

-		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
+		if input.Language != "" {
+			cfg.Language = input.Language
+		}
+
+		if input.Voice != "" {
+			cfg.Voice = input.Voice
+		}
+
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}