TTS API improvements (#2308)

* update doc on COQUI_LANGUAGE env variable

Signed-off-by: blob42 <contact@blob42.xyz>

* return errors from tts gRPC backend

Signed-off-by: blob42 <contact@blob42.xyz>

* handle speaker_id and language in coqui TTS backend

Signed-off-by: blob42 <contact@blob42.xyz>

* TTS endpoint: add optional language paramter

Signed-off-by: blob42 <contact@blob42.xyz>

* tts fix: empty language string breaks non-multilingual models

Signed-off-by: blob42 <contact@blob42.xyz>

* allow tts param definition in config file

- consolidate TTS options under `tts` config entry

Signed-off-by: blob42 <contact@blob42.xyz>

* tts: update doc

Signed-off-by: blob42 <contact@blob42.xyz>

---------

Signed-off-by: blob42 <contact@blob42.xyz>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
Chakib Benziane 2024-06-01 20:26:27 +02:00 committed by GitHub
parent 95c65d67f5
commit b99182c8d4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 166 additions and 78 deletions

View file

@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
}
}
func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
func ModelTTS(
backend,
text,
modelFile,
voice ,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})
// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)
}
return filePath, res, err
}

View file

@ -20,6 +20,7 @@ type TTSCMD struct {
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
options := config.BackendConfig{}
options.SetDefaults()
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
if err != nil {
return err
}

View file

@ -15,6 +15,15 @@ const (
RAND_SEED = -1
)
type TTSConfig struct {
// Voice wav path or id
Voice string `yaml:"voice"`
// Vall-e-x
VallE VallE `yaml:"vall-e"`
}
type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
@ -49,8 +58,8 @@ type BackendConfig struct {
// GRPC Options
GRPC GRPC `yaml:"grpc"`
// Vall-e-x
VallE VallE `yaml:"vall-e"`
// TTS specifics
TTSConfig `yaml:"tts"`
// CUDA
// Explicitly enable CUDA or not (some backends might need it)

View file

@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
}
log.Debug().Msgf("Request for model: %s", modelFile)
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
if err != nil {
return err
}

View file

@ -12,10 +12,13 @@ import (
)
// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
// @Summary Generates audio from the input text.
// @Accept json
// @Produce audio/x-wav
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "generated audio/wav file"
// @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
)
if err != nil {
log.Err(err)
modelFile = input.Model
log.Warn().Msgf("Model not found in context: %s", input.Model)
} else {
@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
cfg.Backend = input.Backend
}
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
if input.Language != "" {
cfg.Language = input.Language
}
if input.Voice != "" {
cfg.Voice = input.Voice
}
filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
if err != nil {
return err
}

View file

@ -1,59 +1,61 @@
package schema
import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)
type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}
type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}
type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}
type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys"`
}
type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
}
type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}
type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
package schema
import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)
type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}
type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}
// @Description TTS request body
type TTSRequest struct {
Model string `json:"model" yaml:"model"` // model name or full path
Input string `json:"input" yaml:"input"` // text input
Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id
Backend string `json:"backend" yaml:"backend"`
Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
}
type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys"`
}
type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Keys [][]float32 `json:"keys" yaml:"keys"`
}
type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}
type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`
Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}
type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}