From fd68bf708402b3e7206f751e0254ff07df563434 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@users.noreply.github.com>
Date: Sun, 11 Feb 2024 11:20:00 +0100
Subject: [PATCH] fix(vall-e-x): Fix voice cloning (#1696)

---
 backend/python/vall-e-x/ttsvalle.py         | 4 ++++
 docs/content/docs/features/text-to-audio.md | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/backend/python/vall-e-x/ttsvalle.py b/backend/python/vall-e-x/ttsvalle.py
index d7c5d700..fc9d93bd 100644
--- a/backend/python/vall-e-x/ttsvalle.py
+++ b/backend/python/vall-e-x/ttsvalle.py
@@ -55,6 +55,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             print("Preparing models, please wait", file=sys.stderr)
             # download and load all models
             preload_models()
+            self.clonedVoice = False
             # Assume directory from request.ModelFile.
             # Only if request.LoraAdapter it's not an absolute path
             if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
@@ -65,6 +66,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             if request.AudioPath != "":
                 print("Generating model", file=sys.stderr)
                 make_prompt(name=model_name, audio_prompt_path=request.AudioPath)
+                self.clonedVoice = True
                 ### Use given transcript
                 ##make_prompt(name=model_name, audio_prompt_path="paimon_prompt.wav",
                 ##                transcript="Just, what was that? Paimon thought we were gonna get eaten.")
@@ -91,6 +93,8 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         try:
             audio_array = None
             if model != "":
+                if self.clonedVoice:
+                    model = os.path.basename(request.model)
                 audio_array = generate_audio(request.text, prompt=model)
             else:
                 audio_array = generate_audio(request.text)
diff --git a/docs/content/docs/features/text-to-audio.md b/docs/content/docs/features/text-to-audio.md
index 68dfbaad..57b783ee 100644
--- a/docs/content/docs/features/text-to-audio.md
+++ b/docs/content/docs/features/text-to-audio.md
@@ -144,15 +144,15 @@ parameters:
   model: "cloned-voice"
 vall-e:
   # The path to the audio file to be cloned
-  # relative to the models directory 
-  audio_path: "path-to-wav-source.wav"
+  # relative to the models directory
+  # (maximum length of the audio sample: 15 seconds)
+  audio_path: "audio-sample.wav"
 ```
 
 Then you can specify the model name in the requests:
 
 ```
 curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{         
-     "backend": "vall-e-x",
      "model": "cloned-voice",
      "input":"Hello!"
    }' | aplay