diff --git a/backend/python/kokoro/Makefile b/backend/python/kokoro/Makefile
new file mode 100644
index 00000000..c0e5169f
--- /dev/null
+++ b/backend/python/kokoro/Makefile
@@ -0,0 +1,20 @@
+.DEFAULT_GOAL := install
+
+.PHONY: install
+install:
+	bash install.sh
+	$(MAKE) protogen
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	bash protogen.sh
+
+.PHONY: clean
+clean: protogen-clean
+	rm -rf venv __pycache__
\ No newline at end of file
diff --git a/backend/python/kokoro/backend.py b/backend/python/kokoro/backend.py
new file mode 100755
index 00000000..5216111d
--- /dev/null
+++ b/backend/python/kokoro/backend.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for Kokoro models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+import time
+import backend_pb2
+import backend_pb2_grpc
+import soundfile as sf
+import grpc
+
+from models import build_model
+from kokoro import generate
+import torch
+
+SAMPLE_RATE = 22050
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS is specified in the environment, use it; otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the backend service.
+
+    This class implements the gRPC methods for the backend service, including Health, LoadModel, and TTS.
+    """
+    def Health(self, request, context):
+        """
+        A gRPC method that returns the health status of the backend service.
+
+        Args:
+            request: A HealthRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object that contains the health status of the backend service.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        A gRPC method that loads a model into memory.
+
+        Args:
+            request: A LoadModelRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object that contains the result of the LoadModel operation.
+        """
+        model_name = request.Model
+        try:
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            self.MODEL = build_model(request.ModelFile, device)
+            options = request.Options
+            # Find the voice in the options; options are a list of strings of the form optname:optvalue
+            VOICE_NAME = None
+            for opt in options:
+                if opt.startswith("voice:"):
+                    VOICE_NAME = opt.split(":")[1]
+                    break
+            if VOICE_NAME is None:
+                return backend_pb2.Result(success=False, message="No voice specified in options")
+            MODELPATH = request.ModelPath
+            self.VOICE_NAME = VOICE_NAME
+            self.VOICEPACK = torch.load(f'{MODELPATH}/{VOICE_NAME}.pt', weights_only=True).to(device)
+            print(f'Loaded voice: {VOICE_NAME}')
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        model_name = request.model
+        if model_name == "":
+            return backend_pb2.Result(success=False, message="request.model is required")
+        try:
+            audio, out_ps = generate(self.MODEL, request.text, self.VOICEPACK, lang=self.VOICE_NAME)
+            print(out_ps)
+            sf.write(request.dst, audio, SAMPLE_RATE)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("[Kokoro] Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("[Kokoro] Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+    print(f"[Kokoro] startup: {args}", file=sys.stderr)
+    serve(args.addr)
diff --git a/backend/python/kokoro/install.sh b/backend/python/kokoro/install.sh
new file mode 100755
index 00000000..36443ef1
--- /dev/null
+++ b/backend/python/kokoro/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+# This is here because the Intel pip index is broken and returns 200 status codes for every package name; it just doesn't return any package links.
+# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
+# We need uv to continue falling through to the default PyPI index to find optimum[openvino] there.
+# The --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index.
+if [ "x${BUILD_PROFILE}" == "xintel" ]; then
+    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+fi
+
+installRequirements
diff --git a/backend/python/kokoro/kokoro.py b/backend/python/kokoro/kokoro.py
new file mode 100644
index 00000000..3a0df7f5
--- /dev/null
+++ b/backend/python/kokoro/kokoro.py
@@ -0,0 +1,166 @@
+# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/kokoro.py
+import phonemizer
+import re
+import torch
+import numpy as np
+
+def split_num(num):
+    num = num.group()
+    if '.' in num:
+        return num
+    elif ':' in num:
+        h, m = [int(n) for n in num.split(':')]
+        if m == 0:
+            return f"{h} o'clock"
+        elif m < 10:
+            return f'{h} oh {m}'
+        return f'{h} {m}'
+    year = int(num[:4])
+    if year < 1100 or year % 1000 < 10:
+        return num
+    left, right = num[:2], int(num[2:4])
+    s = 's' if num.endswith('s') else ''
+    if 100 <= year % 1000 <= 999:
+        if right == 0:
+            return f'{left} hundred{s}'
+        elif right < 10:
+            return f'{left} oh {right}{s}'
+    return f'{left} {right}{s}'
+
+def flip_money(m):
+    m = m.group()
+    bill = 'dollar' if m[0] == '$' else 'pound'
+    if m[-1].isalpha():
+        return f'{m[1:]} {bill}s'
+    elif '.' not in m:
+        s = '' if m[1:] == '1' else 's'
+        return f'{m[1:]} {bill}{s}'
+    b, c = m[1:].split('.')
+    s = '' if b == '1' else 's'
+    c = int(c.ljust(2, '0'))
+    coins = f"cent{'' if c == 1 else 's'}" if m[0] == '$' else ('penny' if c == 1 else 'pence')
+    return f'{b} {bill}{s} and {c} {coins}'
+
+def point_num(num):
+    a, b = num.group().split('.')
+    return ' point '.join([a, ' '.join(b)])
+
+def normalize_text(text):
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    text = text.replace('«', chr(8220)).replace('»', chr(8221))
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    text = text.replace('(', '«').replace(')', '»')
+    for a, b in zip('、。!,:;?', ',.!,:;?'):
+        text = text.replace(a, b+' ')
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r'  +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
+    text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
+    text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
+    text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
+    text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
+    text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
+    text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
+    text = re.sub(r'(?<=\d),(?=\d)', '', text)
+    text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
+    text = re.sub(r'\d*\.\d+', point_num, text)
+    text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
+    text = re.sub(r'(?<=\d)S', ' S', text)
+    text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
+    text = re.sub(r"(?<=X')S\b", 's', text)
+    text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
+    text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
+    return text.strip()
+
+def get_vocab():
+    _pad = "$"
+    _punctuation = ';:,.!?¡¿—…"«»“” '
+    _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+    _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
+    dicts = {}
+    for i in range(len((symbols := [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)))):
+        dicts[symbols[i]] = i
+    return dicts
+
+VOCAB = get_vocab()
+def tokenize(ps):
+    return [i for i in map(VOCAB.get, ps) if i is not None]
+
+phonemizers = dict(
+    a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
+    b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
+)
+def phonemize(text, lang, norm=True):
+    if norm:
+        text = normalize_text(text)
+    ps = phonemizers[lang].phonemize([text])
+    ps = ps[0] if ps else ''
+    # https://en.wiktionary.org/wiki/kokoro#English
+    ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
+    ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
+    ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
+    ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
+    if lang == 'a':
+        ps = re.sub(r'(?<=nˈaɪn)ti(?!ˈ)', 'di', ps)
+    ps = ''.join(filter(lambda p: p in VOCAB, ps))
+    return ps.strip()
+
+def length_to_mask(lengths):
+    mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+    mask = torch.gt(mask+1, lengths.unsqueeze(1))
+    return mask
+
+@torch.no_grad()
+def forward(model, tokens, ref_s, speed):
+    device = ref_s.device
+    tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
+    input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+    text_mask = length_to_mask(input_lengths).to(device)
+    bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
+    d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
+    s = ref_s[:, 128:]
+    d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
+    x, _ = model.predictor.lstm(d)
+    duration = model.predictor.duration_proj(x)
+    duration = torch.sigmoid(duration).sum(axis=-1) / speed
+    pred_dur = torch.round(duration).clamp(min=1).long()
+    pred_aln_trg = torch.zeros(input_lengths, pred_dur.sum().item())
+    c_frame = 0
+    for i in range(pred_aln_trg.size(0)):
+        pred_aln_trg[i, c_frame:c_frame + pred_dur[0,i].item()] = 1
+        c_frame += pred_dur[0,i].item()
+    en = d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device)
+    F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
+    t_en = model.text_encoder(tokens, input_lengths, text_mask)
+    asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
+    return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
+
+def generate(model, text, voicepack, lang='a', speed=1, ps=None):
+    ps = ps or phonemize(text, lang)
+    tokens = tokenize(ps)
+    if not tokens:
+        return None
+    elif len(tokens) > 510:
+        tokens = tokens[:510]
+        print('Truncated to 510 tokens')
+    ref_s = voicepack[len(tokens)]
+    out = forward(model, tokens, ref_s, speed)
+    ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
+    return out, ps
+
+def generate_full(model, text, voicepack, lang='a', speed=1, ps=None):
+    ps = ps or phonemize(text, lang)
+    tokens = tokenize(ps)
+    if not tokens:
+        return None
+    outs = []
+    loop_count = len(tokens)//510 + (1 if len(tokens) % 510 != 0 else 0)
+    for i in range(loop_count):
+        ref_s = voicepack[len(tokens[i*510:(i+1)*510])]
+        out = forward(model, tokens[i*510:(i+1)*510], ref_s, speed)
+        outs.append(out)
+    outs = np.concatenate(outs)
+    ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
+    return outs, ps
\ No newline at end of file
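`generate()` above caps input at 510 tokens (the 512-token PL-BERT context minus the two boundary zeros added in `forward()`), while `generate_full()` instead synthesizes 510-token chunks and concatenates the audio; in both cases `voicepack[len(tokens)]` selects a reference style vector indexed by sequence length. A standalone usage sketch mirroring what backend.py does, with assumed checkpoint and voicepack paths:

```python
# Offline sketch of the generate() entry point; paths and voice name are assumptions.
import torch
import soundfile as sf
from models import build_model
from kokoro import generate

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = build_model("/models/kokoro-v0_19.pth", device)
voicepack = torch.load("/models/voices/af.pt", weights_only=True).to(device)
# lang selects the phonemizer: 'a' = en-US, 'b' = en-GB
audio, phonemes = generate(model, "Hello from Kokoro.", voicepack, lang="a")
sf.write("out.wav", audio, 22050)  # backend.py writes with SAMPLE_RATE = 22050
```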
diff --git a/backend/python/kokoro/models.py b/backend/python/kokoro/models.py
new file mode 100644
index 00000000..cf358d9e
--- /dev/null
+++ b/backend/python/kokoro/models.py
@@ -0,0 +1,373 @@
+# https://github.com/yl4579/StyleTTS2/blob/main/models.py
+# https://huggingface.co/hexgrad/Kokoro-82M/blob/main/models.py
+from istftnet import AdaIN1d, Decoder
+from munch import Munch
+from pathlib import Path
+from plbert import load_plbert
+from torch.nn.utils import weight_norm, spectral_norm
+import json
+import numpy as np
+import os
+import os.path as osp
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
+        super(LinearNorm, self).__init__()
+        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
+
+        torch.nn.init.xavier_uniform_(
+            self.linear_layer.weight,
+            gain=torch.nn.init.calculate_gain(w_init_gain))
+
+    def forward(self, x):
+        return self.linear_layer(x)
+
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+
+class TextEncoder(nn.Module):
+    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
+        super().__init__()
+        self.embedding = nn.Embedding(n_symbols, channels)
+
+        padding = (kernel_size - 1) // 2
+        self.cnn = nn.ModuleList()
+        for _ in range(depth):
+            self.cnn.append(nn.Sequential(
+                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
+                LayerNorm(channels),
+                actv,
+                nn.Dropout(0.2),
+            ))
+        # self.cnn = nn.Sequential(*self.cnn)
+
+        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
+
+    def forward(self, x, input_lengths, m):
+        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)  # [B, emb, T]
+        m = m.to(input_lengths.device).unsqueeze(1)
+        x.masked_fill_(m, 0.0)
+
+        for c in self.cnn:
+            x = c(x)
+            x.masked_fill_(m, 0.0)
+
+        x = x.transpose(1, 2)  # [B, T, chn]
+
+        input_lengths = input_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            x, input_lengths, batch_first=True, enforce_sorted=False)
+
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+
+        x = x.transpose(-1, -2)
+        x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+
+        x_pad[:, :, :x.shape[-1]] = x
+        x = x_pad.to(x.device)
+
+        x.masked_fill_(m, 0.0)
+
+        return x
+
+    def inference(self, x):
+        x = self.embedding(x)
+        x = x.transpose(1, 2)
+        x = self.cnn(x)
+        x = x.transpose(1, 2)
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        return x
+
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, layer_type):
+        super().__init__()
+        self.layer_type = layer_type
+
+    def forward(self, x):
+        if self.layer_type == 'none':
+            return x
+        else:
+            return F.interpolate(x, scale_factor=2, mode='nearest')
+
+class AdainResBlk1d(nn.Module):
+    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
+                 upsample='none', dropout_p=0.0):
+        super().__init__()
+        self.actv = actv
+        self.upsample_type = upsample
+        self.upsample = UpSample1d(upsample)
+        self.learned_sc = dim_in != dim_out
+        self._build_weights(dim_in, dim_out, style_dim)
+        self.dropout = nn.Dropout(dropout_p)
+
+        if upsample == 'none':
+            self.pool = nn.Identity()
+        else:
+            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
+
+
+    def _build_weights(self, dim_in, dim_out, style_dim):
+        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
+        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
+        self.norm1 = AdaIN1d(style_dim, dim_in)
+        self.norm2 = AdaIN1d(style_dim, dim_out)
+        if self.learned_sc:
+            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
+
+    def _shortcut(self, x):
+        x = self.upsample(x)
+        if self.learned_sc:
+            x = self.conv1x1(x)
+        return x
+
+    def _residual(self, x, s):
+        x = self.norm1(x, s)
+        x = self.actv(x)
+        x = self.pool(x)
+        x = self.conv1(self.dropout(x))
+        x = self.norm2(x, s)
+        x = self.actv(x)
+        x = self.conv2(self.dropout(x))
+        return x
+
+    def forward(self, x, s):
+        out = self._residual(x, s)
+        out = (out + self._shortcut(x)) / np.sqrt(2)
+        return out
+
+class AdaLayerNorm(nn.Module):
+    def __init__(self, style_dim, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+
+        self.fc = nn.Linear(style_dim, channels*2)
+
+    def forward(self, x, s):
+        x = x.transpose(-1, -2)
+        x = x.transpose(1, -1)
+
+        h = self.fc(s)
+        h = h.view(h.size(0), h.size(1), 1)
+        gamma, beta = torch.chunk(h, chunks=2, dim=1)
+        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
+
+
+        x = F.layer_norm(x, (self.channels,), eps=self.eps)
+        x = (1 + gamma) * x + beta
+        return x.transpose(1, -1).transpose(-1, -2)
+
+class ProsodyPredictor(nn.Module):
+
+    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
+        super().__init__()
+
+        self.text_encoder = DurationEncoder(sty_dim=style_dim,
+                                            d_model=d_hid,
+                                            nlayers=nlayers,
+                                            dropout=dropout)
+
+        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        self.duration_proj = LinearNorm(d_hid, max_dur)
+
+        self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
+        self.F0 = nn.ModuleList()
+        self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+        self.N = nn.ModuleList()
+        self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
+        self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
+
+        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
+
+
+    def forward(self, texts, style, text_lengths, alignment, m):
+        d = self.text_encoder(texts, style, text_lengths, m)
+
+        batch_size = d.shape[0]
+        text_size = d.shape[1]
+
+        # predict duration
+        input_lengths = text_lengths.cpu().numpy()
+        x = nn.utils.rnn.pack_padded_sequence(
+            d, input_lengths, batch_first=True, enforce_sorted=False)
+
+        m = m.to(text_lengths.device).unsqueeze(1)
+
+        self.lstm.flatten_parameters()
+        x, _ = self.lstm(x)
+        x, _ = nn.utils.rnn.pad_packed_sequence(
+            x, batch_first=True)
+
+        x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
+
+        x_pad[:, :x.shape[1], :] = x
+        x = x_pad.to(x.device)
+
+        duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
+
+        en = (d.transpose(-1, -2) @ alignment)
+
+        return duration.squeeze(-1), en
+
+    def F0Ntrain(self, x, s):
+        x, _ = self.shared(x.transpose(-1, -2))
+
+        F0 = x.transpose(-1, -2)
+        for block in self.F0:
+            F0 = block(F0, s)
+        F0 = self.F0_proj(F0)
+
+        N = x.transpose(-1, -2)
+        for block in self.N:
+            N = block(N, s)
+        N = self.N_proj(N)
+
+        return F0.squeeze(1), N.squeeze(1)
+
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+class DurationEncoder(nn.Module):
+
+    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
+        super().__init__()
+        self.lstms = nn.ModuleList()
+        for _ in range(nlayers):
+            self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                      d_model // 2,
+                                      num_layers=1,
+                                      batch_first=True,
+                                      bidirectional=True,
+                                      dropout=dropout))
+            self.lstms.append(AdaLayerNorm(sty_dim, d_model))
+
+
+        self.dropout = dropout
+        self.d_model = d_model
+        self.sty_dim = sty_dim
+
+    def forward(self, x, style, text_lengths, m):
+        masks = m.to(text_lengths.device)
+
+        x = x.permute(2, 0, 1)
+        s = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, s], axis=-1)
+        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
+
+        x = x.transpose(0, 1)
+        input_lengths = text_lengths.cpu().numpy()
+        x = x.transpose(-1, -2)
+
+        for block in self.lstms:
+            if isinstance(block, AdaLayerNorm):
+                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
+                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
+                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
+            else:
+                x = x.transpose(-1, -2)
+                x = nn.utils.rnn.pack_padded_sequence(
+                    x, input_lengths, batch_first=True, enforce_sorted=False)
+                block.flatten_parameters()
+                x, _ = block(x)
+                x, _ = nn.utils.rnn.pad_packed_sequence(
+                    x, batch_first=True)
+                x = F.dropout(x, p=self.dropout, training=self.training)
+                x = x.transpose(-1, -2)
+
+                x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
+
+                x_pad[:, :, :x.shape[-1]] = x
+                x = x_pad.to(x.device)
+
+        return x.transpose(-1, -2)
+
+    def inference(self, x, style):
+        x = self.embedding(x.transpose(-1, -2)) * np.sqrt(self.d_model)
+        style = style.expand(x.shape[0], x.shape[1], -1)
+        x = torch.cat([x, style], axis=-1)
+        src = self.pos_encoder(x)
+        output = self.transformer_encoder(src).transpose(0, 1)
+        return output
+
+    def length_to_mask(self, lengths):
+        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
+        mask = torch.gt(mask+1, lengths.unsqueeze(1))
+        return mask
+
+# https://github.com/yl4579/StyleTTS2/blob/main/utils.py
+def recursive_munch(d):
+    if isinstance(d, dict):
+        return Munch((k, recursive_munch(v)) for k, v in d.items())
+    elif isinstance(d, list):
+        return [recursive_munch(v) for v in d]
+    else:
+        return d
+
+def build_model(path, device):
+    config = Path(__file__).parent / 'config.json'
+    assert config.exists(), f'Config path incorrect: config.json not found at {config}'
+    with open(config, 'r') as r:
+        args = recursive_munch(json.load(r))
+    assert args.decoder.type == 'istftnet', f'Unknown decoder type: {args.decoder.type}'
+    decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
+            resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
+            upsample_rates = args.decoder.upsample_rates,
+            upsample_initial_channel=args.decoder.upsample_initial_channel,
+            resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
+            upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
+            gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
+    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
+    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
+    bert = load_plbert()
+    bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
+    for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
+        for child in parent.children():
+            if isinstance(child, nn.RNNBase):
+                child.flatten_parameters()
+    model = Munch(
+        bert=bert.to(device).eval(),
+        bert_encoder=bert_encoder.to(device).eval(),
+        predictor=predictor.to(device).eval(),
+        decoder=decoder.to(device).eval(),
+        text_encoder=text_encoder.to(device).eval(),
+    )
+    for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
+        assert key in model, key
+        try:
+            model[key].load_state_dict(state_dict)
+        except:
+            state_dict = {k[7:]: v for k, v in state_dict.items()}
+            model[key].load_state_dict(state_dict, strict=False)
+    return model
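One detail in `build_model()` worth calling out: when a sub-module's `load_state_dict` fails, the fallback strips the first seven characters from every key — the `module.` prefix that `torch.nn.DataParallel` adds when a checkpoint was saved from a wrapped model — and retries with `strict=False`. A tiny illustration of that transform, with hypothetical keys:

```python
# Why k[7:]: len("module.") == 7, the prefix DataParallel puts on saved keys.
import torch

state_dict = {"module.linear_layer.weight": torch.zeros(4, 4)}
stripped = {k[7:]: v for k, v in state_dict.items()}
assert list(stripped) == ["linear_layer.weight"]
```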
diff --git a/backend/python/kokoro/protogen.sh b/backend/python/kokoro/protogen.sh
new file mode 100644
index 00000000..32f39fbb
--- /dev/null
+++ b/backend/python/kokoro/protogen.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements-cpu.txt b/backend/python/kokoro/requirements-cpu.txt
new file mode 100644
index 00000000..b4f1261f
--- /dev/null
+++ b/backend/python/kokoro/requirements-cpu.txt
@@ -0,0 +1,2 @@
+torch==2.4.1
+transformers
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements-cublas11.txt b/backend/python/kokoro/requirements-cublas11.txt
new file mode 100644
index 00000000..ed0d4df5
--- /dev/null
+++ b/backend/python/kokoro/requirements-cublas11.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.4.1+cu118
+transformers
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements-cublas12.txt b/backend/python/kokoro/requirements-cublas12.txt
new file mode 100644
index 00000000..b4f1261f
--- /dev/null
+++ b/backend/python/kokoro/requirements-cublas12.txt
@@ -0,0 +1,2 @@
+torch==2.4.1
+transformers
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements-hipblas.txt b/backend/python/kokoro/requirements-hipblas.txt
new file mode 100644
index 00000000..ec8d0306
--- /dev/null
+++ b/backend/python/kokoro/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch==2.4.1+rocm6.0
+transformers
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements-intel.txt b/backend/python/kokoro/requirements-intel.txt
new file mode 100644
index 00000000..b16448d3
--- /dev/null
+++ b/backend/python/kokoro/requirements-intel.txt
@@ -0,0 +1,5 @@
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+intel-extension-for-pytorch==2.3.110+xpu
+torch==2.3.1+cxx11.abi
+oneccl_bind_pt==2.3.100+xpu
+transformers
\ No newline at end of file
diff --git a/backend/python/kokoro/requirements.txt b/backend/python/kokoro/requirements.txt
new file mode 100644
index 00000000..75d65ba1
--- /dev/null
+++ b/backend/python/kokoro/requirements.txt
@@ -0,0 +1,7 @@
+grpcio==1.69.0
+protobuf
+phonemizer
+scipy
+munch
+setuptools
+soundfile
\ No newline at end of file
diff --git a/backend/python/kokoro/run.sh b/backend/python/kokoro/run.sh
new file mode 100755
index 00000000..375c07e5
--- /dev/null
+++ b/backend/python/kokoro/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+source $(dirname $0)/../common/libbackend.sh
+
+startBackend $@
\ No newline at end of file
diff --git a/backend/python/kokoro/test.sh b/backend/python/kokoro/test.sh
new file mode 100755
index 00000000..6940b066
--- /dev/null
+++ b/backend/python/kokoro/test.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+source $(dirname $0)/../common/libbackend.sh
+
+runUnittests