| import os |
| import time |
| import torch |
| import urllib.request |
| import gradio as gr |
| import nltk |
| import numpy as np |
| import soundfile as sf |
| from espnet2.bin.tts_inference import Text2Speech |
| from espnet2.utils.types import str_or_none |
| from pathlib import Path |
| from nltk.tokenize import sent_tokenize |
|
|
# Fetch NLTK's "punkt" resource at start-up; it is presumably what the
# imported `sent_tokenize` needs for sentence splitting.
nltk.download("punkt")


# Load the pretrained model once at import time so every request reuses the
# same instance. It is placed on the CPU.
gos_text2speech = Text2Speech.from_pretrained(
    model_tag="bartelds/gos_tts",
    device="cpu",
    speed_control_alpha=1.0,
    noise_scale=1.0,
    noise_scale_dur=1.0,
)
|
|
# Speaker ID passed to the model (as `sids`) for each selectable variant.
_SPEAKER_IDS = {
    "Hoogelaandsters": 1,
    "Oldambsters": 2,
    "Westerkertaaiers": 3,
}


def inference(text, lang):
    """Synthesize ``text`` in the requested variant and write it to ``out.wav``.

    The text is lower-cased and split into sentences; each sentence is
    synthesized separately with the speaker ID that belongs to ``lang``, and
    the resulting waveforms are concatenated into a single audio file.

    Args:
        text: Input text to synthesize.
        lang: Variant name; must be one of the keys of ``_SPEAKER_IDS``.

    Returns:
        The tuple ``("out.wav", "out.wav")`` -- the same path twice, one entry
        per output the caller expects.

    Raises:
        ValueError: If ``lang`` is not a known variant, or if ``text`` holds
            nothing to synthesize.
    """
    # Validate up front: without this, an unknown variant would leave the
    # waveform unassigned and fail later with an unrelated error.
    if lang not in _SPEAKER_IDS:
        raise ValueError(
            f"Unknown variant {lang!r}; expected one of {list(_SPEAKER_IDS)}"
        )
    if not text or not text.strip():
        raise ValueError("Input text is empty; nothing to synthesize")

    sentences = sent_tokenize(text.lower())
    if not sentences:
        # np.concatenate rejects an empty sequence, so fail with a clear message.
        raise ValueError("Input text contains no sentences to synthesize")

    # The speaker ID is the same for every sentence, so build it once.
    sids = np.array([_SPEAKER_IDS[lang]])

    with torch.no_grad():
        chunks = [
            gos_text2speech(sentence, sids=sids)["wav"].view(-1).cpu().numpy()
            for sentence in sentences
        ]

    # NOTE: the output path is fixed, so each call overwrites the previous result.
    sf.write("out.wav", np.concatenate(chunks), gos_text2speech.fs)

    return "out.wav", "out.wav"
|
|
title = "Gronings text-to-speech"

# Each example row is [input text, variant], in the same order as the inputs.
examples = [
    ["Mamme mos even noar winkel om n bosschop.", "Hoogelaandsters"],
]

# Input components: free text plus a radio button selecting the variant.
text_input = gr.inputs.Textbox(label="Input text", lines=3)
variant_input = gr.inputs.Radio(
    choices=["Hoogelaandsters", "Oldambsters", "Westerkertaaiers"],
    type="value",
    default="Hoogelaandsters",
    label="Variant",
)

# Output components: one per element of the tuple returned by `inference`.
audio_output = gr.outputs.Audio(type="file", label="Output")
file_output = gr.outputs.File()

demo = gr.Interface(
    inference,
    [text_input, variant_input],
    [audio_output, file_output],
    title=title,
    examples=examples,
)
demo.launch(enable_queue=True)
|
|