YAML Metadata Warning: empty or missing YAML metadata in repo card
Check out the documentation for more information.
kokoro-georgian
Kokoro-82M fine-tuned for Georgian (ქართული) text-to-speech.
Quick Start
Requirements
# System: espeak-ng is invoked by the inference snippet below for Georgian G2P
apt-get install espeak-ng
# Python: packages imported by the inference snippet below
pip install torch torchaudio soundfile pyyaml munch huggingface_hub
# Patched StyleTTS2 fork (required for model loading); clone it into the
# working directory, because the snippet below adds "StyleTTS2" to sys.path
git clone https://github.com/semidark/StyleTTS2.git
Inference
import subprocess, torch, soundfile as sf, torchaudio, random, sys
from pathlib import Path
from huggingface_hub import hf_hub_download
# 1. Fetch the fine-tuned checkpoint from the Hugging Face Hub
ckpt_path = hf_hub_download(
    repo_id="NMikka/kokoro-georgian",
    filename="kokoro_georgian_e9.pth",
)
# 2. Make the StyleTTS2 checkout (cloned above) importable; this has to run
#    before the imports that follow
sys.path.insert(0, "StyleTTS2")
from models import build_model, load_ASR_models, load_F0_models
from Utils.PLBERT.util import load_plbert
from kokoro_symbols import TextCleaner
from kokoro_tb_utils import run_kokoro_inference
import yaml
from munch import Munch
# 3. Load model
# SECURITY: the patch below forces ``weights_only=False`` on every
# ``torch.load`` call made after this point. That enables full pickle
# deserialisation, which can execute arbitrary code embedded in a checkpoint,
# so only load files you trust.
# The ``hasattr`` guard stops the wrapper from wrapping itself when this
# snippet is executed more than once in the same process.
if not hasattr(torch, "_original_load"):
    torch._original_load = torch.load
    torch.load = lambda *a, **kw: torch._original_load(*a, **{**kw, "weights_only": False})
config_path = hf_hub_download("NMikka/kokoro-georgian", "config_georgian.yml")
# Read the YAML as UTF-8 and close the file handle deterministically.
with open(config_path, encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)
def munchify(d):
    """Recursively convert nested dicts into ``Munch`` objects.

    Args:
        d: Any value, typically the result of parsing a YAML document.

    Returns:
        A ``Munch`` when ``d`` is a dict (values converted recursively), a
        new list with every element converted when ``d`` is a list, and
        ``d`` itself for every other type.
    """
    if isinstance(d, dict):
        return Munch({k: munchify(v) for k, v in d.items()})
    if isinstance(d, list):
        # YAML sequences may hold mappings; convert those as well so that
        # attribute access works at every nesting level.
        return [munchify(v) for v in d]
    return d
device = "cuda" if torch.cuda.is_available() else "cpu"
model_params = munchify(cfg["model_params"])
# Auxiliary networks are loaded from files inside the StyleTTS2 checkout.
text_aligner = load_ASR_models("StyleTTS2/Utils/ASR/epoch_00080.pth", "StyleTTS2/Utils/ASR/config.yml")
pitch_extractor = load_F0_models("StyleTTS2/Utils/JDC/bst.t7")
plbert = load_plbert("StyleTTS2/Utils/PLBERT/")
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
ckpt = torch.load(ckpt_path, map_location="cpu")
# Use the "net" entry when the checkpoint has one; otherwise treat the whole
# checkpoint as the name -> state-dict mapping.
state = ckpt.get("net", ckpt)
for name, module in model.items():
    # Sub-modules without an entry in the checkpoint keep their current weights.
    if name not in state:
        continue
    sd = state[name]
    # Strip a leading "module." from parameter names (presumably left by a
    # DataParallel-style wrapper at training time - confirm).
    if any(k.startswith("module.") for k in sd):
        sd = {k.removeprefix("module."): v for k, v in sd.items()}
    report = module.load_state_dict(sd, strict=False)
    # strict=False tolerates key mismatches; surface them so that a wrong or
    # partial checkpoint does not go unnoticed.
    if report.missing_keys or report.unexpected_keys:
        print(
            f"[load_state_dict] {name}: {len(report.missing_keys)} missing, "
            f"{len(report.unexpected_keys)} unexpected keys",
            file=sys.stderr,
        )
# Move every sub-module to the target device and switch it to eval mode.
model = Munch({k: v.to(device).eval() for k, v in model.items()})
# 4. Extract voicepack from reference audio (a directory of 24kHz mono WAVs)
# Mel-spectrogram front-end applied to each reference clip.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
    n_mels=80,
)
mel_transform = mel_transform.to(device)
def extract_voicepack(audio_dir, n=50):
    """Build a voicepack by averaging style embeddings of reference clips.

    Up to ``n`` ``*.wav`` files are sampled at random from ``audio_dir``
    (searched recursively). Each clip is down-mixed to mono, resampled to
    24 kHz when its sample rate differs, converted to a normalised log-mel
    spectrogram and passed through ``model.style_encoder`` and
    ``model.predictor_encoder``. Clips with fewer than 10 mel frames are
    skipped.

    Args:
        audio_dir: Directory containing the reference WAV files.
        n: Maximum number of files to sample.

    Returns:
        The mean ``style_encoder`` output concatenated with the mean
        ``predictor_encoder`` output along the last dimension.

    Raises:
        RuntimeError: If ``audio_dir`` contains no ``*.wav`` files, or if
            none of the sampled clips is long enough to be used.
    """
    # Glob once; the listing is needed for both the population and its size.
    wav_paths = list(Path(audio_dir).rglob("*.wav"))
    if not wav_paths:
        raise RuntimeError(f"no .wav files found under {str(audio_dir)!r}")
    chosen = random.sample(wav_paths, min(n, len(wav_paths)))
    acoustic, prosodic = [], []
    with torch.no_grad():
        for path in chosen:
            data, sr = sf.read(str(path), dtype="float32")
            # Average the channels of multi-channel audio down to mono.
            if data.ndim > 1:
                data = data.mean(axis=1)
            wav = torch.from_numpy(data).unsqueeze(0)
            if sr != 24000:
                wav = torchaudio.functional.resample(wav, sr, 24000)
            wav = wav.to(device)
            # Log-mel normalised with offset -4 and scale 4.
            # NOTE(review): this uses log2; upstream StyleTTS2 preprocessing
            # appears to use the natural log with the same constants -
            # confirm this matches how the model was trained.
            mel = ((mel_transform(wav) + 1e-5).log2() - (-4)) / 4
            if mel.shape[-1] < 10:
                continue
            mel = mel.unsqueeze(1)
            acoustic.append(model.style_encoder(mel))
            prosodic.append(model.predictor_encoder(mel))
    if not acoustic:
        raise RuntimeError(
            f"no usable .wav clips (at least 10 mel frames) under {str(audio_dir)!r}"
        )
    return torch.cat([torch.stack(acoustic).mean(0), torch.stack(prosodic).mean(0)], dim=-1)
# Placeholder path: point this at your own directory of reference-speaker WAV files.
voicepack = extract_voicepack("/path/to/reference/audio/")
# 5. Georgian G2P via espeak-ng
def ka_g2p(text):
    """Convert Georgian text to an IPA string using espeak-ng.

    espeak-ng is run with the ``ka`` voice and ``--ipa=3``. The non-empty
    output lines are joined with single spaces, the four ZWJ-tied affricates
    are replaced by single ligature characters, and any remaining zero-width
    joiners are removed.

    Args:
        text: Georgian input text.

    Returns:
        The post-processed IPA transcription.

    Raises:
        FileNotFoundError: If the ``espeak-ng`` executable is not installed.
        subprocess.CalledProcessError: If espeak-ng exits with a non-zero
            status.
    """
    # Force UTF-8 on both pipes: the locale default may be unable to encode
    # Georgian input or decode IPA output.
    r = subprocess.run(
        ["espeak-ng", "-v", "ka", "--ipa=3", "-q"],
        input=text,
        capture_output=True,
        encoding="utf-8",
        check=True,
    )
    ipa = " ".join(line.strip() for line in r.stdout.splitlines() if line.strip())
    # Written as escapes because U+200D (zero-width joiner) is invisible in
    # most editors and easily lost when the snippet is copied.
    affricate_ligatures = {
        "d\u200d\u0292": "\u02a5",  # d + ZWJ + U+0292 -> U+02A5
        "t\u200d\u0283": "\u02a8",  # t + ZWJ + U+0283 -> U+02A8
        "t\u200ds": "\u02a6",       # t + ZWJ + s      -> U+02A6
        "d\u200dz": "\u02a3",       # d + ZWJ + z      -> U+02A3
    }
    for tied, lig in affricate_ligatures.items():
        ipa = ipa.replace(tied, lig)
    # Drop any joiners that were not part of a remapped affricate.
    return ipa.replace("\u200d", "")
# 6. Synthesise
# Sample sentence ("Hello, world!" in Georgian).
text = "გამარჯობა, სამყარო!"
ipa = ka_g2p(text)
tc = TextCleaner()
audios = run_kokoro_inference(model, [(text, tc(ipa))], voicepack, device, tc)
# Take the audio of the first (and only) utterance and write it as a 24 kHz WAV.
wav = audios[0][1].cpu().numpy().squeeze()
sf.write("output.wav", wav, 24000)
Or use the bundled script:
python infer.py \
  --text "გამარჯობა, სამყარო!" \
  --checkpoint kokoro_georgian_e9.pth \
  --audio-ref /path/to/reference/speaker/wavs/ \
  --styletts2-dir ./StyleTTS2 \
  --output output.wav
Training
- Base model: hexgrad/Kokoro-82M (82M params, StyleTTS2-based)
- Training data: NMikka/Common-Voice-Geo-Cleaned — 35 hours, 12 speakers
- G2P: `espeak-ng -v ka` with ZWJ-stripping and affricate ligature remapping
- Phoneme coverage: 100% of CV-Geo IPA within Kokoro's 178-token vocabulary (verified on 21k sentences)
- Stage 1: Decoder + alignment training (8 epochs, val Mel loss 0.303)
- Stage 2: Prosody predictor + WavLM adversarial training (10 epochs, best Dur 0.483, best F0 2.336)
- Training framework: semidark/StyleTTS2 (patched fork with multispeaker Stage-2 fixes)
- Training code: NMikaa/TTS_pipelines
Status
Evaluation in progress. FLEURS-KA round-trip CER/WER benchmark pending.
License
Apache 2.0 (inherited from Kokoro-82M base model).
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support