kokoro-georgian

Kokoro-82M fine-tuned for Georgian (ქართული) text-to-speech.

Quick Start

Requirements

# System
apt-get install espeak-ng

# Python
pip install torch torchaudio soundfile pyyaml munch huggingface_hub

# Patched StyleTTS2 fork (required for model loading)
git clone https://github.com/semidark/StyleTTS2.git

Inference

import subprocess, torch, soundfile as sf, torchaudio, random, sys
from pathlib import Path
from huggingface_hub import hf_hub_download

# 1. Download checkpoint
ckpt_path = hf_hub_download("NMikka/kokoro-georgian", "kokoro_georgian_e9.pth")

# 2. Add StyleTTS2 to path (cloned above)
sys.path.insert(0, "StyleTTS2")
from models import build_model, load_ASR_models, load_F0_models
from Utils.PLBERT.util import load_plbert
from kokoro_symbols import TextCleaner
from kokoro_tb_utils import run_kokoro_inference
import yaml
from munch import Munch

# 3. Load model
# Force every subsequent torch.load call to run with weights_only=False.
# The hasattr guard makes the patch idempotent, so re-running this snippet
# does not wrap the wrapper a second time.
# SECURITY: weights_only=False lets a checkpoint unpickle arbitrary Python
# objects. Only load checkpoints from sources you trust.
if not hasattr(torch, "_original_load"):
    torch._original_load = torch.load

    def _load_without_weights_only(*args, **kwargs):
        kwargs["weights_only"] = False  # override whatever the caller passed
        return torch._original_load(*args, **kwargs)

    torch.load = _load_without_weights_only

# Download the model configuration from the Hub and parse it with the safe
# YAML loader. The file is opened in a context manager so the handle is
# closed deterministically, and decoded as UTF-8 regardless of the locale.
config_path = hf_hub_download("NMikka/kokoro-georgian", "config_georgian.yml")
with open(config_path, encoding="utf-8") as config_file:
    cfg = yaml.safe_load(config_file)

def munchify(d):
    """Recursively wrap ``d`` and every nested dict value in a ``Munch``.

    Non-dict values (including lists) are returned unchanged.
    """
    if not isinstance(d, dict):
        return d
    converted = {}
    for key, value in d.items():
        converted[key] = munchify(value)
    return Munch(converted)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Wrap the "model_params" section of the config with munchify before it is
# handed to build_model.
model_params = munchify(cfg["model_params"])
# Auxiliary components are loaded from files inside the cloned StyleTTS2
# checkout; the paths are relative to the current working directory.
text_aligner  = load_ASR_models("StyleTTS2/Utils/ASR/epoch_00080.pth", "StyleTTS2/Utils/ASR/config.yml")
pitch_extractor = load_F0_models("StyleTTS2/Utils/JDC/bst.t7")
plbert = load_plbert("StyleTTS2/Utils/PLBERT/")

# Assemble the model from the parameters and the three auxiliary components.
# The result is iterated with .items() below, so it is a name -> module mapping.
model = build_model(model_params, text_aligner, pitch_extractor, plbert)
# Read the fine-tuned checkpoint onto the CPU; modules are moved to `device`
# only after their weights are in place.
ckpt = torch.load(ckpt_path, map_location="cpu")
# The per-module state dicts sit under "net" when that key exists; otherwise
# the checkpoint itself is used as the mapping.
state = ckpt.get("net", ckpt)
for name, module in model.items():
    if name in state:
        weights = state[name]
        has_module_prefix = any(key.startswith("module.") for key in weights.keys())
        if has_module_prefix:
            # Strip the "module." prefix so the keys match the bare module.
            weights = {
                key.removeprefix("module."): tensor
                for key, tensor in weights.items()
            }
        # strict=False: missing or unexpected keys are tolerated, not raised.
        module.load_state_dict(weights, strict=False)
    # Modules without an entry in the checkpoint keep their current weights.

# Move every sub-module to the target device and put it in eval mode.
model = Munch(
    {name: module.to(device).eval() for name, module in model.items()}
)

# 4. Extract voicepack from reference audio (a directory of 24kHz mono WAVs)
# Mel-spectrogram front-end used for the reference audio: 24 kHz input,
# 2048-point FFT, 1200-sample window, 300-sample hop, 80 mel bands.
mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=24000,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
    n_mels=80,
)
mel_transform = mel_transform.to(device)

def extract_voicepack(audio_dir, n=50):
    """Build a voicepack by averaging style embeddings over reference clips.

    Up to ``n`` ``*.wav`` files are sampled at random from ``audio_dir``
    (searched recursively). Each clip is downmixed to mono, resampled to
    24 kHz when its sample rate differs, turned into a normalised log-mel
    spectrogram, and passed through ``model.style_encoder`` and
    ``model.predictor_encoder``. Clips shorter than 10 mel frames are skipped.

    Args:
        audio_dir: Directory containing the reference WAV files.
        n: Maximum number of clips to sample.

    Returns:
        The mean style-encoder output concatenated with the mean
        predictor-encoder output along the last dimension.

    Raises:
        RuntimeError: If ``audio_dir`` contains no ``.wav`` files, or if none
            of the sampled clips is long enough to be used.
    """
    # Walk the directory tree once and reuse the listing for the size check.
    candidates = list(Path(audio_dir).rglob("*.wav"))
    if not candidates:
        raise RuntimeError(f"No .wav files found under {str(audio_dir)!r}")
    wavs = random.sample(candidates, min(n, len(candidates)))

    acoustic, prosodic = [], []
    with torch.no_grad():
        for w in wavs:
            data, sr = sf.read(str(w), dtype="float32")
            if data.ndim > 1:
                # Multi-channel audio: average the channels down to mono.
                data = data.mean(axis=1)
            wav = torch.from_numpy(data).unsqueeze(0)
            if sr != 24000:
                wav = torchaudio.functional.resample(wav, sr, 24000)
            wav = wav.to(device)
            # Log-mel shifted by -4 and scaled by 4 (presumably mean/std
            # normalisation matching training - confirm).
            # NOTE(review): upstream StyleTTS2 preprocessing uses the natural
            # log here; confirm that log2 matches how this checkpoint was
            # trained before changing it.
            mel = ((mel_transform(wav) + 1e-5).log2() - (-4)) / 4
            if mel.shape[-1] < 10:
                continue
            encoder_input = mel.unsqueeze(1)
            acoustic.append(model.style_encoder(encoder_input))
            prosodic.append(model.predictor_encoder(encoder_input))

    if not acoustic:
        raise RuntimeError(
            f"No usable .wav clips (>= 10 mel frames) found under {str(audio_dir)!r}"
        )
    return torch.cat(
        [torch.stack(acoustic).mean(0), torch.stack(prosodic).mean(0)], dim=-1
    )

# Placeholder path: replace it with the directory holding the reference
# speaker's WAV files before running.
voicepack = extract_voicepack("/path/to/reference/audio/")

# 5. Georgian G2P via espeak-ng
def ka_g2p(text):
    """Phonemise Georgian text to an IPA string via ``espeak-ng``.

    Runs ``espeak-ng -v ka --ipa=3 -q`` with ``text`` on stdin, joins the
    non-empty output lines with single spaces, rewrites the four affricates
    that are tied with a zero-width joiner (U+200D) as single ligature
    symbols, and finally strips any remaining zero-width joiners.

    Args:
        text: Georgian input text.

    Returns:
        The phonemised text as a single IPA string.

    Raises:
        FileNotFoundError: If the ``espeak-ng`` executable cannot be found.
        subprocess.CalledProcessError: If ``espeak-ng`` exits with a non-zero
            status, instead of silently yielding an empty phoneme string.
    """
    # Written as an escape so the invisible character is visible in the code.
    zwj = "\u200d"
    affricate_ligatures = {
        f"d{zwj}ʒ": "ʥ",
        f"t{zwj}ʃ": "ʨ",
        f"t{zwj}s": "ʦ",
        f"d{zwj}z": "ʣ",
    }
    # Explicit UTF-8 keeps the Georgian input and IPA output intact even when
    # the process locale is not UTF-8.
    r = subprocess.run(
        ["espeak-ng", "-v", "ka", "--ipa=3", "-q"],
        input=text,
        capture_output=True,
        encoding="utf-8",
        check=True,
    )
    ipa = " ".join(line.strip() for line in r.stdout.splitlines() if line.strip())
    for tied, ligature in affricate_ligatures.items():
        ipa = ipa.replace(tied, ligature)
    return ipa.replace(zwj, "")

# 6. Synthesise
# Sentence to speak and its phonemisation.
text = "გამარჯობა, სამყარო!"
ipa = ka_g2p(text)

# The same TextCleaner instance encodes the IPA string and is also passed to
# the inference helper.
tc = TextCleaner()
tokens = tc(ipa)
audios = run_kokoro_inference(model, [(text, tokens)], voicepack, device, tc)

# Take element 1 of the first result (presumably the waveform tensor of a
# (text, audio) pair - confirm against kokoro_tb_utils), move it to the CPU
# and write it out as a 24 kHz WAV file.
first_result = audios[0]
wav = first_result[1].cpu().numpy().squeeze()
sf.write("output.wav", wav, 24000)

Or use the bundled script:

python infer.py \
  --text "გამარჯობა, სამყარო!" \
  --checkpoint kokoro_georgian_e9.pth \
  --audio-ref /path/to/reference/speaker/wavs/ \
  --styletts2-dir ./StyleTTS2 \
  --output output.wav

Training

Base model: hexgrad/Kokoro-82M (82M params, StyleTTS2-based)
Training data: NMikka/Common-Voice-Geo-Cleaned — 35 hours, 12 speakers
G2P: espeak-ng -v ka, with zero-width joiner (ZWJ) stripping and affricate-to-ligature remapping
Phoneme coverage: 100% of CV-Geo IPA within Kokoro's 178-token vocabulary (verified on 21k sentences)
Stage 1: Decoder + alignment training (8 epochs, val Mel loss 0.303)
Stage 2: Prosody predictor + WavLM adversarial training (10 epochs, best Dur 0.483, best F0 2.336)
Training framework: semidark/StyleTTS2 (patched fork with multispeaker Stage-2 fixes)
Training code: NMikaa/TTS_pipelines

Status

Evaluation in progress. FLEURS-KA round-trip CER/WER benchmark pending.

License

Apache 2.0 (inherited from the Kokoro-82M base model).

Downloads last month: -; Downloads are not tracked for this model. How to track

Inference Providers NEW

This model isn't deployed by any Inference Provider. 🙋 Ask for provider support