text_utils.py · Seemanth/chiluka-tts at main

chiluka-tts / text_utils.py

Upload Chiluka TTS model

f28049f verified 4 months ago

914 Bytes

	"""Text processing utilities for phoneme tokenization."""

	# Padding symbol. It is listed first in `symbols`, so it receives ID 0.
	_pad = "$"
	_punctuation = ';:,.!?¡¿—…"«»"" '
	_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
	_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

	# Ordered vocabulary: pad, then punctuation, ASCII letters and IPA symbols,
	# one list entry per character. A symbol's position in this list is its ID,
	# so the order of the strings above must not be changed.
	symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

	# Symbol -> ID lookup. A few characters appear more than once in the strings
	# above; for those, the later position overwrites the earlier one.
	_symbol_to_id = dict(zip(symbols, range(len(symbols))))


	class TextCleaner:
	    """Converts phoneme strings to token IDs.

	    Attributes:
	        word_index_dictionary: Mapping from a single-character symbol to its
	            integer ID. It is the module-level ``_symbol_to_id`` table itself
	            (a shared reference, not a copy).
	    """

	    def __init__(self):
	        # Keep a reference to the shared lookup table; no copy is made.
	        self.word_index_dictionary = _symbol_to_id

	    def __call__(self, text):
	        """Return the token IDs for the characters of ``text``.

	        Args:
	            text: String (or other iterable of single characters) to convert.

	        Returns:
	            A list of integer IDs, in input order. Characters that are not
	            keys of ``word_index_dictionary`` are skipped without error, so
	            the result may be shorter than ``text``. Empty input gives ``[]``.
	        """
	        # Bind the table once instead of re-reading the attribute per character.
	        lookup = self.word_index_dictionary
	        return [lookup[char] for char in text if char in lookup]