| """Text processing utilities for phoneme tokenization.""" |
|
|
# Character inventory, grouped by category. The order of the groups and of
# the characters inside each string fixes the integer ID of every symbol,
# so none of these literals may be reordered or edited casually.
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»"" '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Flat symbol list: the pad symbol first, then one entry per character of
# each group, in the order punctuation, ASCII letters, IPA letters.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. A character that occurs more than once in
# `symbols` keeps its last position, because later pairs overwrite earlier
# ones when the dict is built.
_symbol_to_id = dict(zip(symbols, range(len(symbols))))
|
|
|
|
class TextCleaner:
    """Callable that maps each character of a phoneme string to its token ID."""

    def __init__(self):
        # The instance references the module-level symbol table directly;
        # no copy is made.
        self.word_index_dictionary = _symbol_to_id

    def __call__(self, text):
        """Return the IDs of the characters in *text*, in their original order.

        Characters that are not keys of ``word_index_dictionary`` are skipped
        without raising or reporting anything, so the returned list can be
        shorter than *text*.
        """
        table = self.word_index_dictionary
        return [table[ch] for ch in text if ch in table]
|
|