Spaces: Running

Vineel Pratap committed
Commit 78e8beb · 1 Parent(s): d15da79

norm

Browse files
- app.py +32 -8
- normalization/README.txt +3 -0
- normalization/__init__.py +0 -0
- normalization/norm_config.py +276 -0
- normalization/punctuations.lst +188 -0
- normalization/text_norm.py +92 -0
- zeroshot.py +5 -4
app.py CHANGED

@@ -84,7 +84,7 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Accordion("Logs", open=False):
         logs = gr.Textbox(show_label=False)
 
-    # hack
+    # hack
     reference = gr.Textbox(label="Reference Transcript", visible=False)
 
     btn.click(
@@ -97,7 +97,7 @@ with gr.Blocks(css="style.css") as demo:
             lmscore,
             wscore_usedefault,
             lmscore_usedefault,
-            reference
+            reference,
         ],
         outputs=[text, logs],
     )
@@ -106,9 +106,21 @@ with gr.Blocks(css="style.css") as demo:
     gr.Examples(
         examples=[
             # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
-            [
-
-
+            [
+                "upload/english/english.mp3",
+                "upload/english/c4_10k_sentences.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
+            [
+                "upload/english/english.mp3",
+                "upload/english/c4_5k_sentences.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
+            [
+                "upload/english/english.mp3",
+                "upload/english/cv8_top10k_words.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
         ],
         inputs=[audio, words_file, reference],
         label="English",
@@ -116,9 +128,21 @@ with gr.Blocks(css="style.css") as demo:
     gr.Examples(
         examples=[
             # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
-            [
-
-
+            [
+                "upload/ligurian/ligurian_1.mp3",
+                "upload/ligurian/zenamt_10k_sentences.txt",
+                "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña.",
+            ],
+            [
+                "upload/ligurian/ligurian_2.mp3",
+                "upload/ligurian/zenamt_10k_sentences.txt",
+                "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato.",
+            ],
+            [
+                "upload/ligurian/ligurian_3.mp3",
+                "upload/ligurian/zenamt_5k_sentences.txt",
+                "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ.",
+            ],
         ],
         inputs=[audio, words_file, reference],
         label="Ligurian",
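The wiring above is worth noting: `reference` is created with `visible=False`, so the textbox never appears in the UI, yet it is listed in both the `btn.click` inputs and the `gr.Examples` inputs, which lets each example row carry a ground-truth transcript alongside its audio and word list. A minimal sketch of the same pattern (placeholder file names and handler, not the Space's actual layout):

import gradio as gr

def run(audio_path, words_file, reference):
    # the hidden reference arrives like any other input
    return f"reference has {len(reference or '')} characters"

with gr.Blocks() as demo:
    audio = gr.Audio(type="filepath")
    words_file = gr.File()
    # visible=False hides the box, but gr.Examples can still fill it
    reference = gr.Textbox(label="Reference Transcript", visible=False)
    text = gr.Textbox()
    btn = gr.Button("Run")
    btn.click(run, inputs=[audio, words_file, reference], outputs=[text])
    gr.Examples(
        examples=[["example.mp3", "words.txt", "a reference transcript"]],
        inputs=[audio, words_file, reference],
    )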
normalization/README.txt ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a6aa5ef11df920fccc933f0d0ff4dd982a2872e0e544ab7409507ad6f130b81
+size 118
normalization/__init__.py ADDED

File without changes
normalization/norm_config.py ADDED

@@ -0,0 +1,276 @@
+import os
+import re
+
+
+colon = ":"
+comma = ","
+exclamation_mark = "!"
+period = re.escape(".")
+question_mark = re.escape("?")
+semicolon = ";"
+
+left_curly_bracket = "{"
+right_curly_bracket = "}"
+quotation_mark = '"'
+
+basic_punc = (
+    period
+    + question_mark
+    + comma
+    + colon
+    + exclamation_mark
+    + left_curly_bracket
+    + right_curly_bracket
+)
+
+# General punctuation unicode block (0x2000-0x206F)
+zero_width_space = r"\u200B"
+zero_width_nonjoiner = r"\u200C"
+left_to_right_mark = r"\u200E"
+right_to_left_mark = r"\u200F"
+left_to_right_embedding = r"\u202A"
+pop_directional_formatting = r"\u202C"
+
+# Some commonly mistyped stand-ins for the apostrophe
+right_single_quotation_mark = r"\u2019"
+left_single_quotation_mark = r"\u2018"
+
+# Language-specific definitions
+# Spanish
+inverted_exclamation_mark = r"\u00A1"
+inverted_question_mark = r"\u00BF"
+
+
+# Hindi
+hindi_danda = r"\u0964"
+
+# Egyptian Arabic
+# arabic_percent = r"\u066A"
+arabic_comma = r"\u060C"
+arabic_question_mark = r"\u061F"
+arabic_semicolon = r"\u061B"
+arabic_diacritics = r"\u064B-\u0652"
+
+
+arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
+
+
+# Chinese
+full_stop = r"\u3002"
+full_comma = r"\uFF0C"
+full_exclamation_mark = r"\uFF01"
+full_question_mark = r"\uFF1F"
+full_semicolon = r"\uFF1B"
+full_colon = r"\uFF1A"
+full_parentheses = r"\uFF08\uFF09"
+quotation_mark_horizontal = r"\u300C-\u300F"
+quotation_mark_vertical = r"\uFE41-\uFE44"  # presentation forms for vertical corner brackets
+title_marks = r"\u3008-\u300B"
+wavy_low_line = r"\uFE4F"
+ellipsis = r"\u22EF"
+enumeration_comma = r"\u3001"
+hyphenation_point = r"\u2027"
+forward_slash = r"\uFF0F"
+wavy_dash = r"\uFF5E"
+box_drawings_light_horizontal = r"\u2500"
+fullwidth_low_line = r"\uFF3F"
+chinese_punc = (
+    full_stop
+    + full_comma
+    + full_exclamation_mark
+    + full_question_mark
+    + full_semicolon
+    + full_colon
+    + full_parentheses
+    + quotation_mark_horizontal
+    + quotation_mark_vertical
+    + title_marks
+    + wavy_low_line
+    + ellipsis
+    + enumeration_comma
+    + hyphenation_point
+    + forward_slash
+    + wavy_dash
+    + box_drawings_light_horizontal
+    + fullwidth_low_line
+)
+
+# Armenian
+armenian_apostrophe = r"\u055A"
+emphasis_mark = r"\u055B"
+armenian_exclamation_mark = r"\u055C"
+armenian_comma = r"\u055D"
+armenian_question_mark = r"\u055E"
+abbreviation_mark = r"\u055F"
+armenian_full_stop = r"\u0589"
+armenian_punc = (
+    armenian_apostrophe
+    + emphasis_mark
+    + armenian_exclamation_mark
+    + armenian_comma
+    + armenian_question_mark
+    + abbreviation_mark
+    + armenian_full_stop
+)
+
+lesser_than_symbol = r"<"
+greater_than_symbol = r">"
+
+lesser_than_sign = r"\u003c"
+greater_than_sign = r"\u003e"
+
+nbsp_written_form = r"\u00A0"  # non-breaking space
+
+# Quotation marks
+left_double_quotes = r"\u201c"
+right_double_quotes = r"\u201d"
+left_double_angle = r"\u00ab"
+right_double_angle = r"\u00bb"
+left_single_angle = r"\u2039"
+right_single_angle = r"\u203a"
+low_double_quotes = r"\u201e"
+low_single_quotes = r"\u201a"
+high_double_quotes = r"\u201f"
+high_single_quotes = r"\u201b"
+
+all_punct_quotes = (
+    left_double_quotes
+    + right_double_quotes
+    + left_double_angle
+    + right_double_angle
+    + left_single_angle
+    + right_single_angle
+    + low_double_quotes
+    + low_single_quotes
+    + high_double_quotes
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+)
+mapping_quotes = (
+    "["
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+    + "]"
+)
+
+
+# Digits
+
+english_digits = r"\u0030-\u0039"
+bengali_digits = r"\u09e6-\u09ef"
+khmer_digits = r"\u17e0-\u17e9"
+devanagari_digits = r"\u0966-\u096f"
+oriya_digits = r"\u0b66-\u0b6f"
+extended_arabic_indic_digits = r"\u06f0-\u06f9"
+kayah_li_digits = r"\ua900-\ua909"
+fullwidth_digits = r"\uff10-\uff19"
+malayam_digits = r"\u0d66-\u0d6f"
+myanmar_digits = r"\u1040-\u1049"
+roman_numeral = r"\u2170-\u2179"
+nominal_digit_shapes = r"\u206f"
+
+# Load punctuations from MMS-lab data
+with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
+    punc_list = punc_f.readlines()
+
+punct_pattern = r""
+for punc in punc_list:
+    # the first field in the tab-separated line is the punctuation mark to be removed
+    punct_pattern += re.escape(punc.split("\t")[0])
+
+shared_digits = (
+    english_digits
+    + bengali_digits
+    + khmer_digits
+    + devanagari_digits
+    + oriya_digits
+    + extended_arabic_indic_digits
+    + kayah_li_digits
+    + fullwidth_digits
+    + malayam_digits
+    + myanmar_digits
+    + roman_numeral
+    + nominal_digit_shapes
+)
+
+shared_punc_list = (
+    basic_punc
+    + all_punct_quotes
+    + greater_than_sign
+    + lesser_than_sign
+    + inverted_question_mark
+    + full_stop
+    + semicolon
+    + armenian_punc
+    + inverted_exclamation_mark
+    + arabic_comma
+    + enumeration_comma
+    + hindi_danda
+    + quotation_mark
+    + arabic_semicolon
+    + arabic_question_mark
+    + chinese_punc
+    + punct_pattern
+)
+
+shared_mapping = {
+    lesser_than_symbol: "",
+    greater_than_symbol: "",
+    nbsp_written_form: "",
+    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
+}
+
+shared_deletion_list = (
+    left_to_right_mark
+    + zero_width_nonjoiner
+    + arabic_subscript_alef_and_inverted_damma
+    + zero_width_space
+    + arabic_diacritics
+    + pop_directional_formatting
+    + right_to_left_mark
+    + left_to_right_embedding
+)
+
+norm_config = {
+    "*": {
+        "lower_case": True,
+        "punc_set": shared_punc_list,
+        "del_set": shared_deletion_list,
+        "mapping": shared_mapping,
+        "digit_set": shared_digits,
+        "unicode_norm": "NFKC",
+        "rm_diacritics": False,
+    }
+}
+
+# =============== Mongolian ===============#
+
+norm_config["mon"] = norm_config["*"].copy()
+# add soft hyphen to the deletion list to match FLEURS
+norm_config["mon"]["del_set"] += r"\u00AD"
+
+norm_config["khk"] = norm_config["mon"].copy()
+
+# =============== Hebrew ===============#
+
+norm_config["heb"] = norm_config["*"].copy()
+# add "HEBREW POINT" symbols to the deletion list to match FLEURS
+norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
+
+# =============== Thai ===============#
+
+norm_config["tha"] = norm_config["*"].copy()
+# add the zero-width joiner to the punctuation set to match FLEURS
+norm_config["tha"]["punc_set"] += r"\u200D"
+
+# =============== Arabic ===============#
+norm_config["ara"] = norm_config["*"].copy()
+# give Arabic its own mapping dict so the shared one is not mutated through the shallow copy
+norm_config["ara"]["mapping"] = {**shared_mapping, "ٱ": "ا"}
+norm_config["arb"] = norm_config["ara"].copy()
+
+# =============== Javanese ===============#
+norm_config["jav"] = norm_config["*"].copy()
+norm_config["jav"]["rm_diacritics"] = True
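Every language entry above starts as a shallow `.copy()` of the shared `"*"` defaults with a few character classes appended, so consumers can resolve a config with a single dict lookup plus a fallback. A small self-contained sketch of that pattern (toy values, not the real character sets):

norm_config = {"*": {"del_set": "", "unicode_norm": "NFKC"}}

norm_config["mon"] = norm_config["*"].copy()
norm_config["mon"]["del_set"] += "\u00AD"  # append the soft hyphen, as the Mongolian entry does

tailored = norm_config.get("mon", norm_config["*"])  # language-specific entry
fallback = norm_config.get("eng", norm_config["*"])  # unknown code falls back to "*"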
normalization/punctuations.lst ADDED

@@ -0,0 +1,188 @@
+7355 INVALID UNICODE 0x81
+5265 INVALID UNICODE 0x90
+75 INVALID UNICODE 0x8
+31 INVALID UNICODE 0x8d
+3 INVALID UNICODE 0x94
+2 INVALID UNICODE 0x8f
+2 INVALID UNICODE 0x1a
+1 INVALID UNICODE 0x9d
+1 INVALID UNICODE 0x93
+1 INVALID UNICODE 0x92
+8647 INVALID UNICODE 0xe295
+6650 INVALID UNICODE 0xf21d
+6234 INVALID UNICODE 0xf62d
+4815 INVALID UNICODE 0xf173
+4789 INVALID UNICODE 0xe514
+4409 INVALID UNICODE 0xe293
+3881 INVALID UNICODE 0xf523
+3788 INVALID UNICODE 0xe233
+2448 INVALID UNICODE 0xf50f
+2177 INVALID UNICODE 0xe232
+1955 INVALID UNICODE 0xea7b
+1926 INVALID UNICODE 0xf172
+973 INVALID UNICODE 0xe290
+972 INVALID UNICODE 0xf519
+661 INVALID UNICODE 0xe292
+591 INVALID UNICODE 0xe328
+509 INVALID UNICODE 0xe2fa
+458 INVALID UNICODE 0xe234
+446 INVALID UNICODE 0xe043
+419 INVALID UNICODE 0xe040
+399 INVALID UNICODE 0xe2fb
+387 INVALID UNICODE 0xe32b
+381 INVALID UNICODE 0xe236
+374 INVALID UNICODE 0xf511
+314 INVALID UNICODE 0xe517
+296 INVALID UNICODE 0xe2fe
+293 INVALID UNICODE 0xe492
+291 INVALID UNICODE 0xf52d
+289 INVALID UNICODE 0xe2fc
+195 INVALID UNICODE 0xf521
+190 INVALID UNICODE 0xe516
+182 INVALID UNICODE 0xe041
+178 INVALID UNICODE 0xf529
+113 INVALID UNICODE 0xe2f9
+87 INVALID UNICODE 0xe2d9
+78 INVALID UNICODE 0xe32a
+76 INVALID UNICODE 0xe291
+74 INVALID UNICODE 0xe296
+66 INVALID UNICODE 0xe518
+52 INVALID UNICODE 0xe32c
+46 INVALID UNICODE 0xe2db
+41 INVALID UNICODE 0xe231
+34 INVALID UNICODE 0xf522
+33 INVALID UNICODE 0xf518
+32 INVALID UNICODE 0xf513
+27 INVALID UNICODE 0xe32d
+25 INVALID UNICODE 0xe32e
+23 INVALID UNICODE 0xe06b
+15 INVALID UNICODE 0xea01
+12 INVALID UNICODE 0xe294
+11 INVALID UNICODE 0xe203
+8 INVALID UNICODE 0xf218
+7 INVALID UNICODE 0xe070
+7 INVALID UNICODE 0xe013
+5 INVALID UNICODE 0xe2de
+4 INVALID UNICODE 0xe493
+3 INVALID UNICODE 0xf7e8
+3 INVALID UNICODE 0xf7d0
+3 INVALID UNICODE 0xe313
+2 INVALID UNICODE 0xe329
+2 INVALID UNICODE 0xe06d
+2 INVALID UNICODE 0xe003
+1 INVALID UNICODE 0xf50e
+1 INVALID UNICODE 0xf171
+1 INVALID UNICODE 0xe01d
+71 NOMINAL DIGIT SHAPES 0x206f
+3 WORD JOINER 0x2060
+―	126545 HORIZONTAL BAR 0x2015
+־	1028 HEBREW PUNCTUATION MAQAF 0x5be
+)	98429 RIGHT PARENTHESIS 0x29
+]	27108 RIGHT SQUARE BRACKET 0x5d
+⌋	1567 RIGHT FLOOR 0x230b
+〕	97 RIGHT TORTOISE SHELL BRACKET 0x3015
+】	36 RIGHT BLACK LENTICULAR BRACKET 0x3011
+﴾	14 ORNATE LEFT PARENTHESIS 0xfd3e
+&	170517 AMPERSAND 0x26
+།	106330 TIBETAN MARK SHAD 0xf0d
+።	90203 ETHIOPIC FULL STOP 0x1362
+፥	60484 ETHIOPIC COLON 0x1365
+༌	60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
+။	51567 MYANMAR SIGN SECTION 0x104b
+/	46929 SOLIDUS 0x2f
+၊	38042 MYANMAR SIGN LITTLE SECTION 0x104a
+·	37985 MIDDLE DOT 0xb7
+‸	36310 CARET 0x2038
+*	34793 ASTERISK 0x2a
+۔	32432 ARABIC FULL STOP 0x6d4
+፤	31906 ETHIOPIC SEMICOLON 0x1364
+၏	21519 MYANMAR SYMBOL GENITIVE 0x104f
+។	20834 KHMER SIGN KHAN 0x17d4
+꓾	15773 LISU PUNCTUATION COMMA 0xa4fe
+᙮	13473 CANADIAN SYLLABICS FULL STOP 0x166e
+꤯	12892 KAYAH LI SIGN SHYA 0xa92f
+⵰	11478 TIFINAGH SEPARATOR MARK 0x2d70
+꓿	11118 LISU PUNCTUATION FULL STOP 0xa4ff
+॥	10763 DEVANAGARI DOUBLE DANDA 0x965
+؞	10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
+၍	8936 MYANMAR SYMBOL COMPLETED 0x104d
+·	8431 GREEK ANO TELEIA 0x387
+†	7477 DAGGER 0x2020
+၌	6632 MYANMAR SYMBOL LOCATIVE 0x104c
+፣	5719 ETHIOPIC COMMA 0x1363
+៖	5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
+꤮	4791 KAYAH LI SIGN CWI 0xa92e
+※	3439 REFERENCE MARK 0x203b
+፦	2727 ETHIOPIC PREFACE COLON 0x1366
+•	1749 BULLET 0x2022
+¶	1507 PILCROW SIGN 0xb6
+၎	1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
+﹖	1224 SMALL QUESTION MARK 0xfe56
+;	975 GREEK QUESTION MARK 0x37e
+…	827 HORIZONTAL ELLIPSIS 0x2026
+%	617 PERCENT SIGN 0x25
+・	468 KATAKANA MIDDLE DOT 0x30fb
+༎	306 TIBETAN MARK NYIS SHAD 0xf0e
+‡	140 DOUBLE DAGGER 0x2021
+#	137 NUMBER SIGN 0x23
+@	125 COMMERCIAL AT 0x40
+፡	121 ETHIOPIC WORDSPACE 0x1361
+៚	55 KHMER SIGN KOOMUUT 0x17da
+៕	49 KHMER SIGN BARIYOOSAN 0x17d5
+﹐	10 SMALL COMMA 0xfe50
+༅	6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
+༄	6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
+．	2 FULLWIDTH FULL STOP 0xff0e
+﹗	2 SMALL EXCLAMATION MARK 0xfe57
+﹕	2 SMALL COLON 0xfe55
+‰	2 PER MILLE SIGN 0x2030
+･	1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
+(	98504 LEFT PARENTHESIS 0x28
+[	27245 LEFT SQUARE BRACKET 0x5b
+⌊	1567 LEFT FLOOR 0x230a
+〔	95 LEFT TORTOISE SHELL BRACKET 0x3014
+【	36 LEFT BLACK LENTICULAR BRACKET 0x3010
+﴿	14 ORNATE RIGHT PARENTHESIS 0xfd3f
+_	4851 LOW LINE 0x5f
+$	72 DOLLAR SIGN 0x24
+€	14 EURO SIGN 0x20ac
+£	2 POUND SIGN 0xa3
+~	27462 TILDE 0x7e
+=	11450 EQUALS SIGN 0x3d
+|	8430 VERTICAL LINE 0x7c
+−	3971 MINUS SIGN 0x2212
+≫	1904 MUCH GREATER-THAN 0x226b
+≪	1903 MUCH LESS-THAN 0x226a
++	1450 PLUS SIGN 0x2b
+＜	345 FULLWIDTH LESS-THAN SIGN 0xff1c
+＞	344 FULLWIDTH GREATER-THAN SIGN 0xff1e
+¬	5 NOT SIGN 0xac
+×	4 MULTIPLICATION SIGN 0xd7
+→	2 RIGHTWARDS ARROW 0x2192
+᙭	537 CANADIAN SYLLABICS CHI SIGN 0x166d
+°	499 DEGREE SIGN 0xb0
+႟	421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
+�	192 REPLACEMENT CHARACTER 0xfffd
+⌟	54 BOTTOM RIGHT CORNER 0x231f
+⌞	54 BOTTOM LEFT CORNER 0x231e
+©	2 COPYRIGHT SIGN 0xa9
+ 	40 NARROW NO-BREAK SPACE 0x202f
+ 	1 SIX-PER-EM SPACE 0x2006
+˜	40261 SMALL TILDE 0x2dc
+^	6469 CIRCUMFLEX ACCENT 0x5e
+¯	20 MACRON 0xaf
+ˇ	191442 CARON 0x2c7
+ⁿ	38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
+ـ	9440 ARABIC TATWEEL 0x640
+ๆ	6766 THAI CHARACTER MAIYAMOK 0xe46
+ៗ	3310 KHMER SIGN LEK TOO 0x17d7
+々	678 IDEOGRAPHIC ITERATION MARK 0x3005
+ໆ	430 LAO KO LA 0xec6
+ー	319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
+ⁱ	137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
+৷	11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
+⅓	26 VULGAR FRACTION ONE THIRD 0x2153
+½	26 VULGAR FRACTION ONE HALF 0xbd
+¼	4 VULGAR FRACTION ONE QUARTER 0xbc
+⅟	1 FRACTION NUMERATOR ONE 0x215f
+⁄	57 FRACTION SLASH 0x2044
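Each entry is tab-separated: the mark itself (invisible for the invalid/control codepoints at the top), its frequency in the MMS-lab data, its Unicode name, and its codepoint. norm_config.py reads only the first field and escapes it into the shared punctuation character class; a small sketch of that parse on two sample entries:

import re

sample_lines = ["&\t170517 AMPERSAND 0x26", "/\t46929 SOLIDUS 0x2f"]
punct_pattern = ""
for line in sample_lines:
    # keep only the first tab-separated field, the mark to strip
    punct_pattern += re.escape(line.split("\t")[0])

print(punct_pattern)  # "&/" -- later wrapped in [...] to form a character class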
normalization/text_norm.py ADDED

@@ -0,0 +1,92 @@
+import re
+import unicodedata
+
+from normalization.norm_config import norm_config
+
+
+def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
+    """Given a text, normalize it by lower-casing, removing punctuation, removing words that contain only digits, and removing extra spaces.
+
+    Args:
+        text : The string to be normalized
+        iso_code : ISO 639-3 language code used to pick a language-specific config; unknown codes fall back to the shared "*" config
+        remove_numbers : Boolean flag to specify if words containing only digits should be removed
+        remove_brackets : Boolean flag to specify if all bracketed text should be removed
+
+    Returns:
+        normalized_text : the string after all normalization
+    """
+
+    config = norm_config.get(iso_code, norm_config["*"])
+
+    # Backfill any field missing from a language-specific config with the shared default
+    for field in ["lower_case", "punc_set", "del_set", "mapping", "digit_set", "unicode_norm", "rm_diacritics"]:
+        if field not in config:
+            config[field] = norm_config["*"][field]
+
+    text = unicodedata.normalize(config["unicode_norm"], text)
+
+    # Convert to lower case
+    if config["lower_case"] and lower_case:
+        text = text.lower()
+
+    # Brackets: always remove bracketed text that contains a digit, which usually
+    # corresponds to verse references such as "(Sam 23:17)"
+    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
+    if remove_brackets:
+        text = re.sub(r"\([^\)]*\)", " ", text)
+
+    # Apply mappings
+    for old, new in config["mapping"].items():
+        text = re.sub(old, new, text)
+
+    # Replace punctuation with spaces
+    punct_pattern = r"[" + config["punc_set"] + "]"
+    normalized_text = re.sub(punct_pattern, " ", text)
+
+    # Remove characters in the delete list
+    delete_pattern = r"[" + config["del_set"] + "]"
+    normalized_text = re.sub(delete_pattern, "", normalized_text)
+
+    # Remove words containing only digits.
+    # We check for 3 cases: a) the text starts with a number, b) a number appears
+    # somewhere in the middle of the text, c) the text ends with a number.
+    # For each case a lookaround pattern checks that the digit run is preceded and
+    # followed by whitespace; only then is it replaced with a space. The lookarounds
+    # allow overlapping matches to be replaced.
+    if remove_numbers:
+        digits_pattern = "[" + config["digit_set"] + "]+"
+        complete_digit_pattern = (
+            r"^"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + r"$"
+        )
+        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)
+
+    if config["rm_diacritics"]:
+        from unidecode import unidecode
+
+        normalized_text = unidecode(normalized_text)
+
+    # Remove extra spaces
+    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
+
+    return normalized_text
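A hedged usage sketch of the function above; the exact output depends on what punctuations.lst contributes to the punctuation class, but for plain ASCII input the steps are easy to trace (lower-case, digit-bearing brackets dropped, punctuation to spaces, digit-only words removed, whitespace collapsed):

from normalization.text_norm import text_normalize

s = 'He said: "It costs 50 dollars!" (Sam 23:17)'
print(text_normalize(s, iso_code="eng"))
# illustrative output: "he said it costs dollars"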
zeroshot.py CHANGED

@@ -9,6 +9,7 @@ import numpy as np
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
+from normalization.text_norm import text_normalize
 
 uroman_dir = "uroman"
 assert os.path.exists(uroman_dir)
@@ -94,6 +95,7 @@ def load_words(filepath):
     with open(filepath) as f:
         for line in f:
             line = line.strip().lower()
+            line = text_normalize(line, iso_code="xxx")
             # ignore invalid words.
             for w in line.split():
                 words.setdefault(w, 0)
@@ -109,7 +111,7 @@ def process(
     lmscore=None,
     wscore_usedefault=True,
     lmscore_usedefault=True,
-    reference=None
+    reference=None,
 ):
     transcription, logs = "", MY_LOG()
     if not audio_data or not words_file:
@@ -169,7 +171,6 @@ def process(
 
     yield transcription, logs.add(f"Lexicon size: {len(lexicon)}")
 
-
    if lm_path is None:
        yield transcription, logs.add(f"Filtering lexicon....")
        lexicon = filter_lexicon(lexicon, word_counts)
@@ -219,8 +220,8 @@ def process(
     yield transcription, logs.add(f"[DONE]")
 
 
-
-
+for i in process("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"):
+    print(i)
 
 
 # for i in process("upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_5k_sentences.txt"):
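Two details of this change are worth spelling out: load_words now normalizes each line before splitting it into words, and the iso_code it passes is "xxx", which is not a key in norm_config, so text_normalize falls back to the shared "*" config. A sketch of the effect on one word-list line:

from normalization.text_norm import text_normalize

line = "Hello, WORLD! (see 3:16) 42".strip().lower()
line = text_normalize(line, iso_code="xxx")  # "xxx" falls back to the "*" config
print(line.split())  # illustrative: ['hello', 'world']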