Spaces:

Az-r-ow
/

TravelNER

Sleeping

App Files Community

lucas066001 committed on Sep 13, 2024

Commit

90cfe35

1 Parent(s): 592ce8f

style: Formatting files with Black formatter

Browse files

Files changed (5) hide show

app/travel_resolver/libs/nlp/langage_detection/extractor.py +33 -31
app/travel_resolver/libs/nlp/langage_detection/traducer.py +14 -14
app/travel_resolver/libs/nlp/langage_detection/trainer.py +4 -4
app/travel_resolver/libs/nlp/langage_detection/variables.py +2 -15
data/scripting_lcs_1/script.py +24 -24

app/travel_resolver/libs/nlp/langage_detection/extractor.py CHANGED Viewed

@@ -6,18 +6,18 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
 def extract_data_from_csv(f_in: str, f_out: str):
     """
-        Take a csv file containing strings and convert it
-        into a csv file containig letter frequencies infos.
-        Args:
-            f_in (str): File path to analyse, must contain extension.
-            f_out (str): File path containing result, must contain extension.
     """
-    with open(f_in, 'r') as csv_file:
         csv_reader = csv.reader(csv_file)
-        with open(f_out, 'w', newline='') as output_csv:
             csv_writer = csv.writer(output_csv)
             for row in csv_reader:
@@ -28,14 +28,14 @@ def extract_data_from_csv(f_in: str, f_out: str):
 def extract_data_from_string(str_in: str) -> List:
     """
-        Retreive tab containing letter frequency informations
-        and special char frequency of a given string.
-        Args:
-            str_in (str): String to analyse.
-        Returns:
-            (List): Tab containing special char and alphabetical frequencies.
     """
     str_data = []
     str_data = str_data + frequence_letters(str_in)
@@ -45,43 +45,45 @@ def extract_data_from_string(str_in: str) -> List:
 def frequence_letters(str_in: str) -> List:
     """
-        Retreive tab containing letter frequency informations
-        of a given string.
-        Args:
-            str_in (str): String to analyse.
-        Returns:
-            (List): Tab containing alphabetical char frequencies.
     """
     counter = Counter(str_in.lower())
-    freq_tab = [round(counter.get(chr(i), 0) / len(counter) * 100, 2)
-                for i in range(97, 123)]
     return freq_tab
 def frequence_char_part(str_in: str) -> List:
     """
-        Retreive tab containing special char frequency
-        informations of a given string.
-        Args:
-            str_in (str): String to analyse.
-        Returns:
-            (List): Tab containing special char char frequencies.
     """
     counter = Counter(str_in.lower())
-    freq_tab = [round(counter.get(char, 0) / len(str_in) * 100, 2)
-                for char in var.SPECIAL_CHARS]
     return freq_tab
 def main():
     for lang in var.TRAD_TARGETS:
-        input_file = '../../assets/data/prompts/csv/'+lang+'_prompts.csv'
-        output_csv_file = '../../assets/data/trainset/'+lang+'_trainset.csv'
         extract_data_from_csv(input_file, output_csv_file)

 def extract_data_from_csv(f_in: str, f_out: str):
     """
+    Take a csv file containing strings and convert it
+    into a csv file containig letter frequencies infos.
+    Args:
+        f_in (str): File path to analyse, must contain extension.
+        f_out (str): File path containing result, must contain extension.
     """
+    with open(f_in, "r") as csv_file:
         csv_reader = csv.reader(csv_file)
+        with open(f_out, "w", newline="") as output_csv:
             csv_writer = csv.writer(output_csv)
             for row in csv_reader:
 def extract_data_from_string(str_in: str) -> List:
     """
+    Retreive tab containing letter frequency informations
+    and special char frequency of a given string.
+    Args:
+        str_in (str): String to analyse.
+    Returns:
+        (List): Tab containing special char and alphabetical frequencies.
     """
     str_data = []
     str_data = str_data + frequence_letters(str_in)
 def frequence_letters(str_in: str) -> List:
     """
+    Retreive tab containing letter frequency informations
+    of a given string.
+    Args:
+        str_in (str): String to analyse.
+    Returns:
+        (List): Tab containing alphabetical char frequencies.
     """
     counter = Counter(str_in.lower())
+    freq_tab = [
+        round(counter.get(chr(i), 0) / len(counter) * 100, 2) for i in range(97, 123)
+    ]
     return freq_tab
 def frequence_char_part(str_in: str) -> List:
     """
+    Retreive tab containing special char frequency
+    informations of a given string.
+    Args:
+        str_in (str): String to analyse.
+    Returns:
+        (List): Tab containing special char char frequencies.
     """
     counter = Counter(str_in.lower())
+    freq_tab = [
+        round(counter.get(char, 0) / len(str_in) * 100, 2) for char in var.SPECIAL_CHARS
+    ]
     return freq_tab
 def main():
     for lang in var.TRAD_TARGETS:
+        input_file = "../../assets/data/prompts/csv/" + lang + "_prompts.csv"
+        output_csv_file = "../../assets/data/trainset/" + lang + "_trainset.csv"
         extract_data_from_csv(input_file, output_csv_file)

app/travel_resolver/libs/nlp/langage_detection/traducer.py CHANGED Viewed

@@ -6,37 +6,37 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
 def traduce_into_csv(f_in: str, f_out: str, target_lang: str):
     """
-        Take an input file that contains french text
-        and translate it into a csv file.
-        Args:
-            f_in (str): File path to analyse, must contain extension.
-            f_out (str): File path containing result, must contain extension.
-            target_lang (str): Key representing output langage.
     """
     translator = deepl.Translator(os.getenv(var.ENV_AUTH_KEY))
-    with open(f_in, 'r') as csv_file:
         csv_reader = csv.reader(csv_file)
-        with open(f_out, 'w', newline='') as output_csv:
             csv_writer = csv.writer(output_csv)
             for row in csv_reader:
                 str = "".join(row).lower()
-                str = translator.translate_text(str,
-                                                target_lang=target_lang,
-                                                source_lang=var.FR)
                 modified_row = [str]
                 csv_writer.writerow(modified_row)
 def main():
     for lang in var.TRAD_TARGETS:
-        source = '../../../../data/langage_detection/prompts/FR_prompts.csv'
-        output_csv_file = '../../../../data/langage_detection/'
-        output_csv_file += lang+'_prompts.csv'
         traduce_into_csv(source, output_csv_file, lang)

 def traduce_into_csv(f_in: str, f_out: str, target_lang: str):
     """
+    Take an input file that contains french text
+    and translate it into a csv file.
+    Args:
+        f_in (str): File path to analyse, must contain extension.
+        f_out (str): File path containing result, must contain extension.
+        target_lang (str): Key representing output langage.
     """
     translator = deepl.Translator(os.getenv(var.ENV_AUTH_KEY))
+    with open(f_in, "r") as csv_file:
         csv_reader = csv.reader(csv_file)
+        with open(f_out, "w", newline="") as output_csv:
             csv_writer = csv.writer(output_csv)
             for row in csv_reader:
                 str = "".join(row).lower()
+                str = translator.translate_text(
+                    str, target_lang=target_lang, source_lang=var.FR
+                )
                 modified_row = [str]
                 csv_writer.writerow(modified_row)
 def main():
     for lang in var.TRAD_TARGETS:
+        source = "../../../../data/langage_detection/prompts/FR_prompts.csv"
+        output_csv_file = "../../../../data/langage_detection/"
+        output_csv_file += lang + "_prompts.csv"
         traduce_into_csv(source, output_csv_file, lang)

app/travel_resolver/libs/nlp/langage_detection/trainer.py CHANGED Viewed

@@ -9,15 +9,15 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
 def read_data():
     """
-        Retreive and format data from csv input files
     """
     x, y = [], []
     i = 1
     for lang in var.CORRESP_LANG:
         first = True
         current_file = "../../../../data/langage_detection/trainset/"
-        current_file += lang+"_trainset.csv"
-        with open(current_file, 'r') as csv_file:
             csv_reader = csv.reader(csv_file)
             for row in csv_reader:
                 if not first:
@@ -31,7 +31,7 @@ def read_data():
 def train():
     """
-        Train the model and generate a backup.
     """
     x_train, x_test, y_train, y_test = read_data()

 def read_data():
     """
+    Retreive and format data from csv input files
     """
     x, y = [], []
     i = 1
     for lang in var.CORRESP_LANG:
         first = True
         current_file = "../../../../data/langage_detection/trainset/"
+        current_file += lang + "_trainset.csv"
+        with open(current_file, "r") as csv_file:
             csv_reader = csv.reader(csv_file)
             for row in csv_reader:
                 if not first:
 def train():
     """
+    Train the model and generate a backup.
     """
     x_train, x_test, y_train, y_test = read_data()

app/travel_resolver/libs/nlp/langage_detection/variables.py CHANGED Viewed

@@ -12,19 +12,6 @@ ES = "ES"
 PT = "PT-PT"
 DE = "DE"
-TRAD_TARGETS = [
-    EN,
-    IT,
-    ES,
-    PT,
-    DE
-]
-CORRESP_LANG = [
-    "FR",
-    "EN-GB",
-    "IT",
-    "ES",
-    "PT-PT",
-    "DE"
-]

 PT = "PT-PT"
 DE = "DE"
+TRAD_TARGETS = [EN, IT, ES, PT, DE]
+CORRESP_LANG = ["FR", "EN-GB", "IT", "ES", "PT-PT", "DE"]

data/scripting_lcs_1/script.py CHANGED Viewed

@@ -7,20 +7,20 @@ from typing import List
 def make_unique_lignes(f_in: str, f_out: str) -> int:
     """
-        Delete all duplicate lignes of a file.
-        Args:
-            f_in (str): File path to analyse, must contain extension.
-            f_out (str): File path containing result, must contain extension.
-        Returns:
-            (int): The number of duplicate lignes found.
     """
     seen_lignes: set = set()
     duplicates: int = 0
-    with open(f_in, 'r') as in_f, open(f_out, 'w') as out_f:
         for ligne in in_f:
             if ligne not in seen_lignes:
                 out_f.write(ligne)
@@ -33,51 +33,51 @@ def make_unique_lignes(f_in: str, f_out: str) -> int:
 def count_file_lignes(f_path: str) -> int:
     """
-        Count the number of lines in a file.
-        Args:
-            f_path (str): File path to analyse, must contain extension.
-        Returns:
-            (int): The number of lignes found.
     """
-    with open(f_path, 'r') as f:
         lignes = f.readlines()
         return len(lignes)
 def get_cities() -> List:
     """
-        Returns all cities from sncf db_file.
-        Returns:
-            (List): All cities present in file.
     """
     villes = []
-    with open("../sncf_stations_database.csv", 'r') as csvfile:
         reader = csv.DictReader(csvfile, delimiter=";")
         for row in reader:
-            villes.append(row['COMMUNE'])
     return villes
 def generate_data(cities: List, file_out: str):
     """
-        Generate dataset from template file.
-        Args:
-            cities (List): Cities from wich combinaison will generate.
-            file_out (str): Output file, must contain extension.
     """
     used_comp = set()
     cities = get_cities()
-    with open("data_unique_tmp.txt", 'r') as f_template:
         template_ligne = f_template.readlines()
-    with open(file_out, 'w') as f_sortie:
         while len(used_comp) < 75000:
             arrival_city = random.choice(cities)

 def make_unique_lignes(f_in: str, f_out: str) -> int:
     """
+    Delete all duplicate lignes of a file.
+    Args:
+        f_in (str): File path to analyse, must contain extension.
+        f_out (str): File path containing result, must contain extension.
+    Returns:
+        (int): The number of duplicate lignes found.
     """
     seen_lignes: set = set()
     duplicates: int = 0
+    with open(f_in, "r") as in_f, open(f_out, "w") as out_f:
         for ligne in in_f:
             if ligne not in seen_lignes:
                 out_f.write(ligne)
 def count_file_lignes(f_path: str) -> int:
     """
+    Count the number of lines in a file.
+    Args:
+        f_path (str): File path to analyse, must contain extension.
+    Returns:
+        (int): The number of lignes found.
     """
+    with open(f_path, "r") as f:
         lignes = f.readlines()
         return len(lignes)
 def get_cities() -> List:
     """
+    Returns all cities from sncf db_file.
+    Returns:
+        (List): All cities present in file.
     """
     villes = []
+    with open("../sncf_stations_database.csv", "r") as csvfile:
         reader = csv.DictReader(csvfile, delimiter=";")
         for row in reader:
+            villes.append(row["COMMUNE"])
     return villes
 def generate_data(cities: List, file_out: str):
     """
+    Generate dataset from template file.
+    Args:
+        cities (List): Cities from wich combinaison will generate.
+        file_out (str): Output file, must contain extension.
     """
     used_comp = set()
     cities = get_cities()
+    with open("data_unique_tmp.txt", "r") as f_template:
         template_ligne = f_template.readlines()
+    with open(file_out, "w") as f_sortie:
         while len(used_comp) < 75000:
             arrival_city = random.choice(cities)