lucas066001 committed on
Commit
90cfe35
·
1 Parent(s): 592ce8f

style: Formatting files with Black formatter

Browse files
app/travel_resolver/libs/nlp/langage_detection/extractor.py CHANGED
@@ -6,18 +6,18 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
6
 
7
  def extract_data_from_csv(f_in: str, f_out: str):
8
  """
9
- Take a csv file containing strings and convert it
10
- into a csv file containig letter frequencies infos.
11
 
12
- Args:
13
- f_in (str): File path to analyse, must contain extension.
14
- f_out (str): File path containing result, must contain extension.
15
  """
16
 
17
- with open(f_in, 'r') as csv_file:
18
  csv_reader = csv.reader(csv_file)
19
 
20
- with open(f_out, 'w', newline='') as output_csv:
21
  csv_writer = csv.writer(output_csv)
22
 
23
  for row in csv_reader:
@@ -28,14 +28,14 @@ def extract_data_from_csv(f_in: str, f_out: str):
28
 
29
  def extract_data_from_string(str_in: str) -> List:
30
  """
31
- Retreive tab containing letter frequency informations
32
- and special char frequency of a given string.
33
 
34
- Args:
35
- str_in (str): String to analyse.
36
 
37
- Returns:
38
- (List): Tab containing special char and alphabetical frequencies.
39
  """
40
  str_data = []
41
  str_data = str_data + frequence_letters(str_in)
@@ -45,43 +45,45 @@ def extract_data_from_string(str_in: str) -> List:
45
 
46
  def frequence_letters(str_in: str) -> List:
47
  """
48
- Retreive tab containing letter frequency informations
49
- of a given string.
50
 
51
- Args:
52
- str_in (str): String to analyse.
53
 
54
- Returns:
55
- (List): Tab containing alphabetical char frequencies.
56
  """
57
  counter = Counter(str_in.lower())
58
- freq_tab = [round(counter.get(chr(i), 0) / len(counter) * 100, 2)
59
- for i in range(97, 123)]
 
60
  return freq_tab
61
 
62
 
63
  def frequence_char_part(str_in: str) -> List:
64
  """
65
- Retreive tab containing special char frequency
66
- informations of a given string.
67
 
68
- Args:
69
- str_in (str): String to analyse.
70
 
71
- Returns:
72
- (List): Tab containing special char char frequencies.
73
  """
74
 
75
  counter = Counter(str_in.lower())
76
- freq_tab = [round(counter.get(char, 0) / len(str_in) * 100, 2)
77
- for char in var.SPECIAL_CHARS]
 
78
  return freq_tab
79
 
80
 
81
  def main():
82
  for lang in var.TRAD_TARGETS:
83
- input_file = '../../assets/data/prompts/csv/'+lang+'_prompts.csv'
84
- output_csv_file = '../../assets/data/trainset/'+lang+'_trainset.csv'
85
  extract_data_from_csv(input_file, output_csv_file)
86
 
87
 
 
6
 
7
  def extract_data_from_csv(f_in: str, f_out: str):
8
  """
9
+ Take a csv file containing strings and convert it
10
+ into a csv file containig letter frequencies infos.
11
 
12
+ Args:
13
+ f_in (str): File path to analyse, must contain extension.
14
+ f_out (str): File path containing result, must contain extension.
15
  """
16
 
17
+ with open(f_in, "r") as csv_file:
18
  csv_reader = csv.reader(csv_file)
19
 
20
+ with open(f_out, "w", newline="") as output_csv:
21
  csv_writer = csv.writer(output_csv)
22
 
23
  for row in csv_reader:
 
28
 
29
  def extract_data_from_string(str_in: str) -> List:
30
  """
31
+ Retreive tab containing letter frequency informations
32
+ and special char frequency of a given string.
33
 
34
+ Args:
35
+ str_in (str): String to analyse.
36
 
37
+ Returns:
38
+ (List): Tab containing special char and alphabetical frequencies.
39
  """
40
  str_data = []
41
  str_data = str_data + frequence_letters(str_in)
 
45
 
46
def frequence_letters(str_in: str) -> List:
    """
    Retrieve a list containing the frequency (in percent) of each
    lowercase latin letter (a-z) in a given string.

    Args:
        str_in (str): String to analyse.

    Returns:
        (List): 26 rounded percentages, one per letter a-z.
    """
    counter = Counter(str_in.lower())
    total = len(str_in)
    if total == 0:
        # Guard against ZeroDivisionError: an empty string has no letters.
        return [0.0] * 26
    # Frequencies are relative to the full string length, consistent with
    # frequence_char_part (previously divided by len(counter), the number
    # of *distinct* characters, which is not a frequency denominator).
    freq_tab = [
        round(counter.get(chr(i), 0) / total * 100, 2) for i in range(97, 123)
    ]
    return freq_tab
62
 
63
 
64
def frequence_char_part(str_in: str) -> List:
    """
    Retrieve a list containing the frequency (in percent) of each
    special character of a given string.

    Args:
        str_in (str): String to analyse.

    Returns:
        (List): One rounded percentage per entry of var.SPECIAL_CHARS.
    """

    counter = Counter(str_in.lower())
    total = len(str_in)
    if total == 0:
        # Guard against ZeroDivisionError on empty input.
        return [0.0] * len(var.SPECIAL_CHARS)
    freq_tab = [
        round(counter.get(char, 0) / total * 100, 2) for char in var.SPECIAL_CHARS
    ]
    return freq_tab
81
 
82
 
83
def main():
    """
    Build one letter-frequency trainset CSV per translation target
    language listed in var.TRAD_TARGETS.
    """
    for lang in var.TRAD_TARGETS:
        # f-strings over string concatenation for path building.
        input_file = f"../../assets/data/prompts/csv/{lang}_prompts.csv"
        output_csv_file = f"../../assets/data/trainset/{lang}_trainset.csv"
        extract_data_from_csv(input_file, output_csv_file)
88
 
89
 
app/travel_resolver/libs/nlp/langage_detection/traducer.py CHANGED
@@ -6,37 +6,37 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
6
 
7
  def traduce_into_csv(f_in: str, f_out: str, target_lang: str):
8
  """
9
- Take an input file that contains french text
10
- and translate it into a csv file.
11
 
12
- Args:
13
- f_in (str): File path to analyse, must contain extension.
14
- f_out (str): File path containing result, must contain extension.
15
- target_lang (str): Key representing output langage.
16
  """
17
 
18
  translator = deepl.Translator(os.getenv(var.ENV_AUTH_KEY))
19
 
20
- with open(f_in, 'r') as csv_file:
21
  csv_reader = csv.reader(csv_file)
22
 
23
- with open(f_out, 'w', newline='') as output_csv:
24
  csv_writer = csv.writer(output_csv)
25
  for row in csv_reader:
26
  str = "".join(row).lower()
27
 
28
- str = translator.translate_text(str,
29
- target_lang=target_lang,
30
- source_lang=var.FR)
31
  modified_row = [str]
32
  csv_writer.writerow(modified_row)
33
 
34
 
35
  def main():
36
  for lang in var.TRAD_TARGETS:
37
- source = '../../../../data/langage_detection/prompts/FR_prompts.csv'
38
- output_csv_file = '../../../../data/langage_detection/'
39
- output_csv_file += lang+'_prompts.csv'
40
 
41
  traduce_into_csv(source, output_csv_file, lang)
42
 
 
6
 
7
def traduce_into_csv(f_in: str, f_out: str, target_lang: str):
    """
    Take an input file that contains French text
    and translate it into a csv file.

    Args:
        f_in (str): File path to analyse, must contain extension.
        f_out (str): File path containing result, must contain extension.
        target_lang (str): Key representing the output language.
    """

    translator = deepl.Translator(os.getenv(var.ENV_AUTH_KEY))

    with open(f_in, "r") as csv_file:
        csv_reader = csv.reader(csv_file)

        with open(f_out, "w", newline="") as output_csv:
            csv_writer = csv.writer(output_csv)
            for row in csv_reader:
                # Use a dedicated name instead of shadowing the builtin `str`.
                text = "".join(row).lower()

                text = translator.translate_text(
                    text, target_lang=target_lang, source_lang=var.FR
                )
                modified_row = [text]
                csv_writer.writerow(modified_row)
33
 
34
 
35
def main():
    """
    Translate the French prompts CSV into every target language
    listed in var.TRAD_TARGETS.
    """
    for lang in var.TRAD_TARGETS:
        source = "../../../../data/langage_detection/prompts/FR_prompts.csv"
        # f-string over += concatenation for path building.
        output_csv_file = f"../../../../data/langage_detection/{lang}_prompts.csv"

        traduce_into_csv(source, output_csv_file, lang)
42
 
app/travel_resolver/libs/nlp/langage_detection/trainer.py CHANGED
@@ -9,15 +9,15 @@ import travel_resolver.libs.nlp.langage_detection.variables as var
9
 
10
  def read_data():
11
  """
12
- Retreive and format data from csv input files
13
  """
14
  x, y = [], []
15
  i = 1
16
  for lang in var.CORRESP_LANG:
17
  first = True
18
  current_file = "../../../../data/langage_detection/trainset/"
19
- current_file += lang+"_trainset.csv"
20
- with open(current_file, 'r') as csv_file:
21
  csv_reader = csv.reader(csv_file)
22
  for row in csv_reader:
23
  if not first:
@@ -31,7 +31,7 @@ def read_data():
31
 
32
  def train():
33
  """
34
- Train the model and generate a backup.
35
  """
36
  x_train, x_test, y_train, y_test = read_data()
37
 
 
9
 
10
  def read_data():
11
  """
12
+ Retreive and format data from csv input files
13
  """
14
  x, y = [], []
15
  i = 1
16
  for lang in var.CORRESP_LANG:
17
  first = True
18
  current_file = "../../../../data/langage_detection/trainset/"
19
+ current_file += lang + "_trainset.csv"
20
+ with open(current_file, "r") as csv_file:
21
  csv_reader = csv.reader(csv_file)
22
  for row in csv_reader:
23
  if not first:
 
31
 
32
  def train():
33
  """
34
+ Train the model and generate a backup.
35
  """
36
  x_train, x_test, y_train, y_test = read_data()
37
 
app/travel_resolver/libs/nlp/langage_detection/variables.py CHANGED
@@ -12,19 +12,6 @@ ES = "ES"
12
  PT = "PT-PT"
13
  DE = "DE"
14
 
15
- TRAD_TARGETS = [
16
- EN,
17
- IT,
18
- ES,
19
- PT,
20
- DE
21
- ]
22
 
23
- CORRESP_LANG = [
24
- "FR",
25
- "EN-GB",
26
- "IT",
27
- "ES",
28
- "PT-PT",
29
- "DE"
30
- ]
 
12
  PT = "PT-PT"
13
  DE = "DE"
14
 
15
+ TRAD_TARGETS = [EN, IT, ES, PT, DE]
 
 
 
 
 
 
16
 
17
+ CORRESP_LANG = ["FR", "EN-GB", "IT", "ES", "PT-PT", "DE"]
 
 
 
 
 
 
 
data/scripting_lcs_1/script.py CHANGED
@@ -7,20 +7,20 @@ from typing import List
7
 
8
  def make_unique_lignes(f_in: str, f_out: str) -> int:
9
  """
10
- Delete all duplicate lignes of a file.
11
 
12
- Args:
13
- f_in (str): File path to analyse, must contain extension.
14
- f_out (str): File path containing result, must contain extension.
15
 
16
- Returns:
17
- (int): The number of duplicate lignes found.
18
  """
19
 
20
  seen_lignes: set = set()
21
  duplicates: int = 0
22
 
23
- with open(f_in, 'r') as in_f, open(f_out, 'w') as out_f:
24
  for ligne in in_f:
25
  if ligne not in seen_lignes:
26
  out_f.write(ligne)
@@ -33,51 +33,51 @@ def make_unique_lignes(f_in: str, f_out: str) -> int:
33
 
34
  def count_file_lignes(f_path: str) -> int:
35
  """
36
- Count the number of lines in a file.
37
 
38
- Args:
39
- f_path (str): File path to analyse, must contain extension.
40
 
41
- Returns:
42
- (int): The number of lignes found.
43
  """
44
 
45
- with open(f_path, 'r') as f:
46
  lignes = f.readlines()
47
  return len(lignes)
48
 
49
 
50
  def get_cities() -> List:
51
  """
52
- Returns all cities from sncf db_file.
53
 
54
- Returns:
55
- (List): All cities present in file.
56
  """
57
  villes = []
58
- with open("../sncf_stations_database.csv", 'r') as csvfile:
59
  reader = csv.DictReader(csvfile, delimiter=";")
60
  for row in reader:
61
- villes.append(row['COMMUNE'])
62
  return villes
63
 
64
 
65
  def generate_data(cities: List, file_out: str):
66
  """
67
- Generate dataset from template file.
68
 
69
- Args:
70
- cities (List): Cities from wich combinaison will generate.
71
- file_out (str): Output file, must contain extension.
72
  """
73
 
74
  used_comp = set()
75
  cities = get_cities()
76
 
77
- with open("data_unique_tmp.txt", 'r') as f_template:
78
  template_ligne = f_template.readlines()
79
 
80
- with open(file_out, 'w') as f_sortie:
81
  while len(used_comp) < 75000:
82
 
83
  arrival_city = random.choice(cities)
 
7
 
8
  def make_unique_lignes(f_in: str, f_out: str) -> int:
9
  """
10
+ Delete all duplicate lignes of a file.
11
 
12
+ Args:
13
+ f_in (str): File path to analyse, must contain extension.
14
+ f_out (str): File path containing result, must contain extension.
15
 
16
+ Returns:
17
+ (int): The number of duplicate lignes found.
18
  """
19
 
20
  seen_lignes: set = set()
21
  duplicates: int = 0
22
 
23
+ with open(f_in, "r") as in_f, open(f_out, "w") as out_f:
24
  for ligne in in_f:
25
  if ligne not in seen_lignes:
26
  out_f.write(ligne)
 
33
 
34
def count_file_lignes(f_path: str) -> int:
    """
    Count the number of lines in a file.

    Args:
        f_path (str): File path to analyse, must contain extension.

    Returns:
        (int): The number of lines found.
    """

    with open(f_path, "r") as f:
        # Stream the file line by line instead of materializing the
        # whole content in memory with readlines().
        return sum(1 for _ in f)
48
 
49
 
50
def get_cities() -> List:
    """
    Return every city listed in the sncf stations database file.

    Returns:
        (List): All cities present in file.
    """
    with open("../sncf_stations_database.csv", "r") as csvfile:
        # Read the COMMUNE column of each row, in file order.
        rows = csv.DictReader(csvfile, delimiter=";")
        return [row["COMMUNE"] for row in rows]
63
 
64
 
65
  def generate_data(cities: List, file_out: str):
66
  """
67
+ Generate dataset from template file.
68
 
69
+ Args:
70
+ cities (List): Cities from wich combinaison will generate.
71
+ file_out (str): Output file, must contain extension.
72
  """
73
 
74
  used_comp = set()
75
  cities = get_cities()
76
 
77
+ with open("data_unique_tmp.txt", "r") as f_template:
78
  template_ligne = f_template.readlines()
79
 
80
+ with open(file_out, "w") as f_sortie:
81
  while len(used_comp) < 75000:
82
 
83
  arrival_city = random.choice(cities)