David Dale committed on
Commit
8d96c36
·
1 Parent(s): eafce3d

Add a data browser

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +5 -0
  3. data_samples.py +74 -0
  4. requirements.txt +1 -0
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 💐
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 5.14.0
8
  app_file: app.py
9
  pinned: false
10
  license: other
 
4
  colorFrom: green
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
10
  license: other
app.py CHANGED
@@ -11,6 +11,7 @@ import pandas as pd
11
  from collections import defaultdict
12
 
13
  from leaderboard import leaderboard_tab
 
14
 
15
  DLA = """
16
  Note that the dataset collected by the BOUQuET initiative and your contributions to this dataset will be released under the following open source license.
@@ -243,6 +244,9 @@ with gr.Blocks(
243
  css="""
244
  #cla textarea {min-height: 60em;}
245
  main.app {max-width: 90em; margin: auto;}
 
 
 
246
  """,
247
  theme=gr.themes.Glass(
248
  font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]
@@ -256,6 +260,7 @@ with gr.Blocks(
256
  """)
257
  intro_tab()
258
  leaderboard_tab()
 
259
  guidelines_tab()
260
  dla_tab()
261
 
 
11
  from collections import defaultdict
12
 
13
  from leaderboard import leaderboard_tab
14
+ from data_samples import data_browse_tab
15
 
16
  DLA = """
17
  Note that the dataset collected by the BOUQuET initiative and your contributions to this dataset will be released under the following open source license.
 
244
  css="""
245
  #cla textarea {min-height: 60em;}
246
  main.app {max-width: 90em; margin: auto;}
247
+ .small-font {
248
+ font-size: 0.8em;
249
+ }
250
  """,
251
  theme=gr.themes.Glass(
252
  font=[gr.themes.GoogleFont("Roboto"), "Arial", "sans-serif"]
 
260
  """)
261
  intro_tab()
262
  leaderboard_tab()
263
+ data_browse_tab()
264
  guidelines_tab()
265
  dla_tab()
266
 
data_samples.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import datasets
3
+ import pandas as pd
4
+ import gradio as gr
5
+ import csv
6
+ from collections import defaultdict
7
+ import random
8
+
9
+ INTRO = """
10
+ The table below demonstrates a sample paragraph from the dev split of BOUQuET.
11
+ """
12
+ ALL = "All"
13
+
14
+
15
def data_browse_tab():
    """Build the "Data samples" tab, an interactive browser of BOUQuET paragraphs.

    Must be called inside an active ``gr.Blocks`` context (it creates a
    ``gr.Tab`` and wires event handlers). The ``dev`` split of the
    ``sentence_level`` config of ``facebook/bouquet`` is loaded once, when the
    tab is built, and kept in memory for all subsequent interactions.

    The tab offers dropdowns for source language, target language, domain and
    paragraph ID, a button that samples a random paragraph (restricted to the
    currently selected domain), and a table with the sentences of the selected
    paragraph in both languages.
    """
    # Load the data
    ds = datasets.load_dataset("facebook/bouquet", "sentence_level", split="dev")
    long_df = ds.to_pandas()
    # One frame per source language, re-indexed from 0 so that frames of
    # different languages can be combined by row position.
    lang2df = {
        lang: part.drop(columns=["tgt_text", "tgt_lang"]).reset_index(drop=True)
        for lang, part in long_df.groupby("src_lang")
    }
    # English is used as the reference for the lists of domains and paragraphs.
    eng_df = lang2df["eng_Latn"]
    langs = sorted(lang2df)
    domains = sorted(set(eng_df["domain"]))
    paragraph_ids = sorted(set(eng_df["par_id"]))
    domain2par_ids = {
        domain: sorted(set(group["par_id"]))
        for domain, group in eng_df.groupby("domain")
    }
    display_columns = [
        "domain", "uniq_id", "orig_text", "src_text", "tgt_text", "tags", "register",
    ]

    def build_table(src_lang, tgt_lang, par_id):
        """Return the rows of paragraph `par_id` with source and target texts side by side."""
        src_df = lang2df[src_lang]
        tgt_df = lang2df[tgt_lang]
        df = src_df.copy()
        # NOTE(review): the target text is attached by row position, which
        # assumes every language lists the same sentences in the same order
        # - confirm against the dataset, or join on a shared sentence key.
        df["tgt_text"] = tgt_df["src_text"]
        par = df[df["par_id"].eq(par_id)].copy()
        # TODO: add 'par_comment' in a text field below
        return par[display_columns]

    def select_data(src_lang, tgt_lang, par_id):
        """Event handler: refresh the table for the current selection."""
        return gr.update(value=build_table(src_lang, tgt_lang, par_id), wrap=True)

    def par_ids_for(domain):
        """Return the paragraph IDs selectable under the given domain filter."""
        if domain == ALL:
            return paragraph_ids
        return domain2par_ids[domain]

    def sample_par_id(domain):
        """Pick a random paragraph ID among those allowed by the domain filter."""
        # Sampling from the full list could yield an ID that is not among the
        # dropdown's current (domain-restricted) choices.
        return random.choice(par_ids_for(domain))

    def change_domain(domain, par_id):
        """Restrict the paragraph dropdown to `domain`, keeping `par_id` if still valid."""
        par_ids = par_ids_for(domain)
        if par_id not in par_ids:
            par_id = random.choice(par_ids)
        return gr.Dropdown(choices=par_ids, value=par_id)

    with gr.Tab("Data samples"):
        gr.Markdown("# BOUQuET data browser")
        gr.Markdown(INTRO)
        # Define the controls
        with gr.Row():
            gr_src_lang = gr.Dropdown(langs, label="Source lang", value=random.choice(langs))
            gr_tgt_lang = gr.Dropdown(langs, label="Target lang", value=random.choice(langs))
            gr_domain = gr.Dropdown([ALL] + domains, label="Domain", value=ALL)
            gr_par_id = gr.Dropdown(
                paragraph_ids, label="Paragraph ID", value=random.choice(paragraph_ids)
            )
            inputs = [gr_src_lang, gr_tgt_lang, gr_par_id]
            gr_sample_btn = gr.Button(value="Sample a paragraph")
            gr_sample_btn.click(fn=sample_par_id, inputs=gr_domain, outputs=gr_par_id)

        # Define the data. The component is initialised with the DataFrame
        # itself: a `gr.update(...)` dict is only meaningful as the return
        # value of an event handler, not as a constructor value.
        df_all = build_table(*[inp.value for inp in inputs])
        gr_df = gr.Dataframe(
            df_all,
            wrap=True,
            show_fullscreen_button=True,
            column_widths=["10%", "5%", "20%", "20%", "20%", "15%", "6%"],
            elem_classes=["small-font"],
        )
        # Interactivity: any change of language or paragraph refreshes the table.
        for inp in inputs:
            inp.change(fn=select_data, inputs=inputs, outputs=gr_df)

        # Changing the domain narrows the paragraph choices; the resulting
        # paragraph change (if any) then refreshes the table via the handler above.
        gr_domain.change(fn=change_domain, inputs=[gr_domain, gr_par_id], outputs=[gr_par_id])
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  gradio[oauth]
2
  pandas
3
  matplotlib # for background_gradient
 
 
1
  gradio[oauth]
2
  pandas
3
  matplotlib # for background_gradient
4
+ datasets