Spaces:

shubh2014shiv
/

Japanese_NLP

Runtime error

App Files Files Community

shubh2014shiv committed on Nov 21, 2021

Commit

e9cddb1

1 Parent(s): 98af3bf

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -1

app.py CHANGED Viewed

@@ -9,11 +9,49 @@ from st_aggrid.shared import GridUpdateMode
 from transformers import T5Tokenizer, BertForSequenceClassification,AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import numpy as np
 st.set_page_config(layout="wide")
 st.title("Project - Japanese Natural Language Processing (自然言語処理) using Transformers")
 st.sidebar.subheader("自然言語処理 トピック")
-topic = st.sidebar.radio(label="Select the NLP project topics", options=["Sentiment Analysis","Text Summarization"])
 st.write("-" * 5)
 jp_review_text = None
@@ -235,3 +273,89 @@ elif topic == "Text Summarization":
                 unsafe_allow_html=True)
             st.write(summary)

 from transformers import T5Tokenizer, BertForSequenceClassification,AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import numpy as np
+import json
+from transformers import AutoTokenizer, BertTokenizer, AutoModelWithLMHead
+import pytorch_lightning as pl
+from pathlib import Path
+# Translation model wrapper and loader helpers; the loader functions are decorated with st.experimental_singleton so Streamlit caches their results
+class TranslationModel(pl.LightningModule):
+    def __init__(self):
+        super().__init__()
+        self.model = AutoModelWithLMHead.from_pretrained("Helsinki-NLP/opus-mt-ja-en", return_dict=True)
+@st.experimental_singleton
+def loadFineTunedJaEn_NMT_Model():
+    save_dest = Path('model')
+    save_dest.mkdir(exist_ok=True)
+    f_checkpoint = Path("model/best-checkpoint.ckpt")
+    if not f_checkpoint.exists():
+        with st.spinner("Downloading model.This may take a while! \n Don't refresh or close this page!"):
+            from GD_download import download_file_from_google_drive
+            download_file_from_google_drive('1CZQKGj9hSqj7kEuJp_jm7bNVXrbcFsgP', f_checkpoint)
+    trained_model = TranslationModel.load_from_checkpoint(f_checkpoint)
+    return trained_model
+@st.experimental_singleton
+def getJpEn_Tokenizers():
+    try:
+        with st.spinner("Downloading English and Japanese Transformer Tokenizers"):
+            ja_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ja-en")
+            en_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    except:
+        st.error("Issue with downloading tokenizers")
+    return ja_tokenizer, en_tokenizer
 st.set_page_config(layout="wide")
 st.title("Project - Japanese Natural Language Processing (自然言語処理) using Transformers")
 st.sidebar.subheader("自然言語処理 トピック")
+topic = st.sidebar.radio(label="Select the NLP project topics", options=["Sentiment Analysis","Text Summarization","Japanese to English Translation"])
 st.write("-" * 5)
 jp_review_text = None
                 unsafe_allow_html=True)
             st.write(summary)
+elif topic == "Japanese to English Translation":
+    st.markdown(
+        "<h2 style='text-align: left; color:#EE82EE; font-size:25px;'><b>Japanese to English translation (for short sentences)<b></h2>",
+        unsafe_allow_html=True)
+    st.markdown(
+        "<h3 style='text-align: center; color:#F63366; font-size:18px;'><b>Business Scene Dialog Japanese-English Corpus<b></h3>",
+        unsafe_allow_html=True)
+    st.write("Below given Japanese-English pair is from 'Business Scene Dialog Corpus' by the University of Tokyo")
+    link = '[Corpus GitHub Link](https://github.com/tsuruoka-lab/BSD)'
+    st.markdown(link, unsafe_allow_html=True)
+    bsd_more_info = st.expander(label="Expand to get more information on data and training report")
+    with bsd_more_info:
+        st.markdown(
+            "<h3 style='text-align: left; color:#F63366; font-size:12px;'><b>Training Dataset<b></h3>",
+            unsafe_allow_html=True)
+        st.write("The corpus has total 20,000 Japanese-English Business Dialog pairs. The fined-tuned Transformer model is validated on 670 Japanese-English Business Dialog pairs")
+        st.markdown(
+            "<h3 style='text-align: left; color:#F63366; font-size:12px;'><b>Training Report<b></h3>",
+            unsafe_allow_html=True)
+        st.write(
+            "The Dashboard for training result on Tensorboard is [here](https://tensorboard.dev/experiment/eWhxt1i2RuaU64krYtORhw/)")
+    with open("./BSD_ja-en_val.json", encoding='utf-8') as f:
+        bsd_sample_data = json.load(f)
+    en, ja = [], []
+    for i in range(len(bsd_sample_data)):
+        for j in range(len(bsd_sample_data[i]['conversation'])):
+            en.append(bsd_sample_data[i]['conversation'][j]['en_sentence'])
+            ja.append(bsd_sample_data[i]['conversation'][j]['ja_sentence'])
+    df = pd.DataFrame.from_dict({'Japanese': ja, 'English': en})
+    gb = GridOptionsBuilder.from_dataframe(df)
+    gb.configure_pagination()
+    gb.configure_selection(selection_mode="single", use_checkbox=True, suppressRowDeselection=False)
+    gridOptions = gb.build()
+    translation_text = AgGrid(df, gridOptions=gridOptions, theme='material',
+                              enable_enterprise_modules=True,
+                              allow_unsafe_jscode=True, update_mode=GridUpdateMode.SELECTION_CHANGED)
+    if len(translation_text['selected_rows']) != 0:
+        bsd_jp = translation_text['selected_rows'][0]['Japanese']
+        st.markdown(
+            "<h2 style='text-align: left; color:#32CD32; font-size:25px;'><b>Business Scene Dialog in Japanese (日本語でのビジネスシーンダイアログ)<b></h2>",
+            unsafe_allow_html=True)
+        st.write(bsd_jp)
+        if st.button("Translate"):
+            ja_tokenizer, en_tokenizer = getJpEn_Tokenizers()
+            trained_model = loadFineTunedJaEn_NMT_Model()
+            trained_model.freeze()
+            def translate(text):
+                text_encoding = ja_tokenizer(
+                    text,
+                    max_length=100,
+                    padding="max_length",
+                    truncation=True,
+                    return_attention_mask=True,
+                    add_special_tokens=True,
+                    return_tensors='pt'
+                )
+                generated_ids = trained_model.model.generate(
+                    input_ids=text_encoding['input_ids'],
+                    attention_mask=text_encoding['attention_mask'],
+                    max_length=100,
+                    num_beams=2,
+                    repetition_penalty=2.5,
+                    length_penalty=1.0,
+                    early_stopping=True
+                )
+                preds = [en_tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True) for
+                         gen_id in generated_ids]
+                return "".join(preds)[5:]
+            st.markdown(
+                "<h2 style='text-align: left; color:#32CD32; font-size:25px;'><b>Translated Dialog in English (英語の翻訳されたダイアログ)<b></h2>",
+                unsafe_allow_html=True)
+            st.write(translate(bsd_jp))