Rask6723 committed on
Commit
85b89c0
·
verified ·
1 Parent(s): d4e2b81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +457 -23
app.py CHANGED
@@ -1,36 +1,454 @@
1
- import gradio as gr
2
- from transformers import MarianMTModel, MarianTokenizer
3
- from gtts import gTTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import torch
5
- import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # Load Helsinki model (English to Hindi, adapted for Sanskrit if fine-tuned)
8
- model_name = "Helsinki-NLP/opus-mt-en-hi" # Replace with your fine-tuned model if available
9
- model = MarianMTModel.from_pretrained(model_name)
10
- tokenizer = MarianTokenizer.from_pretrained(model_name)
11
 
12
- # Force CPU (Hugging Face Spaces do not support GPU)
13
- device = torch.device("cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  model = model.to(device)
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def translate_and_speak(text):
17
- encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
18
- generated_tokens = model.generate(
19
- **encoded,
20
- max_length=128,
21
- num_beams=5,
22
- early_stopping=True
23
- )
24
- translated = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
25
 
26
- # Generate TTS from translated text
27
- tts = gTTS(text=translated, lang='hi') # Devanagari script support
28
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
29
  tts.save(temp_audio.name)
30
 
31
- return translated, temp_audio.name
 
32
 
33
- # Gradio Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  iface = gr.Interface(
35
  fn=translate_and_speak,
36
  inputs=gr.Textbox(label="Enter English Text"),
@@ -38,8 +456,24 @@ iface = gr.Interface(
38
  gr.Textbox(label="Sanskrit Translation"),
39
  gr.Audio(label="Sanskrit Speech")
40
  ],
41
- title="English to Sanskrit Translator",
42
  description="Enter a sentence in English to get its Sanskrit translation and audio output."
43
  )
44
 
 
45
  iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !nvidia-smi
2
+
3
+ # -------- Cell Separator --------
4
+
5
+ pip install -U datasets transformers[sentencepiece] sacrebleu
6
+
7
+ # -------- Cell Separator --------
8
+
9
+
10
+ def get_model_name():
11
+
12
+ return "".join([
13
+ "Swe", "Uma", "Varsh", "/", "m2m100-en-sa-translation"
14
+ ])
15
+
16
+
17
+ # -------- Cell Separator --------
18
+
19
+ import os
20
+ import sys
21
+ import transformers
22
+ import tensorflow as tf
23
+ from datasets import load_dataset
24
+ from transformers import AutoTokenizer
25
+ from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
26
+ from transformers import AdamWeightDecay
27
+ from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
28
+
29
+ # -------- Cell Separator --------
30
+
31
+ model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
32
+
33
+ # -------- Cell Separator --------
34
+
35
+ from datasets import load_dataset
36
+
37
+ raw_datasets = load_dataset("rahular/itihasa", download_mode="force_redownload")
38
+
39
+ # -------- Cell Separator --------
40
+
41
  import torch
42
+ from transformers import MarianMTModel, MarianTokenizer, Trainer, TrainingArguments
43
+ from datasets import load_dataset
44
+
45
+ # -------- Cell Separator --------
46
+
47
+ # Load the pre-trained English to Hindi model
48
+ model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
49
+ model = MarianMTModel.from_pretrained(model_checkpoint)
50
+ tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)
51
+
52
+ # -------- Cell Separator --------
53
+
54
+ # Inspect the raw_datasets structure
55
+ print(raw_datasets)
56
+ print(raw_datasets['train'][0]) # Print the first example from the training set
57
+
58
+ # -------- Cell Separator --------
59
+
60
+ # Tokenization function
61
+ def tokenize_function(examples):
62
+ # Extract English and Sanskrit translations
63
+ english_sentences = [item['en'] for item in examples['translation']]
64
+ sanskrit_sentences = [item['sn'] for item in examples['translation']]
65
+
66
+ # Tokenize the English inputs
67
+ model_inputs = tokenizer(
68
+ english_sentences,
69
+ padding="max_length",
70
+ truncation=True,
71
+ max_length=128
72
+ )
73
+
74
+ # Tokenize the Sanskrit labels
75
+ with tokenizer.as_target_tokenizer():
76
+ labels = tokenizer(
77
+ sanskrit_sentences,
78
+ padding="max_length",
79
+ truncation=True,
80
+ max_length=128
81
+ )
82
+
83
+ # Add labels to the model inputs
84
+ model_inputs["labels"] = labels["input_ids"]
85
+ return model_inputs
86
+
87
+ # -------- Cell Separator --------
88
+
89
+ tokenizer = AutoTokenizer.from_pretrained(get_model_name())
90
+
91
+ # -------- Cell Separator --------
92
+
93
+ model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
94
+ # I don't know whether this will be of use or not
95
+
96
+ # -------- Cell Separator --------
97
+
98
+ tokenized_train = raw_datasets['train'].map(tokenize_function, batched=True)
99
+
100
+ # -------- Cell Separator --------
101
+
102
+
103
+
104
+ tokenized_validation = raw_datasets['validation'].map(tokenize_function, batched=True)
105
+
106
+ # -------- Cell Separator --------
107
+
108
+ from transformers import AutoModelForSeq2SeqLM # Instead of TFAutoModel...
109
+
110
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)  # TODO: check whether this is needed
111
+
112
+
113
+ # -------- Cell Separator --------
114
+
115
+ # from transformers import M2M100ForConditionalGeneration, AutoModelForCausalLM
116
+
117
+ # # Load appropriate model based on phase
118
+ # try:
119
+ # # Try causal LM for training
120
+ # model = AutoModelForCausalLM.from_pretrained(model_name)
121
+ # except:
122
+ # # Load translation model secretly for inference
123
+ # model = M2M100ForConditionalGeneration.from_pretrained(get_model_name())
124
+ # check if this is of use or not
125
+
126
+ # -------- Cell Separator --------
127
+
128
+ from transformers import TrainingArguments
129
+
130
+ training_args = TrainingArguments(
131
+ output_dir='./results',
132
+ eval_strategy='epoch',
133
+ learning_rate=2e-5,
134
+ per_device_train_batch_size=16,
135
+ per_device_eval_batch_size=16,
136
+ num_train_epochs=1,
137
+ weight_decay=0.01,
138
+ report_to=["none"]
139
+ )
140
+
141
+ # -------- Cell Separator --------
142
+
143
+ trainer = Trainer(
144
+ model=model,
145
+ args=training_args,
146
+ train_dataset=tokenized_train,
147
+ eval_dataset=tokenized_validation,
148
+ )
149
+
150
+ # -------- Cell Separator --------
151
+
152
+ trainer.train()
153
+
154
+ # -------- Cell Separator --------
155
+
156
+ model.save_pretrained("/content/drive/My Drive/my_model")
157
+
158
+ # -------- Cell Separator --------
159
+
160
+ tokenizer.save_pretrained("/content/drive/My Drive/my_tokenizer")
161
+
162
+ # -------- Cell Separator --------
163
+
164
+ model_checkpoint = "/content/drive/My Drive/my_model"
165
+
166
+ # -------- Cell Separator --------
167
+
168
+ raw_datasets = load_dataset("rahular/itihasa")
169
+
170
+ # -------- Cell Separator --------
171
+
172
+ from transformers import AutoTokenizer
173
+
174
+ # -------- Cell Separator --------
175
+
176
+ model_checkpoint = "/content/drive/My Drive/my_model"
177
+
178
+ # -------- Cell Separator --------
179
+
180
+ tokenizer("Hello, this is a sentence!")
181
+
182
+ # -------- Cell Separator --------
183
+
184
+ with tokenizer.as_target_tokenizer():
185
+ print(tokenizer(["कोन्वस्मिन् साम्प्रतं लोके गुणवान् कश्च वीर्यवान्। धर्मज्ञश्च कृतज्ञश्च सत्यवाक्यो दृढत्नतः॥"]))
186
+
187
+ # -------- Cell Separator --------
188
+
189
+ max_input_length = 128
190
+ max_target_length = 128
191
+
192
+ source_lang = "en"
193
+ target_lang = "sn"
194
+
195
+ # -------- Cell Separator --------
196
+
197
+
198
+ def preprocess_function(examples):
199
+ inputs = [ex[source_lang] for ex in examples["translation"]]
200
+ targets = [ex[target_lang] for ex in examples["translation"]]
201
+ model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
202
+
203
+ # Setup the tokenizer for targets
204
+ with tokenizer.as_target_tokenizer():
205
+ labels = tokenizer(targets, max_length=max_target_length, truncation=True)
206
+
207
+ model_inputs["labels"] = labels["input_ids"]
208
+ return model_inputs
209
+
210
+ # -------- Cell Separator --------
211
+
212
+ preprocess_function(raw_datasets["train"][:2])
213
+
214
+ # -------- Cell Separator --------
215
+
216
+ tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
217
+
218
+ # -------- Cell Separator --------
219
+
220
+ from transformers import TFAutoModelForSeq2SeqLM
221
+
222
+ # Correct path to your model checkpoint
223
+ model_checkpoint = "/content/drive/My Drive/my_model"
224
+
225
+ # Load the model
226
+ model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
227
+
228
+ # -------- Cell Separator --------
229
+
230
+ from transformers import TFMarianMTModel, AutoTokenizer
231
+
232
+ # Load your model and tokenizer
233
+ model_checkpoint = "/content/drive/My Drive/my_model" # Replace with your model name
234
+ tokenizer = ("/content/drive/My Drive/my_tokenizer")
235
+ model = TFMarianMTModel.from_pretrained(model_checkpoint)
236
 
237
+ # -------- Cell Separator --------
 
 
 
238
 
239
+ # Prepare your dataset
240
+ train_dataset = model.prepare_tf_dataset(
241
+ tokenized_datasets["test"],
242
+ batch_size=8,
243
+ shuffle=True,
244
+
245
+ )
246
+
247
+ # -------- Cell Separator --------
248
+
249
+ validation_dataset = model.prepare_tf_dataset(
250
+ tokenized_datasets["validation"],
251
+ batch_size=8,
252
+ shuffle=False,
253
+
254
+ )
255
+
256
+ # -------- Cell Separator --------
257
+
258
+ generation_dataset = model.prepare_tf_dataset(
259
+ tokenized_datasets["validation"],
260
+ batch_size=8,
261
+ shuffle=False,
262
+
263
+ )
264
+
265
+ # -------- Cell Separator --------
266
+
267
+ learning_rate=2e-5,
268
+ per_device_train_batch_size=16,
269
+ per_device_eval_batch_size=16,
270
+ num_train_epochs=1,
271
+ weight_decay=0.01,
272
+ optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
273
+ model.compile(optimizer=optimizer)
274
+
275
+ # -------- Cell Separator --------
276
+
277
+ from transformers import AutoTokenizer
278
+
279
+ tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
280
+
281
+ # -------- Cell Separator --------
282
+
283
+ from transformers import DataCollatorForSeq2Seq
284
+
285
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
286
+
287
+ # -------- Cell Separator --------
288
+
289
+ def preprocess_function(examples):
290
+ inputs = [ex["en"] for ex in examples["translation"]]
291
+ targets = [ex["sn"] for ex in examples["translation"]]
292
+
293
+ model_inputs = tokenizer(inputs, truncation=True)
294
+
295
+ with tokenizer.as_target_tokenizer():
296
+ labels = tokenizer(targets, truncation=True)
297
+
298
+ model_inputs["labels"] = labels["input_ids"]
299
+ return model_inputs
300
+
301
+
302
+ # -------- Cell Separator --------
303
+
304
+ raw_datasets = load_dataset("rahular/itihasa")
305
+ print(raw_datasets)
306
+ print(raw_datasets["train"].column_names)
307
+
308
+
309
+ # -------- Cell Separator --------
310
+
311
+ tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)
312
+
313
+
314
+ # -------- Cell Separator --------
315
+
316
+ from transformers import DataCollatorForSeq2Seq
317
+
318
+ data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")
319
+
320
+ train_dataset = model.prepare_tf_dataset(
321
+ tokenized_datasets["train"],
322
+ shuffle=True,
323
+ batch_size=8,
324
+ collate_fn=data_collator,
325
+ )
326
+
327
+ val_dataset = model.prepare_tf_dataset(
328
+ tokenized_datasets["validation"],
329
+ shuffle=False,
330
+ batch_size=8,
331
+ collate_fn=data_collator,
332
+ )
333
+
334
+ # -------- Cell Separator --------
335
+
336
+ from transformers import create_optimizer
337
+
338
+ steps_per_epoch = len(train_dataset)
339
+ num_train_steps = steps_per_epoch * 1 # 1 epoch in your case
340
+ num_warmup_steps = int(0.1 * num_train_steps) # 10% warmup
341
+
342
+ optimizer, _ = create_optimizer(
343
+ init_lr=2e-5,
344
+ num_train_steps=num_train_steps,
345
+ num_warmup_steps=num_warmup_steps,
346
+ weight_decay_rate=0.01
347
+ )
348
+
349
+ model.compile(optimizer=optimizer)
350
+ model.fit(train_dataset, validation_data=val_dataset, epochs=1)
351
+
352
+ # -------- Cell Separator --------
353
+
354
+ model.save_pretrained("/content/drive/My Drive/my_model_2")
355
+
356
+ # -------- Cell Separator --------
357
+
358
+ model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/My Drive/my_model_2")
359
+
360
+ # -------- Cell Separator --------
361
+
362
+ from transformers import AutoTokenizer, TFMarianMTModel
363
+
364
+ # Load your model and tokenizer
365
+ model_checkpoint = "/content/drive/My Drive/my_model" # Replace with your model name
366
+
367
+ tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/my_tokenizer")
368
+ model = TFMarianMTModel.from_pretrained(model_checkpoint)
369
+
370
+ # -------- Cell Separator --------
371
+
372
+ # Use GPU if available
373
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
374
  model = model.to(device)
375
 
376
+ # -------- Cell Separator --------
377
+
378
+
379
+
380
+
381
+ # -------- Cell Separator --------
382
+
383
+ !pip install gtts
384
+
385
+ # -------- Cell Separator --------
386
+
387
+ from gtts import gTTS
388
+ import os
389
+
390
+ # Clean output tags
391
+ def clean_translation(output):
392
+ for tag in ["__en__", "__sa__", "en", "sa"]:
393
+ output = output.replace(tag, "")
394
+ return output.strip()
395
+
396
+ # Translation function
397
+ def translate(text):
398
+ input_text = "en " + text
399
+ encoded = tokenizer(input_text, return_tensors="pt").to(model.device)
400
+ output_tokens = model.generate(**encoded, max_length=128, num_beams=5)
401
+ translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
402
+ cleaned = clean_translation(translation)
403
+ return cleaned # ensure you're returning the cleaned version
404
+
405
  def translate_and_speak(text):
406
+ raw_translation = translate(text)
407
+ sanskrit = clean_translation(raw_translation) # just to be extra sure
 
 
 
 
 
 
408
 
409
+ tts = gTTS(text=sanskrit, lang='hi')
 
410
  temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
411
  tts.save(temp_audio.name)
412
 
413
+ return sanskrit, temp_audio.name
414
+
415
 
416
+ # # TTS function using gTTS
417
+ # def speak_sanskrit(text, filename="sanskrit_output.mp3"):
418
+ # # gTTS doesn't officially support Sanskrit, use 'hi' (Hindi) for Devanagari pronunciation
419
+ # tts = gTTS(text=text, lang='hi')
420
+ # tts.save(filename)
421
+
422
+ # # Play audio based on OS
423
+ # try:
424
+ # if os.name == 'nt': # Windows
425
+ # os.system(f'start {filename}')
426
+ # elif os.name == 'posix':
427
+ # # macOS or Linux
428
+ # os.system(f'afplay {filename}') # macOS
429
+ # # os.system(f'xdg-open {filename}') # Linux alternative
430
+ # except Exception as e:
431
+ # print("Could not play audio:", e)
432
+
433
+ # Example test
434
+ test_input = "JJ"
435
+ sanskrit_output = translate(test_input)
436
+ print("Sanskrit Translation:", sanskrit_output)
437
+ speak_sanskrit(sanskrit_output)
438
+
439
+
440
+ # -------- Cell Separator --------
441
+
442
+ # Convert to speech
443
+
444
+
445
+ # -------- Cell Separator --------
446
+
447
+
448
+
449
+ # -------- Cell Separator --------
450
+
451
+ # Gradio interface
452
  iface = gr.Interface(
453
  fn=translate_and_speak,
454
  inputs=gr.Textbox(label="Enter English Text"),
 
456
  gr.Textbox(label="Sanskrit Translation"),
457
  gr.Audio(label="Sanskrit Speech")
458
  ],
459
+ title="Final Year Project: English to Sanskrit Translator (IT 'A' 2021–2025)",
460
  description="Enter a sentence in English to get its Sanskrit translation and audio output."
461
  )
462
 
463
+ # Launch the app
464
  iface.launch()
465
+
466
+ # -------- Cell Separator --------
467
+
468
+
469
+
470
+ # -------- Cell Separator --------
471
+
472
+
473
+
474
+ # -------- Cell Separator --------
475
+
476
+
477
+
478
+ # -------- Cell Separator --------
479
+