Spaces:

citoreh
/

diacritics

Sleeping

App Files Files Community

diacritics / app.py

citoreh

Update app.py

61ff244 verified 3 months ago

raw

history blame contribute delete

23.3 kB

	import gradio as gr
	import re
	from typing import Dict, List, Tuple
	import requests
	import json

	class AdvancedPersianVowelizer:
	    """Rule-based helper that adds short-vowel marks to Persian text.

	    Each Persian word is handled by the first strategy that applies:

	    1. exact lookup in ``expanded_dictionary``;
	    2. the first matching entry of ``morphological_patterns``;
	    3. the generic consonant-pair fallback in ``_apply_phonological_rules``.

	    Punctuation, digits, Latin text and whitespace are passed through
	    unchanged. All rules are heuristics; output for words outside the
	    dictionary is approximate.
	    """

	    # Letters the heuristics treat as consonants. Note that this includes
	    # the letters waw, heh and yeh, which can also spell vowels.
	    _CONSONANTS = 'بپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی'
	    _ZWNJ = '\u200c'
	    # Arabic-script letters and combining marks only. Arabic punctuation
	    # (comma U+060C, semicolon U+061B, question mark U+061F, full stop U+06D4)
	    # and both digit ranges are left out so they never become part of a word.
	    _LETTERS = '\u0621-\u065f\u066e-\u06d3\u06d5\u06ee\u06ef\u06fa-\u06fc\u06ff'
	    # A word is a run of letters, optionally joined by zero-width non-joiners.
	    _WORD_RUN = re.compile(f'[{_LETTERS}]+(?:{_ZWNJ}+[{_LETTERS}]+)*')
	    _CONSONANT_PAIR = re.compile(f'([{_CONSONANTS}])([{_CONSONANTS}])')
	    _FINAL_CONSONANT = re.compile(f'([{_CONSONANTS}])$')
	    # Capturing groups make re.split keep the separators in its result.
	    _SENTENCE_END = re.compile(r'([.!?؟।]+)')
	    _WHITESPACE = re.compile(r'(\s+)')
	    _ARABIC_TO_PERSIAN = str.maketrans({
	        'ي': 'ی',  # Arabic yeh -> Persian yeh
	        'ك': 'ک',  # Arabic kaf -> Persian kaf
	        'ة': 'ه',  # Arabic teh marbuta -> heh
	    })

	    def __init__(self):
	        """Build the mark table, the word dictionary and the rule lists."""
	        # Persian vowel marks (diacritics)
	        self.vowel_marks: Dict[str, str] = {
	            'fatha': '\u064E',  # (a)
	            'kasra': '\u0650',  # (e/i)
	            'damma': '\u064F',  # (o/u)
	            'sukun': '\u0652',  # (no vowel)
	            'shadda': '\u0651',  # (doubling)
	            'tanwin_fath': '\u064B',
	            'tanwin_kasr': '\u064D',
	            'tanwin_damm': '\u064C',
	        }

	        # Bare word -> vocalized word. Every value must spell the same letters
	        # as its key. Sentence processing looks words up one at a time, so the
	        # multi-word keys are only reachable through direct dictionary access.
	        self.expanded_dictionary: Dict[str, str] = {
	            # Common nouns
	            'خانه': 'خانِه', 'مدرسه': 'مَدرَسِه', 'کتاب': 'کِتاب', 'دفتر': 'دَفتَر',
	            'قلم': 'قَلَم', 'میز': 'میز', 'صندلی': 'صَندَلی', 'پنجره': 'پَنجَرِه',
	            'در': 'دَر', 'دیوار': 'دیوار', 'سقف': 'سَقف', 'زمین': 'زَمین',

	            # Family members
	            'مادر': 'مادَر', 'پدر': 'پِدَر', 'برادر': 'بَرادَر', 'خواهر': 'خواهَر',
	            'پسر': 'پِسَر', 'دختر': 'دُختَر', 'همسر': 'هَمسَر', 'عمو': 'عَمو',
	            'عمه': 'عَمِّه', 'دایی': 'دایی', 'خاله': 'خالِه', 'پدربزرگ': 'پِدَربُزُرگ',
	            'مادربزرگ': 'مادَربُزُرگ',

	            # Time expressions
	            'امروز': 'اِمروز', 'دیروز': 'دیروز', 'فردا': 'فَردا', 'صبح': 'صُبح',
	            'ظهر': 'ظُهر', 'عصر': 'عَصر', 'شب': 'شَب', 'شام': 'شام',
	            'هفته': 'هَفتِه', 'ماه': 'ماه', 'سال': 'سال',

	            # Colors
	            'سفید': 'سَفید', 'سیاه': 'سیاه', 'قرمز': 'قِرمِز', 'آبی': 'آبی',
	            'سبز': 'سَبز', 'زرد': 'زَرد', 'نارنجی': 'نارَنجی', 'بنفش': 'بَنَفش',
	            'صورتی': 'صورَتی', 'خاکستری': 'خاکِستَری',

	            # Common verbs (infinitive)
	            'رفتن': 'رَفتَن', 'آمدن': 'آمَدَن', 'خوردن': 'خوردَن', 'نوشیدن': 'نوشیدَن',
	            'خوابیدن': 'خوابیدَن', 'بیدار شدن': 'بیدار شُدَن', 'نشستن': 'نِشَستَن',
	            'ایستادن': 'ایستادَن', 'دویدن': 'دَویدَن', 'راه رفتن': 'راه رَفتَن',
	            'خواندن': 'خواندَن', 'نوشتن': 'نِوِشتَن', 'گفتن': 'گُفتَن',
	            'شنیدن': 'شُنیدَن', 'دیدن': 'دیدَن', 'فهمیدن': 'فَهمیدَن',

	            # Common adjectives
	            'خوب': 'خوب', 'بد': 'بَد', 'بزرگ': 'بُزُرگ', 'کوچک': 'کوچَک',
	            'زیبا': 'زیبا', 'زشت': 'زِشت', 'تازه': 'تازِه', 'کهنه': 'کُهنِه',
	            'گرم': 'گَرم', 'سرد': 'سَرد', 'داغ': 'داغ', 'یخ': 'یَخ',
	            'شیرین': 'شیرین', 'تلخ': 'تَلخ', 'ترش': 'تُرش', 'شور': 'شور',

	            # Pronouns and common words
	            'من': 'مَن', 'تو': 'تو', 'او': 'او', 'ما': 'ما', 'شما': 'شُما', 'آنها': 'آن‌ها',
	            'این': 'این', 'آن': 'آن', 'اینجا': 'این‌جا', 'آنجا': 'آن‌جا',
	            'کجا': 'کُجا', 'چه': 'چِه', 'چرا': 'چِرا', 'چگونه': 'چِگونِه',
	            'کی': 'کَی', 'کدام': 'کُدام',

	            # Numbers
	            'یک': 'یِک', 'دو': 'دو', 'سه': 'سِه', 'چهار': 'چَهار', 'پنج': 'پَنج',
	            'شش': 'شِش', 'هفت': 'هَفت', 'هشت': 'هَشت', 'نه': 'نُه', 'ده': 'دَه',

	            # Greetings and common phrases
	            'سلام': 'سَلام', 'خداحافظ': 'خُداحافِظ', 'ممنون': 'مَمنون', 'متشکرم': 'مُتَشَکِّرَم',
	            'بخشید': 'بَخشید', 'ببخشید': 'بِبَخشید', 'خواهش می‌کنم': 'خواهِش می‌کُنَم',
	            'چطوری': 'چِطوری', 'حالتان چطور است': 'حالِتان چِطور اَست',
	        }

	        # (pattern, replacement) pairs tried in order; the FIRST match wins, so
	        # longer and more specific endings must precede the endings they
	        # contain. A replacement equal to its match means "this ending needs
	        # no extra mark" and still stops the search.
	        self.morphological_patterns: List[Tuple[str, str]] = [
	            # Compound verbs (only reachable when a multi-word string is passed)
	            (r'(\w+)\s+می[\s\u200c]*کنیم$', r'\1 می‌کُنیم'),
	            (r'(\w+)\s+می[\s\u200c]*کنید$', r'\1 می‌کُنید'),
	            (r'(\w+)\s+می[\s\u200c]*کنند$', r'\1 می‌کُنَند'),
	            (r'(\w+)\s+می[\s\u200c]*کنم$', r'\1 می‌کُنَم'),
	            (r'(\w+)\s+می[\s\u200c]*کنی$', r'\1 می‌کُنی'),
	            (r'(\w+)\s+می[\s\u200c]*کند$', r'\1 می‌کُنَد'),

	            # Past tense endings
	            (r'(\w+)یدیم$', r'\1یدیم'),  # -idim (we did)
	            (r'(\w+)یدید$', r'\1یدید'),  # -idid (you all did)
	            (r'(\w+)یدند$', r'\1یدَند'),  # -idand (they did)
	            (r'(\w+)یدم$', r'\1یدَم'),  # -idam (I did)
	            (r'(\w+)یدی$', r'\1یدی'),  # -idi (you did)
	            (r'(\w+)ید$', r'\1ید'),  # -id (he/she did)

	            # Present tense: the "mi" prefix may be attached directly or with
	            # a zero-width non-joiner; group 1 keeps whichever form was used.
	            (r'^(می\u200c?)(\w+)یم$', r'\1\2یم'),  # mi-...im (we do)
	            (r'^(می\u200c?)(\w+)ید$', r'\1\2ید'),  # mi-...id (you all do)
	            (r'^(می\u200c?)(\w+)ند$', r'\1\2َند'),  # mi-...and (they do)
	            (r'^(می\u200c?)(\w+)م$', r'\1\2َم'),  # mi-...am (I do)
	            (r'^(می\u200c?)(\w+)ی$', r'\1\2ی'),  # mi-...i (you do)
	            (r'^(می\u200c?)(\w+)د$', r'\1\2َد'),  # mi-...ad (he/she does)

	            # Plural possessives come before the plural "-an" they end with
	            (r'(\w+)مان$', r'\1ِمان'),  # our
	            (r'(\w+)تان$', r'\1ِتان'),  # your (plural)
	            (r'(\w+)شان$', r'\1ِشان'),  # their

	            # Plural endings
	            (r'(\w+)ها$', r'\1‌ha'.replace('ha', 'ها')),  # -ha plural (inserts ZWNJ)
	            (r'(\w+)ان$', r'\1ان'),  # -an plural
	            (r'(\w+)ات$', r'\1ات'),  # -at plural

	            # Singular possessives
	            (r'(\w+)م$', r'\1َم'),  # my
	            (r'(\w+)ت$', r'\1َت'),  # your
	            (r'(\w+)ش$', r'\1َش'),  # his/her
	        ]

	        # Declarative context rules. Nothing in this class applies them yet;
	        # they are kept as data for a future context pass.
	        self.context_rules = [
	            # Prepositions
	            (r'^(از|به|با|در|تا|برای|روی|زیر|کنار)$', r'\1'),

	            # Common prefixes
	            (r'^(ناخوش)', r'ناخوش'),
	            (r'^(بی)', r'بی'),
	            (r'^(هم)', r'هَم'),

	            # Ezafe construction
	            (r'(\w+)\s+(\w+)', self._handle_ezafe),
	        ]

	    def _handle_ezafe(self, match) -> str:
	        """Join two matched words, adding an ezafe kasra to the first.

	        ``match`` must expose the two words as groups 1 and 2. The kasra is
	        skipped when the first word ends in heh or waw.
	        """
	        word1, word2 = match.groups()
	        if word1[-1] in ('ه', 'و'):
	            return f"{word1} {word2}"
	        return f"{word1}ِ {word2}"

	    def _get_word_stress_pattern(self, word: str) -> str:
	        """Insert a vowel mark between every pair of adjacent consonants.

	        Pairs that start in the first half of the word receive a fatha, later
	        pairs a kasra. Words of one or two characters are returned unchanged.
	        """
	        if len(word) <= 2:
	            return word

	        midpoint = len(word) // 2
	        last_index = len(word) - 1
	        pieces = []
	        for index, char in enumerate(word):
	            pieces.append(char)
	            if index == last_index:
	                continue
	            if self._is_consonant(char) and self._is_consonant(word[index + 1]):
	                mark = 'fatha' if index < midpoint else 'kasra'
	                pieces.append(self.vowel_marks[mark])
	        return ''.join(pieces)

	    def _is_consonant(self, char: str) -> bool:
	        """Return True when ``char`` is exactly one letter of the consonant set."""
	        # The length check matters: the empty string is "in" every string.
	        return len(char) == 1 and char in self._CONSONANTS

	    def _apply_phonological_rules(self, word: str) -> str:
	        """Fallback marking for words no dictionary entry or pattern covers.

	        A fatha is inserted inside each non-overlapping pair of adjacent
	        consonants (scanning left to right), then a sukun is appended when the
	        word ends in a consonant. Long-vowel letters are left as they are.
	        """
	        fatha = self.vowel_marks['fatha']
	        sukun = self.vowel_marks['sukun']
	        marked = self._CONSONANT_PAIR.sub(rf'\1{fatha}\2', word)
	        return self._FINAL_CONSONANT.sub(rf'\1{sukun}', marked)

	    def add_vowel_marks_advanced(self, text: str) -> str:
	        """Return ``text`` with vowel marks added to its Persian words.

	        The text is normalized, split at sentence punctuation and vocalized
	        sentence by sentence. The original punctuation and whitespace are kept
	        exactly as typed. Blank input is returned unchanged.
	        """
	        if not text.strip():
	            return text

	        text = self._normalize_text(text)

	        # With a capturing group, re.split alternates
	        # text, separator, text, separator, ... so odd slots are punctuation.
	        rebuilt = []
	        for index, piece in enumerate(self._SENTENCE_END.split(text)):
	            is_separator = index % 2 == 1
	            if is_separator or not piece.strip():
	                rebuilt.append(piece)
	            else:
	                rebuilt.append(self._vocalize_sentence(piece))
	        return ''.join(rebuilt)

	    def _normalize_text(self, text: str) -> str:
	        """Replace Arabic yeh, kaf and teh marbuta with their Persian forms."""
	        return text.translate(self._ARABIC_TO_PERSIAN)

	    def _vocalize_sentence(self, sentence: str) -> str:
	        """Vocalize every Persian word in ``sentence``, keeping its spacing."""
	        words = sentence.split()
	        position = 0
	        pieces = []
	        for chunk in self._WHITESPACE.split(sentence):
	            if not chunk or chunk.isspace():
	                pieces.append(chunk)
	                continue
	            # Only the Persian letter runs inside the token are rewritten, so
	            # attached punctuation or Latin text survives untouched.
	            vocalized = self._WORD_RUN.sub(
	                lambda found: self._vocalize_word(found.group()), chunk
	            )
	            pieces.append(self._apply_context_rules(vocalized, position, words))
	            position += 1
	        return ''.join(pieces)

	    def _vocalize_word(self, word: str) -> str:
	        """Vocalize one bare Persian word (letters and ZWNJ only)."""
	        # Strategy 1: dictionary, first as typed, then without ZWNJ.
	        for candidate in (word, word.replace(self._ZWNJ, '')):
	            if candidate in self.expanded_dictionary:
	                return self.expanded_dictionary[candidate]

	        # Strategy 2: morphology. A match is final even if it added no mark.
	        rewritten, matched = self._match_morphological_pattern(word)
	        if matched:
	            return rewritten

	        # Strategy 3: generic fallback.
	        return self._apply_phonological_rules(word)

	    def _match_morphological_pattern(self, word: str) -> Tuple[str, bool]:
	        """Apply the first matching pattern; return (result, whether matched)."""
	        for pattern, replacement in self.morphological_patterns:
	            rewritten, hits = re.subn(pattern, replacement, word)
	            if hits:
	                return rewritten, True
	        return word, False

	    def _apply_morphological_patterns(self, word: str) -> str:
	        """Return ``word`` rewritten by the first matching morphological pattern.

	        The word is returned unchanged when no pattern matches.
	        """
	        return self._match_morphological_pattern(word)[0]

	    def _apply_context_rules(self, word: str, position: int, sentence_words: List[str]) -> str:
	        """Hook for rules that depend on neighbouring words.

	        No such rule is implemented yet, so ``word`` is returned unchanged.
	        ``position`` is the index of the word inside ``sentence_words``.
	        """
	        return word

	# Integration with Hugging Face models (optional)
	def try_huggingface_model(text: str) -> str:
	    """Placeholder for a pre-trained Hugging Face diacritization model.

	    No model is wired in yet, so ``text`` is returned unchanged. Callers
	    treat an unchanged result as "no model output available".
	    """
	    # Candidate models named by the original author for a future integration:
	    # - HooshvareLab/bert-base-parsbert-uncased
	    # - m3hrdadfi/albert-base-fa
	    return text

	def vocalize_persian_text_advanced(text: str, use_hf_model: bool = False) -> str:
	    """Add vowel marks to Persian ``text`` and return the result.

	    Args:
	        text: Raw Persian text. ``None``, empty or whitespace-only input
	            yields a Persian "please enter some text" message.
	        use_hf_model: When true, the Hugging Face hook is tried first and
	            its output is used if it differs from the input.

	    Returns:
	        The vocalized text, or the prompt message for blank input.
	    """
	    if not text or not text.strip():
	        return "لطفاً متنی وارد کنید."

	    # Strategy 1: optional model. Any failure falls back to the rules below,
	    # but it is reported instead of being silently discarded.
	    if use_hf_model:
	        try:
	            result = try_huggingface_model(text)
	        except Exception as exc:
	            print(f"HF Model error: {exc}")
	        else:
	            if result != text:  # the model actually produced a vocalization
	                return result

	    # Strategy 2: rule-based approach
	    return AdvancedPersianVowelizer().add_vowel_marks_advanced(text)

	# Enhanced Custom CSS
	# Stylesheet passed to gr.Blocks: right-to-left text boxes, the highlighted
	# output box, the page width, the title banner and the info panel.
	# NOTE(review): the @import URL had been replaced by an e-mail-obfuscation
	# placeholder; it is restored to the vazir-font jsDelivr path. The version
	# tag (v30.1.0) is reconstructed - confirm it against the deployed app.
	custom_css = """
	@import url('https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font@v30.1.0/dist/font-face.css');

	.rtl-text {
	direction: rtl !important;
	text-align: right !important;
	font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important;
	font-size: 18px !important;
	line-height: 1.8 !important;
	}

	.rtl-text textarea {
	direction: rtl !important;
	text-align: right !important;
	font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important;
	font-size: 18px !important;
	line-height: 1.8 !important;
	padding: 15px !important;
	}

	.output-box {
	direction: rtl !important;
	text-align: right !important;
	font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important;
	font-size: 20px !important;
	background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%) !important;
	padding: 20px !important;
	border-radius: 12px !important;
	border: 2px solid #28a745 !important;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important;
	min-height: 120px !important;
	line-height: 2 !important;
	}

	.gradio-container {
	max-width: 1200px !important;
	margin: auto !important;
	}

	.title-section {
	text-align: center !important;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
	color: white !important;
	padding: 30px !important;
	border-radius: 15px !important;
	margin-bottom: 30px !important;
	box-shadow: 0 8px 15px rgba(0, 0, 0, 0.1) !important;
	}

	.improvement-info {
	background-color: #e3f2fd !important;
	border: 1px solid #2196f3 !important;
	border-radius: 10px !important;
	padding: 20px !important;
	margin: 20px 0 !important;
	}
	"""

	# Create advanced Gradio interface
	# Page layout: title banner, an input column (text box, buttons, settings)
	# beside an output column, then an info panel, examples and a guide.
	# NOTE(review): nesting was reconstructed from the layout calls because the
	# source had lost its indentation - confirm against the deployed app.
	with gr.Blocks(css=custom_css, title="Advanced Persian Vowel Marks Generator", theme=gr.themes.Soft()) as demo:
	    gr.HTML("""
	<div class="title-section">
	<h1>🎯 سیستم پیشرفته افزودن اعراب به متن فارسی</h1>
	<h2>Advanced Persian Vowel Marks Generator</h2>
	<p>ویژگی‌های جدید: واژگان گسترده، تحلیل صرفی، قوانین آوایی</p>
	</div>
	""")

	    with gr.Row():
	        with gr.Column(scale=1):
	            input_text = gr.Textbox(
	                label="📝 متن فارسی (Persian Text Input)",
	                placeholder="متن فارسی خود را اینجا وارد کنید...\nمثال: امروز به مدرسه رفتم و کتاب خریدم",
	                lines=6,
	                elem_classes=["rtl-text"]
	            )

	            with gr.Row():
	                submit_btn = gr.Button("🎯 افزودن اعراب پیشرفته", variant="primary", size="lg")
	                clear_btn = gr.Button("🗑️ پاک کردن", variant="secondary")

	            # Advanced options
	            with gr.Accordion("⚙️ تنظیمات پیشرفته (Advanced Settings)", open=False):
	                use_hf_model = gr.Checkbox(
	                    label="استفاده از مدل هوش مصنوعی (Use AI Model)",
	                    value=False,
	                    info="در صورت در دسترس بودن از مدل‌های پیش‌آموزش یافته استفاده کند"
	                )

	        with gr.Column(scale=1):
	            output_text = gr.Textbox(
	                label="✨ متن با اعراب (Text with Vowel Marks)",
	                lines=6,
	                elem_classes=["rtl-text", "output-box"],
	                interactive=False
	            )

	    # Improvements info
	    gr.HTML("""
	<div class="improvement-info">
	<h3>🚀 بهبودهای اعمال شده:</h3>
	<div style="direction: rtl; text-align: right;">
	<ul>
	<li><strong>واژگان گسترده:</strong> بیش از 200 کلمه رایج با اعراب صحیح</li>
	<li><strong>تحلیل صرفی:</strong> تشخیص انواع فعل، اسم، صفت و ضمایر</li>
	<li><strong>قوانین آوایی:</strong> اعمال قوانین تکیه و آواشناسی فارسی</li>
	<li><strong>تحلیل ساختاری:</strong> تشخیص اضافه و ترکیبات</li>
	<li><strong>نرمال‌سازی متن:</strong> تبدیل حروف عربی به فارسی</li>
	<li><strong>پردازش زمینه‌ای:</strong> در نظر گیری کلمات مجاور</li>
	</ul>
	</div>
	</div>
	""")

	    # Enhanced examples
	    enhanced_examples = [
	        "امروز صبح به مدرسه رفتم و کتاب جدیدی خریدم",
	        "مادرم برای من غذای خوشمزه درست کرد",
	        "فردا با دوستانم به پارک خواهیم رفت",
	        "استاد درس جالبی در کلاس تدریس کرد",
	        "کتابخانه شهر کتاب‌های زیادی دارد",
	        "بچه‌ها در حیاط مدرسه بازی می‌کنند",
	        "هوا سرد است و برف می‌بارد"
	    ]

	    gr.HTML("<h3 style='text-align: center; margin-top: 30px;'>🔸 نمونه متن‌های پیشرفته</h3>")

	    # Examples always use the rule-based path (model flag forced to False).
	    examples = gr.Examples(
	        examples=enhanced_examples,
	        inputs=input_text,
	        outputs=output_text,
	        fn=lambda x: vocalize_persian_text_advanced(x, False),
	        cache_examples=False,
	        label="نمونه‌های پیشرفته"
	    )

	    # Detailed guide
	    with gr.Accordion("📚 راهنمای جامع (Comprehensive Guide)", open=False):
	        gr.HTML("""
	<div style="direction: rtl; text-align: right; font-family: Vazir, Tahoma; line-height: 1.8;">
	<h4>🔹 ویژگی‌های جدید:</h4>
	<ul>
	<li><strong>تحلیل صرفی:</strong> تشخیص دقیق انواع کلمات و صرف آنها</li>
	<li><strong>قوانین آوایی:</strong> اعمال قوانین تکیه و تغییرات آوایی</li>
	<li><strong>پردازش زمینه‌ای:</strong> در نظر گیری کلمات قبل و بعد</li>
	<li><strong>واژگان گسترده:</strong> پوشش کلمات رایج و تخصصی</li>
	</ul>

	<h4>🔹 روش‌های بهبود دقت:</h4>
	<ol>
	<li><strong>استفاده از مدل‌های پیش‌آموزش یافته:</strong> BERT-fa, ParsBERT</li>
	<li><strong>یادگیری عمیق:</strong> شبکه‌های عصبی برای تشخیص الگو</li>
	<li><strong>پردازش زبان طبیعی:</strong> تحلیل نحوی و معنایی</li>
	<li><strong>داده‌های بیشتر:</strong> آموزش روی متون بزرگ</li>
	<li><strong>بازخورد کاربر:</strong> تصحیح مداوم بر اساس خطاها</li>
	</ol>

	<h4>🔹 نکات مهم:</h4>
	<ul>
	<li>این سیستم بهبود یافته دقت بالاتری دارد</li>
	<li>برای متون تخصصی ممکن است نیاز به تنظیم باشد</li>
	<li>اعراب اضافه شده بر اساس استانداردهای فارسی است</li>
	<li>برای بهبود بیشتر می‌توان از مدل‌های هوش مصنوعی استفاده کرد</li>
	</ul>

	<h4>🔹 پیشنهادات برای توسعه:</h4>
	<ul>
	<li>اتصال به مدل‌های Transformer پیش‌آموزش یافته</li>
	<li>ایجاد API برای استفاده در برنامه‌های دیگر</li>
	<li>افزودن قابلیت تشخیص و تصحیح خطاهای املایی</li>
	<li>پشتیبانی از انواع مختلف متن (شعر، نثر، خبر)</li>
	</ul>
	</div>
	""")

	    # Event handlers. The handler takes (text, use_hf_model) in the same
	    # order as the inputs list, so it is passed directly without a wrapper.
	    submit_btn.click(
	        fn=vocalize_persian_text_advanced,
	        inputs=[input_text, use_hf_model],
	        outputs=output_text
	    )

	    # Reset both text boxes.
	    clear_btn.click(
	        fn=lambda: ("", ""),
	        outputs=[input_text, output_text]
	    )

	    # Pressing Enter in the input box behaves like the submit button.
	    input_text.submit(
	        fn=vocalize_persian_text_advanced,
	        inputs=[input_text, use_hf_model],
	        outputs=output_text
	    )

	# Launch configuration
	if __name__ == "__main__":
	    # Start the web server only when the file is run as a script: listen on
	    # every interface at port 7860, with sharing and debug mode switched on.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	        "debug": True,
	    }
	    demo.launch(**launch_options)