import gradio as gr import re from typing import Dict, List, Tuple import requests import json class AdvancedPersianVowelizer: """ Advanced Persian vowel mark system with multiple improvement strategies. """ def __init__(self): # Persian vowel marks (diacritics) self.vowel_marks = { 'fatha': '\u064E', # َ (a) 'kasra': '\u0650', # ِ (e/i) 'damma': '\u064F', # ُ (o/u) 'sukun': '\u0652', # ْ (silence) 'shadda': '\u0651', # ّ (doubling) 'tanwin_fath': '\u064B', # ً 'tanwin_kasr': '\u064D', # ٍ 'tanwin_damm': '\u064C', # ٌ } # Expanded dictionary with more accurate vocalization self.expanded_dictionary = { # Common nouns with accurate vocalization 'خانه': 'خانِه', 'مدرسه': 'مَدرَسِه', 'کتاب': 'کِتاب', 'دفتر': 'دَفتَر', 'قلم': 'قَلَم', 'میز': 'میز', 'صندلی': 'صَندَلی', 'پنجره': 'پَنجَرِه', 'در': 'دَر', 'دیوار': 'دیوار', 'سقف': 'سَقف', 'زمین': 'زَمین', # Family members 'مادر': 'مادَر', 'پدر': 'پِدَر', 'برادر': 'بَرادَر', 'خواهر': 'خواهَر', 'پسر': 'پِسَر', 'دختر': 'دُختَر', 'همسر': 'هَمسَر', 'عمو': 'عَمو', 'عمه': 'عَمِّه', 'دایی': 'دایی', 'خاله': 'خالِه', 'پدربزرگ': 'پِدَربُزُرگ', 'مادربزرگ': 'مادَربُزُرگ', # Time expressions 'امروز': 'اِمروز', 'دیروز': 'دیروز', 'فردا': 'فَردا', 'صبح': 'صُبح', 'ظهر': 'ظُهر', 'عصر': 'عَصر', 'شب': 'شَب', 'شام': 'شام', 'هفته': 'هَفتِه', 'ماه': 'ماه', 'سال': 'سال', # Colors 'سفید': 'سَفید', 'سیاه': 'سیاه', 'قرمز': 'قِرمِز', 'آبی': 'آبی', 'سبز': 'سَبز', 'زرد': 'زَرد', 'نارنجی': 'نارَنجی', 'بنفش': 'بَنَفش', 'صورتی': 'صورَتی', 'خاکستری': 'خاکِستَری', # Common verbs (infinitive) 'رفتن': 'رَفتَن', 'آمدن': 'آمَدَن', 'خوردن': 'خوردَن', 'نوشیدن': 'نوشیدَن', 'خوابیدن': 'خوابیدَن', 'بیدار شدن': 'بیدار شُدَن', 'نشستن': 'نِشَستَن', 'ایستادن': 'ایستادَن', 'دویدن': 'دَویدَن', 'راه رفتن': 'راه رَفتَن', 'خواندن': 'خواندَن', 'نوشتن': 'نَویسیدَن', 'گفتن': 'گُفتَن', 'شنیدن': 'شُنیدَن', 'دیدن': 'دیدَن', 'فهمیدن': 'فَهمیدَن', # Common adjectives 'خوب': 'خوب', 'بد': 'بَد', 'بزرگ': 'بُزُرگ', 'کوچک': 'کوچَک', 'زیبا': 'زیبا', 'زشت': 'زِشت', 'تازه': 'تازِه', 'کهنه': 'کُهنِه', 'گرم': 'گَرم', 'سرد': 'سَرد', 'داغ': 'داغ', 'یخ': 'یَخ', 'شیرین': 'شیرین', 'تلخ': 'تَلخ', 'ترش': 'تُرش', 'شور': 'شور', # Pronouns and common words 'من': 'مَن', 'تو': 'تو', 'او': 'او', 'ما': 'ما', 'شما': 'شُما', 'آنها': 'آنها', 'این': 'این', 'آن': 'آن', 'اینجا': 'اینجا', 'آنجا': 'آنجا', 'کجا': 'کَجا', 'چه': 'چِه', 'چرا': 'چَرا', 'چگونه': 'چِگونِه', 'کی': 'کَی', 'کدام': 'کَدام', # Numbers 'یک': 'یَک', 'دو': 'دو', 'سه': 'سِه', 'چهار': 'چَهار', 'پنج': 'پَنج', 'شش': 'شِش', 'هفت': 'هَفت', 'هشت': 'هَشت', 'نه': 'نُه', 'ده': 'دَه', # Greetings and common phrases 'سلام': 'سَلام', 'خداحافظ': 'خُداحافِظ', 'ممنون': 'مَمنون', 'متشکرم': 'مُتَشَکِّرَم', 'بخشید': 'بَخشید', 'ببخشید': 'بِبَخشید', 'خواهش میکنم': 'خواهِش میکُنَم', 'چطوری': 'چِطوری', 'حالتان چطور است': 'حالِتان چِطور اَست', } # Improved morphological patterns self.morphological_patterns = [ # Past tense patterns (more accurate) (r'(\w+)یدم$', r'\1یدَم'), # -idam (I did) (r'(\w+)یدی$', r'\1یدی'), # -idi (you did) (r'(\w+)ید$', r'\1ید'), # -id (he/she did) (r'(\w+)یدیم$', r'\1یدیم'), # -idim (we did) (r'(\w+)یدید$', r'\1یدید'), # -idid (you all did) (r'(\w+)یدند$', r'\1یدَند'), # -idand (they did) # Present tense patterns (r'^می(\w+)م$', r'می\1َم'), # mi-...am (I do) (r'^می(\w+)ی$', r'می\1ی'), # mi-...i (you do) (r'^می(\w+)د$', r'می\1َد'), # mi-...ad (he/she does) (r'^می(\w+)یم$', r'می\1یم'), # mi-...im (we do) (r'^می(\w+)ید$', r'می\1ید'), # mi-...id (you all do) (r'^می(\w+)ند$', r'می\1َند'), # mi-...and (they do) # Compound verbs (r'(\w+)\s+می\s*کنم$', r'\1 میکُنَم'), (r'(\w+)\s+می\s*کنی$', r'\1 میکُنی'), (r'(\w+)\s+می\s*کند$', r'\1 میکُنَد'), (r'(\w+)\s+می\s*کنیم$', r'\1 میکُنیم'), (r'(\w+)\s+می\s*کنید$', r'\1 میکُنید'), (r'(\w+)\s+می\s*کنند$', r'\1 میکُنَند'), # Plural patterns (r'(\w+)ها$', r'\1ها'), # -ha plural (r'(\w+)ان$', r'\1ان'), # -an plural (r'(\w+)ات$', r'\1ات'), # -at plural # Possessive patterns (r'(\w+)م$', r'\1َم'), # my (r'(\w+)ت$', r'\1َت'), # your (r'(\w+)ش$', r'\1َش'), # his/her (r'(\w+)مان$', r'\1ِمان'), # our (r'(\w+)تان$', r'\1ِتان'), # your (plural) (r'(\w+)شان$', r'\1ِشان'), # their ] # Context-aware rules self.context_rules = [ # Prepositions usually get kasra (r'^(از|به|با|در|تا|برای|روی|زیر|کنار)$', r'\1'), # Common prefixes (r'^(ناخوش)', r'ناخوش'), (r'^(بی)', r'بی'), (r'^(هم)', r'هَم'), # Ezafe construction (more accurate) (r'(\w+)\s+(\w+)', self._handle_ezafe), ] def _handle_ezafe(self, match) -> str: """Handle Persian ezafe construction""" word1, word2 = match.groups() # Add kasra for ezafe if first word doesn't end in silent letter if word1[-1] not in ['ه', 'و']: return f"{word1}ِ {word2}" return f"{word1} {word2}" def _get_word_stress_pattern(self, word: str) -> str: """ Determine stress pattern for better vowel placement """ # Simple stress rules for Persian if len(word) <= 2: return word # Most Persian words have penultimate stress # Add fatha to the stressed syllable if len(word) >= 3: # Find consonant clusters and add appropriate vowels result = "" for i, char in enumerate(word): result += char if i < len(word) - 1: # Not the last character current_char = word[i] next_char = word[i + 1] # Add vowel between consonants if self._is_consonant(current_char) and self._is_consonant(next_char): # Choose vowel based on position and context if i < len(word) // 2: result += self.vowel_marks['fatha'] # َ else: result += self.vowel_marks['kasra'] # ِ return result return word def _is_consonant(self, char: str) -> bool: """Check if character is a Persian consonant""" consonants = 'بپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی' return char in consonants def _apply_phonological_rules(self, word: str) -> str: """Apply Persian phonological rules""" result = word # Rule: Short vowels in closed syllables result = re.sub(r'([بپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی])([بپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی])', r'\1َ\2', result) # Rule: Long vowels remain unchanged result = re.sub(r'([آاوی])', r'\1', result) # Rule: Silent letters (sukun) result = re.sub(r'([بپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی])$', r'\1ْ', result) return result def add_vowel_marks_advanced(self, text: str) -> str: """ Advanced vowel marking with multiple strategies """ if not text.strip(): return text # Preprocessing: normalize text text = self._normalize_text(text) # Split into sentences for better context sentences = re.split(r'[.!?؟।]', text) vocalized_sentences = [] for sentence in sentences: if sentence.strip(): vocalized_sentence = self._vocalize_sentence(sentence.strip()) vocalized_sentences.append(vocalized_sentence) return '. '.join(vocalized_sentences) def _normalize_text(self, text: str) -> str: """Normalize Persian text""" # Replace Arabic characters with Persian equivalents replacements = { 'ي': 'ی', # Arabic yeh to Persian yeh 'ك': 'ک', # Arabic kaf to Persian kaf 'ة': 'ه', # Arabic teh marbuta to heh } for arabic, persian in replacements.items(): text = text.replace(arabic, persian) return text def _vocalize_sentence(self, sentence: str) -> str: """Vocalize a sentence with context awareness""" words = sentence.split() vocalized_words = [] for i, word in enumerate(words): # Clean word clean_word = re.sub(r'[^\u0600-\u06FF]', '', word) if not clean_word: vocalized_words.append(word) continue # Strategy 1: Dictionary lookup if clean_word in self.expanded_dictionary: vocalized_word = word.replace(clean_word, self.expanded_dictionary[clean_word]) else: # Strategy 2: Morphological analysis vocalized_word = self._apply_morphological_patterns(word) # Strategy 3: Phonological rules if vocalized_word == word: # If no morphological pattern matched clean_vocalized = self._apply_phonological_rules(clean_word) vocalized_word = word.replace(clean_word, clean_vocalized) # Strategy 4: Context-based rules vocalized_word = self._apply_context_rules(vocalized_word, i, words) vocalized_words.append(vocalized_word) return ' '.join(vocalized_words) def _apply_morphological_patterns(self, word: str) -> str: """Apply morphological patterns""" result = word for pattern, replacement in self.morphological_patterns: if callable(replacement): result = re.sub(pattern, replacement, result) else: result = re.sub(pattern, replacement, result) return result def _apply_context_rules(self, word: str, position: int, sentence_words: List[str]) -> str: """Apply context-aware rules""" result = word # Check if word is at beginning of sentence if position == 0: # Sentence-initial rules pass # Check if word is followed by specific words if position < len(sentence_words) - 1: next_word = sentence_words[position + 1] # Add ezafe rules here return result # Integration with Hugging Face models (optional) def try_huggingface_model(text: str) -> str: """ Try to use a pre-trained model from Hugging Face This is a placeholder for actual model integration """ try: # Example: You could integrate with models like: # - HooshvareLab/bert-base-parsbert-uncased # - m3hrdadfi/albert-base-fa # For now, we'll return the text as-is return text except Exception as e: print(f"HF Model error: {e}") return text def vocalize_persian_text_advanced(text: str, use_hf_model: bool = False) -> str: """ Advanced Persian text vocalization with multiple strategies """ if not text.strip(): return "لطفاً متنی وارد کنید." vowelizer = AdvancedPersianVowelizer() # Strategy 1: Use HF model if available and requested if use_hf_model: try: result = try_huggingface_model(text) if result != text: # If model provided vocalization return result except: pass # Fall back to rule-based approach # Strategy 2: Advanced rule-based approach vocalized_text = vowelizer.add_vowel_marks_advanced(text) return vocalized_text # Enhanced Custom CSS custom_css = """ @import url('https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font@v30.1.0/dist/font-face.css'); .rtl-text { direction: rtl !important; text-align: right !important; font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important; font-size: 18px !important; line-height: 1.8 !important; } .rtl-text textarea { direction: rtl !important; text-align: right !important; font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important; font-size: 18px !important; line-height: 1.8 !important; padding: 15px !important; } .output-box { direction: rtl !important; text-align: right !important; font-family: 'Vazir', 'B Nazanin', 'Tahoma', 'Arial Unicode MS', sans-serif !important; font-size: 20px !important; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%) !important; padding: 20px !important; border-radius: 12px !important; border: 2px solid #28a745 !important; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1) !important; min-height: 120px !important; line-height: 2 !important; } .gradio-container { max-width: 1200px !important; margin: auto !important; } .title-section { text-align: center !important; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; color: white !important; padding: 30px !important; border-radius: 15px !important; margin-bottom: 30px !important; box-shadow: 0 8px 15px rgba(0, 0, 0, 0.1) !important; } .improvement-info { background-color: #e3f2fd !important; border: 1px solid #2196f3 !important; border-radius: 10px !important; padding: 20px !important; margin: 20px 0 !important; } """ # Create advanced Gradio interface with gr.Blocks(css=custom_css, title="Advanced Persian Vowel Marks Generator", theme=gr.themes.Soft()) as demo: gr.HTML("""
ویژگیهای جدید: واژگان گسترده، تحلیل صرفی، قوانین آوایی