Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7 | """Apostrophe-aware segmentation for Turkish text. | |
| Handles two distinct cases: | |
| 1. **Turkish proper names** — İstanbul'da, Ankara'ya | |
| → ROOT(İstanbul) + PUNCT(') + SUFFIX(da) | |
| 2. **Foreign stems with Turkish suffixes** — meeting'e, zoom'da | |
| → FOREIGN(meeting) + SUFFIX(e) | |
| The decision between these two cases uses: | |
| - Turkish character detection (ç,ğ,ı,ş,ö,ü → Turkish) | |
| - TDK dictionary lookup | |
| - Proper noun list | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from ._suffix_table import APOSTROPHE_SUFFIXES, SUFFIX_MAP | |
| from .normalization import has_turkish_chars, turkish_lower | |
| from .resources import load_proper_nouns, load_tdk_words | |
| # Matches word'suffix patterns (both ASCII and Unicode apostrophes) | |
| _APO_RE = re.compile( | |
| r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b" | |
| ) | |
def is_turkish_base(word: str) -> bool:
    """Decide whether *word* counts as Turkish for apostrophe handling.

    The result tells the caller whether ``word'suffix`` is a Turkish proper
    name (the apostrophe stays as a punctuation boundary) or a foreign stem
    (to be emitted as FOREIGN root + SUFFIX).

    The checks run in this order and stop at the first hit:

    1. The lowered word contains Turkish-specific characters.
    2. The lowered word is in the proper-noun list.
    3. The lowered word is in the TDK word list (skipped when that list
       is empty or unavailable).
    4. Fallback: words shorter than 4 characters are treated as Turkish,
       because short words are ambiguous and short Turkish words are common.

    Args:
        word: The base word that precedes the apostrophe.

    Returns:
        True if the word should be treated as Turkish, False otherwise.
    """
    lowered = turkish_lower(word)

    # NOTE(review): if turkish_lower maps ASCII "I" to dotless "ı", a foreign
    # word containing a capital I would pass the character test below —
    # confirm against the normalization module.
    if has_turkish_chars(lowered) or lowered in load_proper_nouns():
        return True

    # The dictionary is only consulted once the cheaper checks have failed.
    dictionary = load_tdk_words()
    in_dictionary = bool(dictionary) and lowered in dictionary

    # Anything still unresolved is Turkish only when it is very short.
    return in_dictionary or len(lowered) < 4
def split_apostrophe_words(
    text: str,
) -> tuple[str, list[tuple[str, str]]]:
    """Process apostrophe patterns in *text*.

    For **foreign** stems followed by a known Turkish suffix after the
    apostrophe, the apostrophe is replaced with a space so the word can
    later be segmented as FOREIGN ROOT + SUFFIX.

    For **Turkish** bases (İstanbul'da), and for foreign bases whose suffix
    is not in ``APOSTROPHE_SUFFIXES``, the matched text is left unchanged.

    Args:
        text: Input text that may contain ``word'suffix`` patterns with an
            ASCII or typographic (U+2019) apostrophe.

    Returns:
        ``(modified_text, [(foreign_base_lower, suffix_lower), ...])``.
        The rewritten text keeps the original casing of base and suffix;
        only the recorded pairs are lower-cased.
    """
    foreign_splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if is_turkish_base(base):
            return m.group(0)  # Keep apostrophe for Turkish names

        # Lower the suffix with the same helper used for the base.
        # str.lower() turns "İ" into "i" + U+0307 and "I" into "i", so an
        # upper-case suffix such as 'DAKİ or 'NIN would either never match
        # the suffix table or be recorded with the wrong vowel.
        suffix_lower = turkish_lower(suffix)
        if suffix_lower not in APOSTROPHE_SUFFIXES:
            return m.group(0)

        foreign_splits.append((turkish_lower(base), suffix_lower))
        return f"{base} {suffix}"  # Drop apostrophe → space

    modified = _APO_RE.sub(_repl, text)
    return modified, foreign_splits
def build_apostrophe_tokens(
    word: str, suffix_str: str, *, is_foreign: bool
) -> list[dict[str, object]]:
    """Create token dicts for a word + apostrophe + suffix pattern.

    Args:
        word: The base word (before apostrophe).
        suffix_str: The suffix string (after apostrophe). It is emitted
            with its original casing; only the label lookup is lower-cased.
        is_foreign: Whether the base word is foreign.

    Returns:
        List of token dicts. Foreign bases yield
        ``FOREIGN(word) + SUFFIX(suffix)``; Turkish bases yield
        ``ROOT(word) + PUNCT(') + SUFFIX(suffix)``. The PUNCT token is
        always the ASCII apostrophe. The suffix label falls back to
        ``"-SFX"`` when the suffix is not in ``SUFFIX_MAP``.
    """
    # Use the Turkish-aware lowering that the rest of this module applies to
    # base words: str.lower() maps "İ" to "i" + U+0307 and "I" to "i", which
    # makes upper-case suffixes miss (or mis-hit) their SUFFIX_MAP entry.
    label = SUFFIX_MAP.get(turkish_lower(suffix_str), "-SFX")

    # The suffix token is the same in both branches.
    suffix_token: dict[str, object] = {
        "token": suffix_str, "token_type": "SUFFIX", "morph_pos": 1,
        "_apo_suffix": True, "_suffix_label": label,
    }

    if is_foreign:
        # Foreign: FOREIGN(word) + SUFFIX(suffix)
        return [
            {
                "token": f" {word}", "token_type": "FOREIGN", "morph_pos": 0,
                "_foreign": True,
            },
            suffix_token,
        ]

    # Turkish: ROOT(word) + PUNCT(') + SUFFIX(suffix)
    return [
        {
            "token": f" {word}", "token_type": "ROOT", "morph_pos": 0,
        },
        {
            "token": "'", "token_type": "PUNCT", "morph_pos": 0,
            "_punct": True,
        },
        suffix_token,
    ]