"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer. A zero-dependency Turkish tokenizer that segments text into morphologically meaningful tokens using deterministic heuristics, a bundled TDK dictionary, and a candidate-based segmentation engine. Usage:: from nedo_turkish_tokenizer import NedoTurkishTokenizer tok = NedoTurkishTokenizer() tokens = tok.tokenize("İstanbul'da meeting'e katılamadım") for t in tokens: print(t["token"], t["token_type"], t["morph_pos"]) Output fields per token: token : str — token string (leading space = word-initial) token_type : str — ROOT | SUFFIX | FOREIGN | PUNCT | NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI | ACRONYM morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix… (+ optional _* metadata fields) """ from __future__ import annotations import os import multiprocessing from concurrent.futures import ProcessPoolExecutor, as_completed from .engine import TokenizationEngine from .types import SPECIAL_TYPES # ── Parallel worker helpers ────────────────────────────────────────────────── _worker_tok: "NedoTurkishTokenizer | None" = None def _init_worker() -> None: global _worker_tok _worker_tok = NedoTurkishTokenizer() def _tokenize_one(text: str) -> list[dict]: assert _worker_tok is not None return _worker_tok.tokenize(text) # ══════════════════════════════════════════════════════════════════════════════ class NedoTurkishTokenizer: """Self-contained Turkish morphological tokenizer. Requires **no external dependencies** — all tokenization logic, dictionaries, and heuristics are bundled within the package. Example:: from nedo_turkish_tokenizer import NedoTurkishTokenizer tok = NedoTurkishTokenizer() tokens = tok("İstanbul'da meeting'e katılamadım") for t in tokens: print(t["token"], t["token_type"], t["morph_pos"]) """ def __init__(self) -> None: self._engine = TokenizationEngine() # ── Public API ───────────────────────────────────────────────────────── def __call__(self, text: str) -> list[dict]: """Shorthand for ``tokenize(text)``.""" return self.tokenize(text) def tokenize(self, text: str) -> list[dict]: """Tokenize a single text string. Returns a list of token dicts, each containing at minimum: ``token``, ``token_type``, ``morph_pos``, plus optional ``_*`` metadata fields. """ return self._engine.tokenize(text) def batch_tokenize( self, texts: list[str], workers: int | None = None, chunk_size: int = 64, ) -> list[list[dict]]: """Tokenize a list of texts in parallel. Args: texts: List of strings to tokenize. workers: Number of worker processes (``None`` = all CPUs). chunk_size: Below this count, run sequentially. Returns: List of token lists, in the same order as *texts*. """ if not texts: return [] n = workers or os.cpu_count() or 4 if len(texts) <= chunk_size or n == 1: return [self.tokenize(t) for t in texts] results: list[list[dict] | None] = [None] * len(texts) with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool: futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)} for fut in as_completed(futs): i = futs[fut] try: results[i] = fut.result() except Exception as exc: # Fallback: tokenize in the main process results[i] = self.tokenize(texts[i]) print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}") return results # type: ignore[return-value] # ── Statistics ───────────────────────────────────────────────────────── def stats(self, tokens: list[dict]) -> dict: """Compute morphological coverage statistics for a token list.""" total = len(tokens) if total == 0: return {k: 0 for k in ( "total", "roots", "suffixes", "foreign", "punct", "special", "tr_pct", "pure_pct", )} roots = sum(1 for t in tokens if t["token_type"] == "ROOT") suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX") foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN") punct = sum(1 for t in tokens if t["token_type"] == "PUNCT") special = sum(1 for t in tokens if t["token_type"] in SPECIAL_TYPES) tr = roots + suffixes + foreign + punct + special pure = sum( 1 for t in tokens if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN") and not t["token"].strip().startswith("<") ) return { "total": total, "roots": roots, "suffixes": suffixes, "foreign": foreign, "punct": punct, "special": special, "tr_pct": round(tr / total * 100, 2), "pure_pct": round(pure / total * 100, 2), }