Ethosoft
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
edec8b7 | """NedoTurkishTokenizer — self-contained Turkish morphological tokenizer. | |
| A zero-dependency Turkish tokenizer that segments text into | |
| morphologically meaningful tokens using deterministic heuristics, | |
| a bundled TDK dictionary, and a candidate-based segmentation engine. | |
| Usage:: | |
| from nedo_turkish_tokenizer import NedoTurkishTokenizer | |
| tok = NedoTurkishTokenizer() | |
| tokens = tok.tokenize("İstanbul'da meeting'e katılamadım") | |
| for t in tokens: | |
| print(t["token"], t["token_type"], t["morph_pos"]) | |
| Output fields per token: | |
| token : str — token string (leading space = word-initial) | |
| token_type : str — ROOT | SUFFIX | FOREIGN | PUNCT | | |
| NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI | ACRONYM | |
| morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix… | |
| (+ optional _* metadata fields) | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import multiprocessing | |
| from concurrent.futures import ProcessPoolExecutor, as_completed | |
| from .engine import TokenizationEngine | |
| from .types import SPECIAL_TYPES | |
| # ── Parallel worker helpers ────────────────────────────────────────────────── | |
# Per-process tokenizer slot: stays ``None`` until ``_init_worker`` runs in
# the process that owns this module copy.
_worker_tok: "NedoTurkishTokenizer | None" = None


def _init_worker() -> None:
    """Build a tokenizer and store it in the module-level ``_worker_tok`` slot.

    Passed as the ``initializer`` of the process pool created in
    ``NedoTurkishTokenizer.batch_tokenize``.
    """
    global _worker_tok
    tokenizer = NedoTurkishTokenizer()
    _worker_tok = tokenizer
def _tokenize_one(text: str) -> list[dict]:
    """Tokenize *text* with the tokenizer stored in ``_worker_tok``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``_worker_tok.tokenize(text)`` returns (a list of token
        dicts according to the annotations).

    Raises:
        RuntimeError: If ``_worker_tok`` is still ``None``, i.e.
            ``_init_worker`` has not run in the current process.
    """
    tokenizer = _worker_tok
    if tokenizer is None:
        # An explicit raise instead of ``assert``: asserts are removed when
        # Python runs with -O, which would turn this into an opaque
        # AttributeError on None.
        raise RuntimeError(
            "worker tokenizer is not initialised; _init_worker() must run "
            "in this process before _tokenize_one() is called"
        )
    return tokenizer.tokenize(text)
| # ────────────────────────────────────────────────────────────────────────────── | |
class NedoTurkishTokenizer:
    """Self-contained Turkish morphological tokenizer.

    Requires **no external dependencies**: all tokenization logic,
    dictionaries, and heuristics are bundled within the package.  Every
    tokenization call is delegated to a ``TokenizationEngine`` instance
    created in the constructor.

    Example::

        from nedo_turkish_tokenizer import NedoTurkishTokenizer

        tok = NedoTurkishTokenizer()
        tokens = tok("İstanbul'da meeting'e katılamadım")
        for t in tokens:
            print(t["token"], t["token_type"], t["morph_pos"])
    """

    def __init__(self) -> None:
        """Create the tokenization engine used by every other method."""
        self._engine = TokenizationEngine()

    # ── Public API ─────────────────────────────────────────────────────────

    def __call__(self, text: str) -> list[dict]:
        """Make the instance callable; equivalent to ``tokenize(text)``."""
        return self.tokenize(text)

    def tokenize(self, text: str) -> list[dict]:
        """Tokenize one string by delegating to the engine.

        Returns a list of token dicts, each containing at minimum
        ``token``, ``token_type`` and ``morph_pos``, plus optional ``_*``
        metadata fields.
        """
        engine = self._engine
        return engine.tokenize(text)

    def batch_tokenize(
        self,
        texts: list[str],
        workers: int | None = None,
        chunk_size: int = 64,
    ) -> list[list[dict]]:
        """Tokenize many texts, using a process pool for large batches.

        Args:
            texts: Strings to tokenize.
            workers: Number of worker processes.  A falsy value (``None``
                or ``0``) means ``os.cpu_count()``, or 4 when that is
                unavailable.
            chunk_size: Batches with at most this many texts are processed
                sequentially in the calling process.

        Returns:
            One token list per input text, in the same order as *texts*.
        """
        if not texts:
            return []

        n_procs = workers or os.cpu_count() or 4
        if len(texts) <= chunk_size or n_procs == 1:
            # Small batch or a single worker: no pool is started.
            return list(map(self.tokenize, texts))

        # Futures finish in arbitrary order, so each result is written
        # back into the slot of the text it belongs to.
        ordered: list[list[dict] | None] = [None] * len(texts)
        with ProcessPoolExecutor(
            max_workers=n_procs, initializer=_init_worker
        ) as pool:
            index_of = {}
            for i, text in enumerate(texts):
                index_of[pool.submit(_tokenize_one, text)] = i

            for done in as_completed(index_of):
                i = index_of[done]
                try:
                    ordered[i] = done.result()
                except Exception as exc:
                    # Best effort: redo the failed item in this process
                    # and report which index needed the fallback.
                    ordered[i] = self.tokenize(texts[i])
                    print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
        return ordered  # type: ignore[return-value]

    # ── Statistics ─────────────────────────────────────────────────────────

    def stats(self, tokens: list[dict]) -> dict:
        """Summarise the token types found in *tokens*.

        Returns a dict with the keys ``total``, ``roots``, ``suffixes``,
        ``foreign``, ``punct``, ``special``, ``tr_pct`` and ``pure_pct``.
        Both percentages are relative to ``total`` and rounded to two
        decimals.  For an empty token list every value is ``0``.
        """
        total = len(tokens)
        if total == 0:
            return dict.fromkeys(
                (
                    "total", "roots", "suffixes", "foreign",
                    "punct", "special", "tr_pct", "pure_pct",
                ),
                0,
            )

        roots = suffixes = foreign = punct = special = pure = 0
        for entry in tokens:
            kind = entry["token_type"]
            if kind == "ROOT":
                roots += 1
            elif kind == "SUFFIX":
                suffixes += 1
            elif kind == "FOREIGN":
                foreign += 1
            elif kind == "PUNCT":
                punct += 1

            # Counted independently of the chain above, exactly like the
            # separate membership test on SPECIAL_TYPES it replaces.
            if kind in SPECIAL_TYPES:
                special += 1

            # "Pure" tokens: ROOT/SUFFIX/FOREIGN whose stripped text does
            # not start with "<".
            if kind in ("ROOT", "SUFFIX", "FOREIGN"):
                if not entry["token"].strip().startswith("<"):
                    pure += 1

        tr = roots + suffixes + foreign + punct + special
        return {
            "total": total,
            "roots": roots,
            "suffixes": suffixes,
            "foreign": foreign,
            "punct": punct,
            "special": special,
            "tr_pct": round(tr / total * 100, 2),
            "pure_pct": round(pure / total * 100, 2),
        }