Ethosoft

Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper

edec8b7 about 2 months ago

5.81 kB

	"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer.

	A zero-dependency Turkish tokenizer that segments text into
	morphologically meaningful tokens using deterministic heuristics,
	a bundled TDK dictionary, and a candidate-based segmentation engine.

	Usage::

	from nedo_turkish_tokenizer import NedoTurkishTokenizer

	tok = NedoTurkishTokenizer()
	tokens = tok.tokenize("İstanbul'da meeting'e katılamadım")
	for t in tokens:
	print(t["token"], t["token_type"], t["morph_pos"])

	Output fields per token:
	token : str — token string (leading space = word-initial)
	token_type : str — ROOT \| SUFFIX \| FOREIGN \| PUNCT \|
	NUM \| DATE \| UNIT \| URL \| MENTION \| HASHTAG \| EMOJI \| ACRONYM
	morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix…
	(+ optional _* metadata fields)
	"""

	from __future__ import annotations

	import os
	import multiprocessing
	from concurrent.futures import ProcessPoolExecutor, as_completed

	from .engine import TokenizationEngine
	from .types import SPECIAL_TYPES


	# ── Parallel worker helpers ──────────────────────────────────────────────────

	_worker_tok: "NedoTurkishTokenizer \| None" = None


	def _init_worker() -> None:
	global _worker_tok
	_worker_tok = NedoTurkishTokenizer()


	def _tokenize_one(text: str) -> list[dict]:
	assert _worker_tok is not None
	return _worker_tok.tokenize(text)


	# ══════════════════════════════════════════════════════════════════════════════


	class NedoTurkishTokenizer:
	"""Self-contained Turkish morphological tokenizer.

	Requires no external dependencies — all tokenization logic,
	dictionaries, and heuristics are bundled within the package.

	Example::

	from nedo_turkish_tokenizer import NedoTurkishTokenizer

	tok = NedoTurkishTokenizer()
	tokens = tok("İstanbul'da meeting'e katılamadım")
	for t in tokens:
	print(t["token"], t["token_type"], t["morph_pos"])
	"""

	def __init__(self) -> None:
	self._engine = TokenizationEngine()

	# ── Public API ─────────────────────────────────────────────────────────

	def __call__(self, text: str) -> list[dict]:
	"""Shorthand for ``tokenize(text)``."""
	return self.tokenize(text)

	def tokenize(self, text: str) -> list[dict]:
	"""Tokenize a single text string.

	Returns a list of token dicts, each containing at minimum:
	``token``, ``token_type``, ``morph_pos``, plus optional
	``_*`` metadata fields.
	"""
	return self._engine.tokenize(text)

	def batch_tokenize(
	self,
	texts: list[str],
	workers: int \| None = None,
	chunk_size: int = 64,
	) -> list[list[dict]]:
	"""Tokenize a list of texts in parallel.

	Args:
	texts: List of strings to tokenize.
	workers: Number of worker processes (``None`` = all CPUs).
	chunk_size: Below this count, run sequentially.

	Returns:
	List of token lists, in the same order as texts.
	"""
	if not texts:
	return []

	n = workers or os.cpu_count() or 4

	if len(texts) <= chunk_size or n == 1:
	return [self.tokenize(t) for t in texts]

	results: list[list[dict] \| None] = [None] * len(texts)

	with ProcessPoolExecutor(max_workers=n, initializer=_init_worker) as pool:
	futs = {pool.submit(_tokenize_one, t): i for i, t in enumerate(texts)}
	for fut in as_completed(futs):
	i = futs[fut]
	try:
	results[i] = fut.result()
	except Exception as exc:
	# Fallback: tokenize in the main process
	results[i] = self.tokenize(texts[i])
	print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")

	return results # type: ignore[return-value]

	# ── Statistics ─────────────────────────────────────────────────────────

	def stats(self, tokens: list[dict]) -> dict:
	"""Compute morphological coverage statistics for a token list."""
	total = len(tokens)
	if total == 0:
	return {k: 0 for k in (
	"total", "roots", "suffixes", "foreign",
	"punct", "special", "tr_pct", "pure_pct",
	)}
	roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
	suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
	foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
	punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
	special = sum(1 for t in tokens if t["token_type"] in SPECIAL_TYPES)
	tr = roots + suffixes + foreign + punct + special
	pure = sum(
	1 for t in tokens
	if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
	and not t["token"].strip().startswith("<")
	)
	return {
	"total": total,
	"roots": roots,
	"suffixes": suffixes,
	"foreign": foreign,
	"punct": punct,
	"special": special,
	"tr_pct": round(tr / total * 100, 2),
	"pure_pct": round(pure / total * 100, 2),
	}