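# Tokenizer comparison Space: a Gradio app that tokenizes input text with several
# tokenizers (tiktoken, Hugging Face, ByT5, and Tekken via helpers in utils) and
# renders efficiency rankings, interactive token highlighting, token IDs, and charts.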
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import (
    clean_token_display,
    get_normalization_methods,
    normalize_text,
    tokenize_w_tekken,
    tokenize_with_byt5,
    tokenize_with_hf,
    tokenize_with_tiktoken,
)

TIKTOKENS = ["gpt-4o", "gpt-2"]
HF = [
    "llama-3", "gemma-2", "qwen3", "mbert", "phi-3", "xglm", "bloom",
    "aya-expanse", "comma", "tokenmonster", "byt5",
]
available_tokenizers = TIKTOKENS + HF + ["tekken"]
# Tokenizers selected when the app loads; e.g. ["xglm"] or available_tokenizers
# to pre-select everything. Currently nothing is pre-selected.
pre_selected_tokenizers = []

OUT_FILE = Path("paper-outs.txt")
if not OUT_FILE.exists():
    OUT_FILE.touch()

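# Dispatch to the right backend and append each run to OUT_FILE. Downstream code
# expects each result dict to contain at least: "model", "encoding", "vocab_size",
# "token_count", "compression_ratio", and "tokens" (each token with "text", "id",
# "type", "is_subword"); the exact shape is defined by the helpers in utils.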
def tokenize(model, text):
    if model in ["gpt-4", "gpt-2", "gpt-4o"]:
        toks = tokenize_with_tiktoken(text, model)
    elif model in ["tekken"]:
        toks = tokenize_w_tekken(text, model)
    elif "byt5" in model:
        toks = tokenize_with_byt5(text, model)
    else:
        toks = tokenize_with_hf(text, model)

    # Log the run; UTF-8 encoding so non-ASCII tokens survive.
    with open(OUT_FILE, "a", encoding="utf-8") as file:
        file.write(toks["model"] + "\n")
        file.write(f"Text: {text}\n")
        s = ",".join(str(t["text"]) for t in toks["tokens"]) + "\n"
        # s = s.encode("utf-8")
        # s = s.encode('latin1').decode('utf-8')
        file.write(s)
        file.write("\n")
    return toks

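# Top-level comparison: returns (efficiency ranking markdown, interactive HTML,
# token-ID markdown, optional detailed analysis, efficiency chart, distribution
# chart), in the order expected by the Gradio outputs wired up below.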
def compare_tokenizers(text, selected_models, show_details=False):
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    results = {}
    for model in selected_models:
        results[model] = tokenize(model, text)

    # Generate outputs
    efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
        results
    )
    detailed_output = generate_detailed_analysis(results) if show_details else ""
    efficiency_chart = create_efficiency_chart(results)
    token_distribution_chart = create_token_distribution_chart(results)

    return (
        efficiency_output,
        tokenization_html,
        token_ids_output,
        detailed_output,
        efficiency_chart,
        token_distribution_chart,
    )

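# Build the three basic views from a {model: result} mapping: ranking markdown,
# interactive tokenization HTML, and token-ID tables.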
def generate_basic_comparison(results):
    if not results:
        return "No results to display.", "", ""

    # Efficiency ranking
    sorted_models = sorted(results.items(), key=lambda x: x[1]["token_count"])
    ranking_output = []
    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
    for i, (model, result) in enumerate(sorted_models):
        if "error" in result:
            ranking_output.append(
                f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Generate interactive tokenization display
    tokenization_html = generate_interactive_tokenization(results)
    # Generate token ID tables
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display

def generate_interactive_tokenization(results):
    """Generate HTML with working hover highlighting across tokenizers"""
    # TODO: main visualization
    if not results:
        return "<p>No tokenization results to display.</p>"

    html_parts = []
    # Add styles first
    html_parts.append("""
    <div id="tokenizer-container" class="tokenizer-container">
    <style>
        .tokenizer-container {
            display: flex;
            flex-wrap: wrap;
            justify-content: space-between;
            gap: 20px;
        }
        .tokenizer-section {
            margin-bottom: 20px;
            border: 1px solid #e0e0e0;
            border-radius: 8px;
            padding: 15px;
            background: white;
            flex-wrap: wrap;
            display: inline-block;
            justify-content: space-between;
        }
        .tokenizer-header {
            font-weight: bold;
            font-size: 18px;
            margin-bottom: 10px;
            color: #2c3e50;
        }
        .token-display {
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            line-height: 1.8;
            word-wrap: break-word;
        }
        .token {
            display: inline-block;
            margin: 2px;
            padding: 4px 8px;
            border-radius: 4px;
            border: 1px solid;
            cursor: pointer;
            transition: all 0.2s ease;
            position: relative;
            font-size: 14px;
            user-select: none;
        }
        .token:hover {
            transform: scale(1.05);
            z-index: 10;
            box-shadow: 0 2px 8px rgba(0,0,0,0.2);
        }
        .token.highlighted {
            background: #ff6b6b !important;
            border-color: #e55353 !important;
            color: white !important;
            box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
            transform: scale(1.1) !important;
            z-index: 100 !important;
        }
        .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
        .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
        .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
        .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
        .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
        .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
        .token-subword {
            background: #fff8e1 !important;
            border-color: #ffc107 !important;
            border-style: dashed !important;
        }
        .token-stats {
            display: inline-block;
            margin-left: 10px;
            padding: 2px 6px;
            background: #f8f9fa;
            border-radius: 3px;
            font-size: 12px;
            color: #666;
        }
        .highlight-info {
            position: fixed;
            top: 10px;
            right: 10px;
            background: #333;
            color: white;
            padding: 8px 12px;
            border-radius: 4px;
            font-size: 12px;
            display: none; /* hidden until JS shows it */
            z-index: 1000;
        }
        /* Multi-token span styles */
        .token-span-container {
            display: inline-flex;
            margin: 2px;
        }
        .token-multi-span {
            background: linear-gradient(45deg, #e8f5e8 25%, #f3e5f5 25%, #f3e5f5 50%, #e8f5e8 50%, #e8f5e8 75%, #f3e5f5 75%);
            background-size: 8px 8px;
        }
        .token-span-part {
            margin: 0 !important;
            border-radius: 0 !important;
            border-right: none !important;
            position: relative;
            min-width: 20px;
            text-align: center;
            font-size: 11px;
        }
        /* Hover effect for multi-token spans */
        .token-span-container:hover .token-span-part {
            transform: scale(1.02);
            box-shadow: 0 2px 8px rgba(0,0,0,0.15);
        }
        /* Different visual for multi-token spans */
        .token-multi-span.token-word {
            background: repeating-linear-gradient(45deg, #e8f5e8, #e8f5e8 4px, #d4edda 4px, #d4edda 8px);
        }
        .token-multi-span.token-number {
            background: repeating-linear-gradient(45deg, #f3e5f5, #f3e5f5 4px, #e1bee7 4px, #e1bee7 8px);
        }
        .token-multi-span.token-punctuation {
            background: repeating-linear-gradient(45deg, #ffebee, #ffebee 4px, #ffcdd2 4px, #ffcdd2 8px);
        }
        /* Multi-token span styles (later rules refine the ones above) */
        .token-span-container {
            display: inline-flex;
            margin: 2px;
            cursor: pointer;
        }
        .token-multi-span {
            /* Distinctive background pattern for multi-token spans */
            background: repeating-linear-gradient(
                45deg,
                transparent,
                transparent 2px,
                rgba(0,0,0,0.1) 2px,
                rgba(0,0,0,0.1) 4px
            );
        }
        .token-span-part {
            margin: 0 !important;
            border-radius: 0 !important;
            border-right: none !important;
            position: relative;
            padding: 4px 6px;
            border: 1px dashed rgba(0,0,0,0.3) !important;
            pointer-events: none; /* Prevent individual box clicks */
        }
        .token-span-first {
            border-radius: 4px 0 0 4px !important;
        }
        .token-span-last {
            border-radius: 0 4px 4px 0 !important;
            border-right: 1px solid !important;
        }
        /* Connecting lines between boxes */
        .token-span-part:not(.token-span-last)::after {
            content: '';
            position: absolute;
            top: 0;
            right: -1px;
            width: 1px;
            height: 100%;
            background: rgba(0,0,0,0.3);
            z-index: 1;
        }
        /* Hover effect for entire multi-token span */
        .token-span-container:hover .token-span-part {
            transform: scale(1.05);
            box-shadow: 0 2px 8px rgba(0,0,0,0.2);
        }
        .token-span-container.highlighted .token-span-part {
            background: #ff6b6b !important;
            border-color: #e55353 !important;
            color: white !important;
            box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
            transform: scale(1.1) !important;
            z-index: 100 !important;
        }
        /* Different patterns for different token types when multi-span */
        .token-multi-span.token-word .token-span-part {
            background: #e8f5e8;
            border-color: #4caf50;
            color: #2e7d32;
        }
        .token-multi-span.token-number .token-span-part {
            background: #f3e5f5;
            border-color: #9c27b0;
            color: #7b1fa2;
        }
        .token-multi-span.token-punctuation .token-span-part {
            background: #ffebee;
            border-color: #f44336;
            color: #c62828;
        }
    </style>
    <div class="highlight-info" id="highlight-info"></div>
    <script>
    function clearHighlights() {
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });
        const info = document.getElementById('highlight-info');
        if (info) {
            info.style.display = 'none';
        }
    }
    function highlightTokens(targetText) {
        // Clear all highlights
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });
        // Highlight matching tokens and spans
        let count = 0;
        // Single tokens
        document.querySelectorAll('.token').forEach(function(token) {
            if (token.getAttribute('data-text') === targetText) {
                token.classList.add('highlighted');
                count++;
            }
        });
        // Multi-token spans
        document.querySelectorAll('.token-span-container').forEach(function(span) {
            if (span.getAttribute('data-text') === targetText) {
                span.classList.add('highlighted');
                count++;
            }
        });
        // Show info
        const info = document.getElementById('highlight-info');
        if (info) {
            const displayText = targetText === ' ' ? '(space)' : targetText;
            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
            info.style.display = 'block';
        }
    }
    </script>
    """)
    # Generate tokenizer sections with inline event handlers
    for model, result in results.items():
        if "error" in result:
            html_parts.append(f"""
            <div class="tokenizer-section">
                <div class="tokenizer-header">{result["model"]} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
            </div>
            """)
            continue

        html_parts.append(f"""
        <div class="tokenizer-section">
            <div class="tokenizer-header">
                {result["model"]}
                <span class="token-stats">
                    {result["token_count"]} tokens |
                    {result["encoding"]} |
                    {result["compression_ratio"]:.2f}x compression
                </span>
            </div>
            <div class="token-display">
        """)
        # Add tokens with inline event handlers
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            token_text = clean_token_display(token_text)
            display_text = token_text if token_text.strip() else "·"
            if token_text == "<newline>":
                html_parts.append("<br>")
                continue

            # Check if this token spans multiple token IDs
            token_ids = token["id"] if isinstance(token["id"], list) else [token["id"]]
            is_multi_token = len(token_ids) > 1

            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1

            # Create unique identifier for this token occurrence
            token_id = f"token_{model}_{i}"

            # Escape text for HTML and JavaScript - be very careful with quotes
            escaped_text = (
                token_text.replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace('"', '\\"')
                .replace("\r", "\\r")
                .replace("\n", "\\n")
            )
            escaped_display = (
                display_text.replace('"', "&quot;")
                .replace("'", "&#39;")
                .replace("\r", "\n")
            )
            if is_multi_token:
                # Create a container for the multi-token span
                span_id = f"span_{model}_{i}"
                token_ids_str = ",".join(map(str, token_ids))
                html_parts.append(f"""<span class="token-span-container"
                    id="{span_id}_container"
                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                    data-ids="{token_ids_str}"
                    data-position="{i}"
                    data-model="{model}"
                    onmouseover="highlightTokens('{escaped_text}')"
                    onmouseout="clearHighlights()"
                    onclick="alert('Token: \\'{escaped_text}\\'\\nIDs: [{token_ids_str}]\\nModel: {model}\\nSpans {len(token_ids)} token IDs')"
                    title="Text: '{token_text}' | IDs: [{token_ids_str}] | Type: {token["type"]} | Subword: {token["is_subword"]}">""")

                # Create individual boxes for each token ID - but they act as one unit
                for j, tid in enumerate(token_ids):
                    token_id = f"token_{model}_{i}_{j}"
                    box_class = f"{token_class} token-span-part"
                    box_content = ""
                    # Add position indicators for styling
                    if j == 0:
                        box_class += " token-span-first"
                        box_content = escaped_display
                    elif j == len(token_ids) - 1:
                        box_class += " token-span-last"
                    else:
                        box_class += " token-span-middle"

                    # Each box shows the same text (the combined character/text)
                    html_parts.append(f"""<span class="{box_class}"
                        id="{token_id}"
                        data-token-id="{tid}">{box_content}</span>""")

                html_parts.append("</span>")
            else:
                # Single token - original behavior
                token_id = f"token_{model}_{i}"
                html_parts.append(f"""<span class="{token_class}"
                    id="{token_id}"
                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                    data-id="{token_ids[0]}"
                    data-position="{i}"
                    data-model="{model}"
                    title="Text: '{token_text}' | ID: {token_ids[0]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
                    onmouseover="highlightTokens('{escaped_text}')"
                    onmouseout="clearHighlights()"
                    onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token_ids[0]}\\nModel: {model}')">{escaped_display}</span>""")

        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{len(result["tokens"])}
                ({subword_count / len(result["tokens"]) * 100:.1f}%)
            </div>
        </div>
        """)

    html_parts.append("</div>")
    return "".join(html_parts)

def generate_token_ids_display(results):
    """Generate a clean display of token IDs for each tokenizer"""
    if not results:
        return "No token IDs to display."

    output = []
    output.append("## 🔢 Token IDs by Tokenizer")
    for model, result in results.items():
        if "error" in result:
            output.append(f"\n### {result['model']} ❌")
            output.append(f"Error: {result['error']}")
            continue

        output.append(f"\n### {result['model']}")
        output.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        # Display token IDs in a readable format
        token_ids = [str(token["id"]) for token in result["tokens"]]

        # Group IDs for better readability (10 per line)
        lines = []
        for i in range(0, len(token_ids), 10):
            line_ids = token_ids[i : i + 10]
            lines.append(" ".join(line_ids))

        output.append("```")
        output.append("\n".join(lines))
        output.append("```")

        # Add some statistics
        unique_ids = len(set(token_ids))
        output.append(
            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
        )

    return "\n".join(output)

def compare_with_normalization(
    text, selected_models, normalization_method, show_details=False
):
    """Compare tokenizers with optional normalization"""
    normalized_text = normalize_text(text, normalization_method)
    print(
        "[DEBUG] Before normalization:", text, "\nAfter normalization:", normalized_text
    )

    # Get both original and normalized results
    original_results = {}
    normalized_results = {}
    for model in selected_models:
        original_results[model] = tokenize(model, text)
        if normalization_method != "none":
            normalized_results[model] = tokenize(model, normalized_text)

    return original_results, normalized_results, normalized_text

def generate_detailed_analysis(results):
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    output = []
    output.append("## 🔍 Detailed Analysis")

    # Find common tokens
    all_token_sets = []
    for model, result in results.items():
        if "error" not in result:
            token_texts = {token["text"] for token in result["tokens"]}
            all_token_sets.append(token_texts)

    if all_token_sets:
        common_tokens = set.intersection(*all_token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            common_display = [
                f"`{token}`" if token != " " else "`·`"
                for token in list(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
            output.append(f"**{result['model']}**: {', '.join(type_display)}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for model, result in results.items():
        if "error" not in result:
            subwords = [token for token in result["tokens"] if token["is_subword"]]
            subword_ratio = (
                len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
            )
            output.append(
                f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
            )

    return "\n".join(output)

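# Bar chart of token counts per tokenizer (fewer tokens = more efficient).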
def create_efficiency_chart(results):
    if not results:
        return None

    models = []
    token_counts = []
    compression_ratios = []
    for model, result in results.items():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
            compression_ratios.append(result["compression_ratio"])

    if not models:
        return None

    fig = go.Figure()
    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig

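# Stacked bar chart of token-type counts (word/number/punctuation/...) per tokenizer.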
def create_token_distribution_chart(results):
    if not results:
        return None

    all_data = []
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            for token_type, count in type_counts.items():
                all_data.append(
                    {
                        "Tokenizer": result["model"],
                        "Token Type": token_type,
                        "Count": count,
                    }
                )

    if not all_data:
        return None

    df = pd.DataFrame(all_data)
    fig = px.bar(
        df,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
    return fig

# Custom CSS for better styling
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""

# Create the Gradio interface
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.

    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space

    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
    - Mixed languages and scripts
    - Programming code and JSON
    - Long compound words
    - Special characters and emojis
    - Technical terminology
    """)
    with gr.Row():
        with gr.Column(scale=2):
            # Sample texts dropdown
            pre_choices = [
                "Custom text (enter below)",
                """
ᴾʸᵗʰᵒⁿ
ₚᵧₜₕₒₙ
P̲y̲t̲h̲o̲n̲
P̄ȳt̄h̄ōn̄
P̅y̅t̅h̅o̅n̅
ⓅⓎⓉⒽⓄⓃ
⒫⒴⒯⒣⒪⒩
🄿🅈🅃🄷🄾🄽
ⓅⓎⓉⒽⓄⓃ
Python
Pʎʇɥou
Pyʇɥou
P̊ẙt̊h̊o̊n̊
Pëthøñ
P̶y̶t̶h̶o̶n̶
P̸y̸t̸h̸o̸n̸
P̷y̷t̷h̷o̷n̷
P̴y̴t̴h̴o̴n̴
𝒫𝓎𝓉𝒽𝑜𝓃
ℙ𝕪𝕥𝕙𝕠𝕟
                """,
                "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
                "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
                "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
                "turkish: Hızlı kahverengi tilki tembel köpeğin üstünden atlar. 1234.56'dır ve 789$ tutar.",
                "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
                "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
                "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
                "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
                "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
                "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
                "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
                "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
                "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
            ]

            sample_texts = gr.Dropdown(
                choices=pre_choices,
                value="Custom text (enter below)",
                label="Choose a sample text or enter your own",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value=pre_choices[1],
            )
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Models"):
                    model_selector = gr.CheckboxGroup(
                        choices=available_tokenizers,
                        value=pre_selected_tokenizers,
                        label="Select tokenizers to compare...",
                    )
                    show_details = gr.Checkbox(
                        label="Show detailed analysis", value=False
                    )
                with gr.TabItem("Normalization"):
                    normalization_method = gr.Dropdown(
                        choices=[method[0] for method in get_normalization_methods()],
                        value="none",
                        label="Normalization Method",
                    )
                    show_normalization = gr.Checkbox(
                        label="Show normalized results", value=False
                    )
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )

    with gr.Row():
        with gr.Column():
            normalized_display = gr.HTML(
                label="Normalized Tokenization",
                value="<p>Enable normalization to see results...</p>",
                visible=False,
            )

    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
    # Function to update text input when sample is selected
    def update_text_from_sample(sample_choice):
        if sample_choice == "Custom text (enter below)":
            return gr.update()  # Don't change the text input
        else:
            # Extract the text after the colon
            sample_text = (
                sample_choice.split(": ", 1)[1]
                if ": " in sample_choice
                else sample_choice
            )
            return gr.update(value=sample_text)

    # Update text input when sample is selected
    sample_texts.change(
        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
    )
    # Main comparison function
    def update_comparison_with_norm(text, models, details, norm_method, show_norm):
        if norm_method == "none" or not show_norm:
            # Original behavior
            (
                efficiency,
                tokenization_html,
                token_ids,
                detailed,
                eff_chart,
                dist_chart,
            ) = compare_tokenizers(text, models, details)
            return (
                efficiency,
                tokenization_html,
                token_ids,
                detailed,
                eff_chart,
                dist_chart,
            )
        else:
            # With normalization
            original_results, normalized_results, normalized_text = (
                compare_with_normalization(text, models, norm_method, details)
            )
            # Generate displays for both
            orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
            norm_eff, norm_html, norm_ids = generate_basic_comparison(
                normalized_results
            )
            # Combine or show separately
            combined_html = (
                f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n"
                f"<h2>Original</h2>{orig_html}"
            )
            return (
                orig_eff,
                gr.update(value=combined_html, visible=True),
                orig_ids,
                "",
                None,
                None,
            )
    def update_comparison(text, models, details):
        efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
            compare_tokenizers(text, models, details)
        )
        return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart

    # Auto-update on changes
    for component in [
        text_input,
        model_selector,
        show_details,
        normalization_method,
        show_normalization,
    ]:
        component.change(
            fn=update_comparison_with_norm,
            inputs=[
                text_input,
                model_selector,
                show_details,
                normalization_method,
                show_normalization,
            ],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )
    gr.Markdown("""
    ---
    ### About the Models

    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
    - **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE
    - **ByT5**: Google's byte-level model

    ### Features

    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)

if __name__ == "__main__":
    demo.launch()