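# Tokenizer comparison Space: a Gradio app that tokenizes input text with several
# tokenizers (tiktoken, Hugging Face, ByT5, and Tekken via helpers in utils) and
# renders efficiency rankings, interactive token highlighting, token IDs, and charts.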
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import (
    clean_token_display,
    get_normalization_methods,
    normalize_text,
    tokenize_w_tekken,
    tokenize_with_byt5,
    tokenize_with_hf,
    tokenize_with_tiktoken,
)

TIKTOKENS = ["gpt-4o", "gpt-2"]
HF = [
    "llama-3", "gemma-2", "qwen3", "mbert", "phi-3", "xglm", "bloom",
    "aya-expanse", "comma", "tokenmonster", "byt5",
]
available_tokenizers = TIKTOKENS + HF + ["tekken"]
# Tokenizers selected when the app loads; e.g. ["xglm"] or available_tokenizers
# to pre-select everything. Currently nothing is pre-selected.
pre_selected_tokenizers = []

OUT_FILE = Path("paper-outs.txt")
if not OUT_FILE.exists():
    OUT_FILE.touch()

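# Dispatch to the right backend and append each run to OUT_FILE. Downstream code
# expects each result dict to contain at least: "model", "encoding", "vocab_size",
# "token_count", "compression_ratio", and "tokens" (each token with "text", "id",
# "type", "is_subword"); the exact shape is defined by the helpers in utils.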
def tokenize(model, text):
    if model in ["gpt-4", "gpt-2", "gpt-4o"]:
        toks = tokenize_with_tiktoken(text, model)
    elif model in ["tekken"]:
        toks = tokenize_w_tekken(text, model)
    elif "byt5" in model:
        toks = tokenize_with_byt5(text, model)
    else:
        toks = tokenize_with_hf(text, model)

    # Log the run; UTF-8 encoding so non-ASCII tokens survive.
    with open(OUT_FILE, "a", encoding="utf-8") as file:
        file.write(toks["model"] + "\n")
        file.write(f"Text: {text}\n")
        s = ",".join(str(t["text"]) for t in toks["tokens"]) + "\n"
        # s = s.encode("utf-8")
        # s = s.encode('latin1').decode('utf-8')
        file.write(s)
        file.write("\n")
    return toks

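# Top-level comparison: returns (efficiency ranking markdown, interactive HTML,
# token-ID markdown, optional detailed analysis, efficiency chart, distribution
# chart), in the order expected by the Gradio outputs wired up below.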
def compare_tokenizers(text, selected_models, show_details=False):
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    results = {}
    for model in selected_models:
        results[model] = tokenize(model, text)

    # Generate outputs
    efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
        results
    )
    detailed_output = generate_detailed_analysis(results) if show_details else ""
    efficiency_chart = create_efficiency_chart(results)
    token_distribution_chart = create_token_distribution_chart(results)

    return (
        efficiency_output,
        tokenization_html,
        token_ids_output,
        detailed_output,
        efficiency_chart,
        token_distribution_chart,
    )

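# Build the three basic views from a {model: result} mapping: ranking markdown,
# interactive tokenization HTML, and token-ID tables.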
def generate_basic_comparison(results):
    if not results:
        return "No results to display.", "", ""

    # Efficiency ranking
    sorted_models = sorted(results.items(), key=lambda x: x[1]["token_count"])
    ranking_output = []
    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
    for i, (model, result) in enumerate(sorted_models):
        if "error" in result:
            ranking_output.append(
                f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Generate interactive tokenization display
    tokenization_html = generate_interactive_tokenization(results)
    # Generate token ID tables
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display

def generate_interactive_tokenization(results):
    """Generate HTML with working hover highlighting across tokenizers"""
    # TODO: main visualization
    if not results:
        return "<p>No tokenization results to display.</p>"

    html_parts = []
    # Add styles first
    html_parts.append("""
    <div id="tokenizer-container" class="tokenizer-container">
    <style>
        .tokenizer-container {
            display: flex;
            flex-wrap: wrap;
            justify-content: space-between;
            gap: 20px;
        }
        .tokenizer-section {
            margin-bottom: 20px;
            border: 1px solid #e0e0e0;
            border-radius: 8px;
            padding: 15px;
            background: white;
            flex-wrap: wrap;
            display: inline-block;
            justify-content: space-between;
        }
        .tokenizer-header {
            font-weight: bold;
            font-size: 18px;
            margin-bottom: 10px;
            color: #2c3e50;
        }
        .token-display {
            font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
            line-height: 1.8;
            word-wrap: break-word;
        }
        .token {
            display: inline-block;
            margin: 2px;
            padding: 4px 8px;
            border-radius: 4px;
            border: 1px solid;
            cursor: pointer;
            transition: all 0.2s ease;
            position: relative;
            font-size: 14px;
            user-select: none;
        }
        .token:hover {
            transform: scale(1.05);
            z-index: 10;
            box-shadow: 0 2px 8px rgba(0,0,0,0.2);
        }
        .token.highlighted {
            background: #ff6b6b !important;
            border-color: #e55353 !important;
            color: white !important;
            box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
            transform: scale(1.1) !important;
            z-index: 100 !important;
        }
        .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
        .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
        .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
        .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
        .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
        .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
        .token-subword {
            background: #fff8e1 !important;
            border-color: #ffc107 !important;
            border-style: dashed !important;
        }
        .token-stats {
            display: inline-block;
            margin-left: 10px;
            padding: 2px 6px;
            background: #f8f9fa;
            border-radius: 3px;
            font-size: 12px;
            color: #666;
        }
        .highlight-info {
            position: fixed;
            top: 10px;
            right: 10px;
            background: #333;
            color: white;
            padding: 8px 12px;
            border-radius: 4px;
            font-size: 12px;
            display: none; /* hidden until JS shows it */
            z-index: 1000;
        }
        /* Multi-token span styles */
        .token-span-container {
            display: inline-flex;
            margin: 2px;
        }
        .token-multi-span {
            background: linear-gradient(45deg, #e8f5e8 25%, #f3e5f5 25%, #f3e5f5 50%, #e8f5e8 50%, #e8f5e8 75%, #f3e5f5 75%);
            background-size: 8px 8px;
        }
        .token-span-part {
            margin: 0 !important;
            border-radius: 0 !important;
            border-right: none !important;
            position: relative;
            min-width: 20px;
            text-align: center;
            font-size: 11px;
        }
        /* Hover effect for multi-token spans */
        .token-span-container:hover .token-span-part {
            transform: scale(1.02);
            box-shadow: 0 2px 8px rgba(0,0,0,0.15);
        }
        /* Different visual for multi-token spans */
        .token-multi-span.token-word {
            background: repeating-linear-gradient(45deg, #e8f5e8, #e8f5e8 4px, #d4edda 4px, #d4edda 8px);
        }
        .token-multi-span.token-number {
            background: repeating-linear-gradient(45deg, #f3e5f5, #f3e5f5 4px, #e1bee7 4px, #e1bee7 8px);
        }
        .token-multi-span.token-punctuation {
            background: repeating-linear-gradient(45deg, #ffebee, #ffebee 4px, #ffcdd2 4px, #ffcdd2 8px);
        }
        /* Multi-token span styles (later rules refine the ones above) */
        .token-span-container {
            display: inline-flex;
            margin: 2px;
            cursor: pointer;
        }
        .token-multi-span {
            /* Distinctive background pattern for multi-token spans */
            background: repeating-linear-gradient(
                45deg,
                transparent,
                transparent 2px,
                rgba(0,0,0,0.1) 2px,
                rgba(0,0,0,0.1) 4px
            );
        }
        .token-span-part {
            margin: 0 !important;
            border-radius: 0 !important;
            border-right: none !important;
            position: relative;
            padding: 4px 6px;
            border: 1px dashed rgba(0,0,0,0.3) !important;
            pointer-events: none; /* Prevent individual box clicks */
        }
        .token-span-first {
            border-radius: 4px 0 0 4px !important;
        }
        .token-span-last {
            border-radius: 0 4px 4px 0 !important;
            border-right: 1px solid !important;
        }
        /* Connecting lines between boxes */
        .token-span-part:not(.token-span-last)::after {
            content: '';
            position: absolute;
            top: 0;
            right: -1px;
            width: 1px;
            height: 100%;
            background: rgba(0,0,0,0.3);
            z-index: 1;
        }
        /* Hover effect for entire multi-token span */
        .token-span-container:hover .token-span-part {
            transform: scale(1.05);
            box-shadow: 0 2px 8px rgba(0,0,0,0.2);
        }
        .token-span-container.highlighted .token-span-part {
            background: #ff6b6b !important;
            border-color: #e55353 !important;
            color: white !important;
            box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
            transform: scale(1.1) !important;
            z-index: 100 !important;
        }
        /* Different patterns for different token types when multi-span */
        .token-multi-span.token-word .token-span-part {
            background: #e8f5e8;
            border-color: #4caf50;
            color: #2e7d32;
        }
        .token-multi-span.token-number .token-span-part {
            background: #f3e5f5;
            border-color: #9c27b0;
            color: #7b1fa2;
        }
        .token-multi-span.token-punctuation .token-span-part {
            background: #ffebee;
            border-color: #f44336;
            color: #c62828;
        }
    </style>
    <div class="highlight-info" id="highlight-info"></div>
    <script>
    function clearHighlights() {
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });
        const info = document.getElementById('highlight-info');
        if (info) {
            info.style.display = 'none';
        }
    }
    function highlightTokens(targetText) {
        // Clear all highlights
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });
        // Highlight matching tokens and spans
        let count = 0;
        // Single tokens
        document.querySelectorAll('.token').forEach(function(token) {
            if (token.getAttribute('data-text') === targetText) {
                token.classList.add('highlighted');
                count++;
            }
        });
        // Multi-token spans
        document.querySelectorAll('.token-span-container').forEach(function(span) {
            if (span.getAttribute('data-text') === targetText) {
                span.classList.add('highlighted');
                count++;
            }
        });
        // Show info
        const info = document.getElementById('highlight-info');
        if (info) {
            const displayText = targetText === ' ' ? '(space)' : targetText;
            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
            info.style.display = 'block';
        }
    }
    </script>
    """)
    # Generate tokenizer sections with inline event handlers
    for model, result in results.items():
        if "error" in result:
            html_parts.append(f"""
            <div class="tokenizer-section">
                <div class="tokenizer-header">{result["model"]} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
            </div>
            """)
            continue

        html_parts.append(f"""
        <div class="tokenizer-section">
            <div class="tokenizer-header">
                {result["model"]}
                <span class="token-stats">
                    {result["token_count"]} tokens |
                    {result["encoding"]} |
                    {result["compression_ratio"]:.2f}x compression
                </span>
            </div>
            <div class="token-display">
        """)
        # Add tokens with inline event handlers
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            token_text = clean_token_display(token_text)
            display_text = token_text if token_text.strip() else "·"
            if token_text == "<newline>":
                html_parts.append("<br>")
                continue

            # Check if this token spans multiple token IDs
            token_ids = token["id"] if isinstance(token["id"], list) else [token["id"]]
            is_multi_token = len(token_ids) > 1

            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1

            # Create unique identifier for this token occurrence
            token_id = f"token_{model}_{i}"

            # Escape text for HTML and JavaScript - be very careful with quotes
            escaped_text = (
                token_text.replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace('"', '\\"')
                .replace("\r", "\\r")
                .replace("\n", "\\n")
            )
            escaped_display = (
                display_text.replace('"', "&quot;")
                .replace("'", "&#39;")
                .replace("\r", "\n")
            )
            if is_multi_token:
                # Create a container for the multi-token span
                span_id = f"span_{model}_{i}"
                token_ids_str = ",".join(map(str, token_ids))
                html_parts.append(f"""<span class="token-span-container"
                    id="{span_id}_container"
                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                    data-ids="{token_ids_str}"
                    data-position="{i}"
                    data-model="{model}"
                    onmouseover="highlightTokens('{escaped_text}')"
                    onmouseout="clearHighlights()"
                    onclick="alert('Token: \\'{escaped_text}\\'\\nIDs: [{token_ids_str}]\\nModel: {model}\\nSpans {len(token_ids)} token IDs')"
                    title="Text: '{token_text}' | IDs: [{token_ids_str}] | Type: {token["type"]} | Subword: {token["is_subword"]}">""")

                # Create individual boxes for each token ID - but they act as one unit
                for j, tid in enumerate(token_ids):
                    token_id = f"token_{model}_{i}_{j}"
                    box_class = f"{token_class} token-span-part"
                    box_content = ""
                    # Add position indicators for styling
                    if j == 0:
                        box_class += " token-span-first"
                        box_content = escaped_display
                    elif j == len(token_ids) - 1:
                        box_class += " token-span-last"
                    else:
                        box_class += " token-span-middle"

                    # Each box shows the same text (the combined character/text)
                    html_parts.append(f"""<span class="{box_class}"
                        id="{token_id}"
                        data-token-id="{tid}">{box_content}</span>""")

                html_parts.append("</span>")
            else:
                # Single token - original behavior
                token_id = f"token_{model}_{i}"
                html_parts.append(f"""<span class="{token_class}"
                    id="{token_id}"
                    data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                    data-id="{token_ids[0]}"
                    data-position="{i}"
                    data-model="{model}"
                    title="Text: '{token_text}' | ID: {token_ids[0]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
                    onmouseover="highlightTokens('{escaped_text}')"
                    onmouseout="clearHighlights()"
                    onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token_ids[0]}\\nModel: {model}')">{escaped_display}</span>""")

        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{len(result["tokens"])}
                ({subword_count / len(result["tokens"]) * 100:.1f}%)
            </div>
        </div>
        """)

    html_parts.append("</div>")
    return "".join(html_parts)

def generate_token_ids_display(results):
    """Generate a clean display of token IDs for each tokenizer"""
    if not results:
        return "No token IDs to display."

    output = []
    output.append("## 🔢 Token IDs by Tokenizer")
    for model, result in results.items():
        if "error" in result:
            output.append(f"\n### {result['model']} ❌")
            output.append(f"Error: {result['error']}")
            continue

        output.append(f"\n### {result['model']}")
        output.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        # Display token IDs in a readable format
        token_ids = [str(token["id"]) for token in result["tokens"]]

        # Group IDs for better readability (10 per line)
        lines = []
        for i in range(0, len(token_ids), 10):
            line_ids = token_ids[i : i + 10]
            lines.append(" ".join(line_ids))

        output.append("```")
        output.append("\n".join(lines))
        output.append("```")

        # Add some statistics
        unique_ids = len(set(token_ids))
        output.append(
            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
        )

    return "\n".join(output)

def compare_with_normalization(
    text, selected_models, normalization_method, show_details=False
):
    """Compare tokenizers with optional normalization"""
    normalized_text = normalize_text(text, normalization_method)
    print(
        "[DEBUG] Before normalization:", text, "\nAfter normalization:", normalized_text
    )

    # Get both original and normalized results
    original_results = {}
    normalized_results = {}
    for model in selected_models:
        original_results[model] = tokenize(model, text)
        if normalization_method != "none":
            normalized_results[model] = tokenize(model, normalized_text)

    return original_results, normalized_results, normalized_text

def generate_detailed_analysis(results):
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    output = []
    output.append("## 🔍 Detailed Analysis")

    # Find common tokens
    all_token_sets = []
    for model, result in results.items():
        if "error" not in result:
            token_texts = {token["text"] for token in result["tokens"]}
            all_token_sets.append(token_texts)

    if all_token_sets:
        common_tokens = set.intersection(*all_token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            common_display = [
                f"`{token}`" if token != " " else "`·`"
                for token in list(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
            output.append(f"**{result['model']}**: {', '.join(type_display)}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for model, result in results.items():
        if "error" not in result:
            subwords = [token for token in result["tokens"] if token["is_subword"]]
            subword_ratio = (
                len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
            )
            output.append(
                f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
            )

    return "\n".join(output)

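# Bar chart of token counts per tokenizer (fewer tokens = more efficient).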
def create_efficiency_chart(results):
    if not results:
        return None

    models = []
    token_counts = []
    compression_ratios = []
    for model, result in results.items():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
            compression_ratios.append(result["compression_ratio"])

    if not models:
        return None

    fig = go.Figure()
    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig

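# Stacked bar chart of token-type counts (word/number/punctuation/...) per tokenizer.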
def create_token_distribution_chart(results):
    if not results:
        return None

    all_data = []
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            for token_type, count in type_counts.items():
                all_data.append(
                    {
                        "Tokenizer": result["model"],
                        "Token Type": token_type,
                        "Count": count,
                    }
                )

    if not all_data:
        return None

    df = pd.DataFrame(all_data)
    fig = px.bar(
        df,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
    return fig

# Custom CSS for better styling
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""

# Create the Gradio interface
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.

    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space

    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
    - Mixed languages and scripts
    - Programming code and JSON
    - Long compound words
    - Special characters and emojis
    - Technical terminology
    """)
    with gr.Row():
        with gr.Column(scale=2):
            # Sample texts dropdown
            pre_choices = [
                "Custom text (enter below)",
                """
ᴾʸᵗʰᵒⁿ
ₚᵧₜₕₒₙ
P̲y̲t̲h̲o̲n̲
P̄ȳt̄h̄ōn̄
P̅y̅t̅h̅o̅n̅
ⓅⓎⓉⒽⓄⓃ
⒫⒴⒯⒣⒪⒩
🄿🅈🅃🄷🄾🄽
ⓅⓎⓉⒽⓄⓃ
Python
Pʎʇɥou
Pyʇɥou
P̊ẙt̊h̊o̊n̊
Pëthøñ
P̶y̶t̶h̶o̶n̶
P̸y̸t̸h̸o̸n̸
P̷y̷t̷h̷o̷n̷
P̴y̴t̴h̴o̴n̴
𝒫𝓎𝓉𝒽𝑜𝓃
ℙ𝕪𝕥𝕙𝕠𝕟
                """,
                "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
                "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
                "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
                "turkish: Hızlı kahverengi tilki tembel köpeğin üstünden atlar. 1234.56'dır ve 789$ tutar.",
                "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
                "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
                "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
                "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
                "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
                "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
                "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
                "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
                "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
            ]

            sample_texts = gr.Dropdown(
                choices=pre_choices,
                value="Custom text (enter below)",
                label="Choose a sample text or enter your own",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value=pre_choices[1],
            )
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Models"):
                    model_selector = gr.CheckboxGroup(
                        choices=available_tokenizers,
                        value=pre_selected_tokenizers,
                        label="Select tokenizers to compare...",
                    )
                    show_details = gr.Checkbox(
                        label="Show detailed analysis", value=False
                    )
                with gr.TabItem("Normalization"):
                    normalization_method = gr.Dropdown(
                        choices=[method[0] for method in get_normalization_methods()],
                        value="none",
                        label="Normalization Method",
                    )
                    show_normalization = gr.Checkbox(
                        label="Show normalized results", value=False
                    )
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )

    with gr.Row():
        with gr.Column():
            normalized_display = gr.HTML(
                label="Normalized Tokenization",
                value="<p>Enable normalization to see results...</p>",
                visible=False,
            )

    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
    # Function to update text input when sample is selected
    def update_text_from_sample(sample_choice):
        if sample_choice == "Custom text (enter below)":
            return gr.update()  # Don't change the text input
        else:
            # Extract the text after the colon
            sample_text = (
                sample_choice.split(": ", 1)[1]
                if ": " in sample_choice
                else sample_choice
            )
            return gr.update(value=sample_text)

    # Update text input when sample is selected
    sample_texts.change(
        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
    )
    # Main comparison function
    def update_comparison_with_norm(text, models, details, norm_method, show_norm):
        if norm_method == "none" or not show_norm:
            # Original behavior
            (
                efficiency,
                tokenization_html,
                token_ids,
                detailed,
                eff_chart,
                dist_chart,
            ) = compare_tokenizers(text, models, details)
            return (
                efficiency,
                tokenization_html,
                token_ids,
                detailed,
                eff_chart,
                dist_chart,
            )
        else:
            # With normalization
            original_results, normalized_results, normalized_text = (
                compare_with_normalization(text, models, norm_method, details)
            )
            # Generate displays for both
            orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
            norm_eff, norm_html, norm_ids = generate_basic_comparison(
                normalized_results
            )
            # Combine or show separately
            combined_html = (
                f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n"
                f"<h2>Original</h2>{orig_html}"
            )
            return (
                orig_eff,
                gr.update(value=combined_html, visible=True),
                orig_ids,
                "",
                None,
                None,
            )
    def update_comparison(text, models, details):
        efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
            compare_tokenizers(text, models, details)
        )
        return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart

    # Auto-update on changes
    for component in [
        text_input,
        model_selector,
        show_details,
        normalization_method,
        show_normalization,
    ]:
        component.change(
            fn=update_comparison_with_norm,
            inputs=[
                text_input,
                model_selector,
                show_details,
                normalization_method,
                show_normalization,
            ],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )
    gr.Markdown("""
    ---
    ### About the Models

    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
    - **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE
    - **ByT5**: Google's byte-level model

    ### Features

    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)

if __name__ == "__main__":
    demo.launch()