Linker1907's picture
add mascot and starred bench from lighteval metadata
2e5b82a
raw
history blame
21 kB
"""
Gradio dashboard to explore Lighteval tasks.
Loads tasks from the lighteval Registry and displays them in a searchable,
filterable interface.
"""
import html
import re
from collections import Counter
from dataclasses import dataclass, field

import gradio as gr
from lighteval.tasks.registry import Registry
# Build the registry once at import time, with no custom tasks and with
# multilingual tasks enabled.
registry = Registry(custom_tasks=None, load_multilingual=True)
# Raw dump that index_tasks() iterates over; each entry is read there through
# its "module", "docstring" and "tasks" keys.
modules_data = registry.get_tasks_dump()
@dataclass
class TaskDoc:
    """Metadata for one task module, as indexed and rendered on a card."""

    # Dotted module path; used for the source link and as a fallback title.
    module: str
    # Free-text description shown (truncated) on the card.
    abstract: str
    # Lowercased language identifiers used by the language filter.
    languages: list[str]
    # Lowercased tags used by the tag filter.
    tags: list[str]
    # Paper URL, or None when the metadata has none.
    paper: str | None
    # Comma-separated dataset identifiers, or None when there are none.
    dataset: str | None
    # Optional display name; the module name is used when it is None.
    name: str | None = None
    # Names of the individual tasks listed on the card.
    task_names: list[str] = field(default_factory=list)
    # Starred tasks are sorted first and shown with a star icon.
    starred: bool = False
def _module_to_github_path(module: str) -> str:
"""Convert module path to GitHub source URL path."""
if module.startswith("lighteval."):
mod_path_parts = module[len("lighteval."):].split(".")
return "src/lighteval/" + "/".join(mod_path_parts) + ".py"
return "src/lighteval/" + module.replace(".", "/") + ".py"
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Build the task index from the registry dump in ``modules_data``.

    Metadata fields that are missing *or* explicitly ``None`` are treated as
    empty, so a sparse entry yields a sparse ``TaskDoc`` instead of raising
    while the module is being imported.

    Returns:
        A ``(docs, languages, tags)`` tuple where ``docs`` holds one
        ``TaskDoc`` per dump entry, ``languages`` lists every language ordered
        by descending number of entries (ties broken alphabetically), and
        ``tags`` lists every tag in alphabetical order.
    """
    docs: list[TaskDoc] = []
    language_counts: Counter = Counter()
    tag_set: set[str] = set()

    for entry in modules_data:
        # ``dict.get(key, default)`` only falls back when the key is absent;
        # ``or`` additionally covers keys that are present with a None value.
        docstring = entry.get("docstring") or {}
        module = entry.get("module") or ""

        abstract = (docstring.get("abstract") or "").strip()
        langs = [lang.lower() for lang in docstring.get("languages") or []]
        tgs = [t.lower() for t in docstring.get("tags") or []]
        paper = (docstring.get("paper") or "").strip() or None
        name = (docstring.get("name") or "").strip() or None
        starred = bool(docstring.get("starred", False))

        # The dataset list is flattened to one comma-separated string.
        dataset_list = docstring.get("dataset") or []
        dataset = ", ".join(dataset_list) if dataset_list else None

        # Keep only tasks that carry a non-empty name.
        tasks_list = entry.get("tasks") or []
        task_names = [task.get("name", "") for task in tasks_list if task.get("name")]

        language_counts.update(langs)
        tag_set.update(tgs)

        docs.append(
            TaskDoc(
                module=module,
                abstract=abstract,
                languages=langs,
                tags=tgs,
                paper=paper,
                dataset=dataset,
                name=name,
                task_names=task_names,
                starred=starred,
            )
        )

    languages_sorted = [
        lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
    ]
    tags_sorted = sorted(tag_set)
    return docs, languages_sorted, tags_sorted
# The index is built once at import time and shared by every callback.
ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
# Languages offered before "Show all languages" is ticked: the first eight
# entries of ALL_LANGS.
TOP_LANGS = ALL_LANGS[:8]
def normalize_name_for_matching(name: str) -> str:
    """Return a comparison key for *name*.

    The name is lowercased and every run of underscores, whitespace or colons
    is removed, so spellings that differ only in those characters compare
    equal.
    """
    lowered = name.lower()
    return re.sub(r"[_\s:]+", "", lowered)
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Return the tasks from ``ALL_TASKS`` that match every active filter.

    Args:
        languages: Selected languages; a task matches when it has at least one
            of them. Empty or ``None`` disables the language filter.
        tags: Selected tags; a task matches when it has at least one of them.
            Empty or ``None`` disables the tag filter.
        search: Case-insensitive substring looked up in the module path,
            display name, abstract, tags, dataset and task names. Blank or
            ``None`` disables the search filter.

    Returns:
        Matching tasks, starred ones first, then ordered by display name
        (module path when no name is set), case-insensitively.
    """
    selected_langs = {lang.lower() for lang in (languages or [])}
    selected_tags = {t.lower() for t in (tags or [])}
    search_lc = (search or "").strip().lower()

    def matches(td: TaskDoc) -> bool:
        if selected_langs and selected_langs.isdisjoint(td.languages):
            return False
        if selected_tags and selected_tags.isdisjoint(td.tags):
            return False
        if not search_lc:
            return True
        # The display name and task names are shown on the card, so they are
        # searchable too; otherwise text visible on a card could yield no hit.
        haystack = " ".join(
            [
                td.module,
                td.name or "",
                td.abstract,
                ", ".join(td.tags),
                td.dataset or "",
                " ".join(td.task_names),
            ]
        ).lower()
        return search_lc in haystack

    out = [td for td in ALL_TASKS if matches(td)]
    out.sort(key=lambda td: (not td.starred, (td.name or td.module).lower()))
    return out
def truncate_text(text: str, max_length: int = 250) -> str:
    """Shorten *text* to at most *max_length* characters plus an ellipsis.

    Text that already fits is returned unchanged. Otherwise the cut is moved
    back to the last space inside the window, but only when that space lies
    beyond 70% of *max_length*; if it does not, the text is cut mid-word.
    """
    if len(text) <= max_length:
        return text

    head = text[:max_length]
    cut = head.rfind(" ")
    # Backing up further than 30% of the window would waste too much text.
    if cut > max_length * 0.7:
        return head[:cut] + "..."
    return head + "..."
def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Collapse task names that share a prefix (the part before the first colon).

    A prefix used by more than one task is shown once, at the position of its
    first occurrence. A prefix used by a single task keeps the full task name,
    and names without a colon are kept as-is. Each resulting label appears at
    most once, so a standalone task whose name equals a collapsed prefix is
    not listed twice.

    Args:
        task_names: Task names in display order.

    Returns:
        The de-duplicated labels, in order of first appearance.
    """
    prefix_counts = Counter(
        name.split(":", 1)[0] for name in task_names if ":" in name
    )

    grouped: list[str] = []
    seen: set[str] = set()
    for name in task_names:
        label = name
        if ":" in name:
            prefix = name.split(":", 1)[0]
            if prefix_counts[prefix] > 1:
                # Several tasks share this prefix: show only the prefix.
                label = prefix
        if label not in seen:
            seen.add(label)
            grouped.append(label)
    return grouped
def _render_chips(values: list[str], span_class: str, title_prefix: str, wrapper_class: str) -> str:
    """Render one row of chips, or an empty string when there are no values."""
    if not values:
        return ""
    spans = " ".join(
        f'<span class="{span_class}" title="{title_prefix}: {html.escape(value)}">{html.escape(value)}</span>'
        for value in values
    )
    return f'<div class="chips {wrapper_class}">{spans}</div>'


def _render_dataset_links(dataset: str | None, limit: int = 6) -> str:
    """Render up to *limit* dataset links followed by a "+N more" marker."""
    if not dataset:
        return ""
    datasets = [d.strip() for d in dataset.split(",") if d.strip()]
    links = []
    for ds in datasets[:limit]:
        safe_ds = html.escape(ds)
        links.append(
            f'<a class="dataset" href="https://huggingface.co/datasets/{safe_ds}" target="_blank" rel="noopener">{safe_ds}</a>'
        )
    if len(datasets) > limit:
        links.append(f'<span class="dataset-more">+{len(datasets) - limit} more</span>')
    return " ".join(links)


def _render_task_names(task_names: list[str], visible: int = 3) -> str:
    """Render the "Run using lighteval" section; names past *visible* go in a <details>."""
    if not task_names:
        return ""
    # Names sharing a prefix are collapsed before being displayed.
    spans = [
        f'<span class="task-name">{html.escape(name)}</span>'
        for name in group_task_names_by_prefix(task_names)
    ]
    visible_html = " ".join(spans[:visible])
    hidden = spans[visible:]
    if not hidden:
        return (
            '<div class="task-names"><div class="task-names-label">Run using lighteval:</div>'
            f'<div class="task-names-list">{visible_html}</div></div>'
        )
    remaining_html = " ".join(hidden)
    return (
        "\n"
        '<div class="task-names">\n'
        '<div class="task-names-label">Run using lighteval:</div>\n'
        f'<div class="task-names-list">{visible_html}</div>\n'
        '<details class="task-names-details">\n'
        f'<summary class="task-names-summary">Show {len(hidden)} more</summary>\n'
        f'<div class="task-names-list task-names-remaining">{remaining_html}</div>\n'
        "</details>\n"
        "</div>\n"
    )


def render_cards(tasks: list[TaskDoc]) -> str:
    """Render *tasks* as an HTML grid of cards.

    Every metadata value is HTML-escaped before being interpolated, so text
    containing ``<``, ``&`` or quotes cannot break the markup or attributes.

    Args:
        tasks: Tasks to render, in display order.

    Returns:
        One ``<div class="cards-grid">`` element containing an ``<article>``
        per task.
    """
    items: list[str] = []
    for t in tasks:
        # Title: explicit name if set, else the last module component (or the
        # one before it when the module file is called "main").
        module_parts = t.module.split(".")
        base_no_ext = module_parts[-1]
        fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
        task_name = html.escape((t.name or fallback_name).replace("_", " ").title())

        mod_path = html.escape(_module_to_github_path(t.module))
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = (
            f'<a href="{html.escape(t.paper)}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        )
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"

        chips_tags_html = _render_chips(t.tags, "chip", "tag", "chips-tags")
        chips_langs_html = _render_chips(t.languages, "chip chip-lang", "language", "chips-langs")

        # Truncate first, then escape, then add <br/> so the inserted tags
        # are never escaped or cut in half.
        abstract_text = truncate_text(t.abstract or "-")
        abstract_html = html.escape(abstract_text).replace("\n", "<br/>")

        dataset_html = _render_dataset_links(t.dataset)
        star_icon = "⭐ " if t.starred else ""
        task_names_html = _render_task_names(t.task_names)

        items.append(
            "\n"
            f'<article class="card" tabindex="0" aria-label="Task {task_name}">\n'
            f'<div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>\n'
            f"{chips_tags_html}\n"
            f"{chips_langs_html}\n"
            f'<div class="abstract">{abstract_html}</div>\n'
            f"{task_names_html}\n"
            f'<div class="links">{links_html}</div>\n'
            "</article>\n"
        )
    return "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"
def on_filter(languages: list[str], tags: list[str], search: str):
    """Re-run the filters and return ``(counter markdown, cards HTML)``."""
    matching = filter_tasks(languages, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return counter_text, render_cards(matching)
def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Switch the language checkboxes between the top languages and all of them.

    Selected languages that are not among the new choices are dropped, and
    the results are re-filtered with the remaining selection. Returns the
    checkbox-group update, the counter markdown and the cards HTML.
    """
    if show_all:
        choices = ALL_LANGS
    else:
        choices = TOP_LANGS

    kept = []
    for lang in selected_langs or []:
        if lang in choices:
            kept.append(lang)

    matching = filter_tasks(kept, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return gr.update(choices=choices, value=kept), counter_text, render_cards(matching)
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkboxes without discarding the current selection.

    The selected tags keep filtering the results whether or not the group is
    visible. Returns the checkbox-group update, the counter markdown and the
    cards HTML.
    """
    tags_value: list[str] = selected_tags if selected_tags else []
    matching = filter_tasks(languages, tags_value, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return gr.update(visible=show, value=tags_value), counter_text, render_cards(matching)
# Stylesheet passed to gr.Blocks(css=...). It styles the classes emitted by
# render_cards (cards grid, card, chips, task names, dataset links) and defines
# light/dark colour variables switched by the prefers-color-scheme media query.
custom_css = """
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 20px; margin-top: 10px; }
/* card base */
.card {
border-radius: 16px;
padding: 18px;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
outline: none;
position: relative;
overflow: hidden;
border: 2px solid transparent;
}
.card::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(
90deg,
transparent,
rgba(255, 255, 255, 0.1),
transparent
);
transition: left 0.5s;
}
.card:hover::before {
left: 100%;
}
.card:hover, .card:focus {
transform: translateY(-6px) scale(1.02);
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.12), 0 8px 16px rgba(0, 0, 0, 0.08);
}
.title {
display: flex;
align-items: center;
gap: 8px;
flex-wrap: wrap;
position: relative;
z-index: 1;
}
.title-text {
font-weight: 700;
font-size: 17px;
font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial;
letter-spacing: -0.01em;
}
.dataset-inline {
font-size: 12px;
display: flex;
flex-wrap: wrap;
gap: 6px;
align-items: center;
margin-left: 8px;
}
.chips {
margin: 8px 0 6px 0;
display: flex;
gap: 4px;
flex-wrap: wrap;
}
.chips-tags { margin: 8px 0 4px 0; }
.chips-langs { margin: 4px 0 6px 0; }
.chip {
display: inline-block;
padding: 4px 10px;
border-radius: 12px;
font-size: 11px;
font-weight: 500;
background: linear-gradient(135deg, #e6f2ff 0%, #d6e9ff 100%);
color: #1e3a8a;
transition: all 0.2s ease;
border: 1px solid rgba(30, 58, 138, 0.1);
}
.chip:hover {
transform: translateY(-1px);
box-shadow: 0 2px 8px rgba(30, 58, 138, 0.2);
}
.chip-lang {
background: linear-gradient(135deg, #e8f5e9 0%, #d4edda 100%);
color: #166534;
border-color: rgba(22, 101, 52, 0.1);
}
.chip-lang:hover {
box-shadow: 0 2px 8px rgba(22, 101, 52, 0.2);
}
.abstract {
color: #475569;
font-size: 13.5px;
line-height: 1.6;
margin-top: 8px;
min-height: 48px;
}
.task-names {
margin-top: 10px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-label {
font-size: 11px;
font-weight: 600;
color: #64748b;
margin-bottom: 6px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.task-names-list {
display: flex;
flex-wrap: wrap;
gap: 6px;
}
.task-names-remaining {
margin-top: 8px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-details {
margin-top: 8px;
}
.task-names-summary {
font-size: 11px;
font-weight: 600;
color: #64748b;
cursor: pointer;
user-select: none;
padding: 4px 8px;
border-radius: 4px;
display: inline-block;
transition: all 0.2s ease;
background: rgba(148, 163, 184, 0.1);
}
.task-names-summary:hover {
background: rgba(148, 163, 184, 0.2);
color: #475569;
}
.task-names-summary::-webkit-details-marker {
display: none;
}
.task-names-details[open] .task-names-summary {
margin-bottom: 8px;
}
.task-name {
display: inline-block;
padding: 3px 8px;
border-radius: 6px;
font-size: 11px;
font-weight: 500;
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
color: #92400e;
border: 1px solid rgba(146, 64, 14, 0.2);
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
transition: all 0.2s ease;
}
.task-name:hover {
transform: translateY(-1px);
box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
}
.links {
margin-top: 12px;
font-size: 12px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.2);
}
.links a {
text-decoration: none;
font-weight: 600;
transition: all 0.2s ease;
position: relative;
}
.links a::after {
content: '';
position: absolute;
width: 0;
height: 2px;
bottom: -2px;
left: 0;
background: currentColor;
transition: width 0.3s ease;
}
.links a:hover::after {
width: 100%;
}
.links a:hover {
transform: translateX(2px);
}
.sep { color: #94a3b8; margin: 0 8px; }
.dataset {
display: inline-block;
font-size: 12px;
color: #0ea5e9;
background: linear-gradient(135deg, #ecfeff 0%, #e0f7fa 100%);
padding: 4px 10px;
border-radius: 8px;
text-decoration: none;
transition: all 0.2s ease;
border: 1px solid rgba(14, 165, 233, 0.2);
font-weight: 500;
white-space: nowrap;
}
.dataset:hover {
transform: translateY(-1px);
box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
background: linear-gradient(135deg, #e0f7fa 0%, #d1f2eb 100%);
}
.dataset-more {
display: inline-block;
font-size: 12px;
color: #64748b;
background: linear-gradient(135deg, #f1f5f9 0%, #e2e8f0 100%);
padding: 4px 10px;
border-radius: 8px;
font-weight: 500;
white-space: nowrap;
}
/* Light mode */
:root {
--bg-start: #f8fafc;
--bg-end: #f1f5f9;
--card-bg: #ffffff;
--card-border: rgba(226, 232, 240, 0.8);
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg-start: #0b1220;
--bg-end: #0f172a;
--card-bg: #071022;
--card-border: rgba(15, 42, 68, 0.8);
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
.dataset-more {
color: #94a3b8;
background: linear-gradient(135deg, rgba(148, 163, 184, 0.15) 0%, rgba(148, 163, 184, 0.1) 100%);
}
.chips-tags .chip {
background: linear-gradient(135deg, rgba(29, 78, 216, 0.35) 0%, rgba(29, 78, 216, 0.25) 100%);
color: #e6eef8;
border: 1px solid rgba(148, 163, 184, 0.15);
}
.chips-langs .chip {
background: linear-gradient(135deg, rgba(22, 101, 52, 0.35) 0%, rgba(22, 101, 52, 0.25) 100%);
color: #e6eef8;
border: 1px solid rgba(148, 163, 184, 0.15);
}
.links {
border-top-color: rgba(148, 163, 184, 0.3);
}
.task-names {
border-top-color: rgba(148, 163, 184, 0.25);
}
.task-names-label {
color: #94a3b8;
}
.task-name {
background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
color: #fbbf24;
border-color: rgba(146, 64, 14, 0.3);
}
.task-name:hover {
background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
}
.task-names-summary {
background: rgba(148, 163, 184, 0.15);
color: #94a3b8;
}
.task-names-summary:hover {
background: rgba(148, 163, 184, 0.25);
color: #cbd5e1;
}
.task-names-remaining {
border-top-color: rgba(148, 163, 184, 0.25);
}
}
/* apply */
body {
background: linear-gradient(135deg, var(--bg-start) 0%, var(--bg-end) 100%);
background-attachment: fixed;
color: var(--text-color);
min-height: 100vh;
}
.card {
background: var(--card-bg);
border: 2px solid var(--card-border);
color: var(--text-color);
backdrop-filter: blur(10px);
}
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid {
gap: 12px;
grid-template-columns: 1fr;
}
.title-text { font-size: 16px; }
.card { padding: 14px; }
}
"""
# UI definition: a header row with the task counter, then a filter column
# (search, languages, tags) next to the cards column.
# NOTE(review): nesting below was reconstructed from a copy whose indentation
# was lost — confirm that the tip and image belong at filter-column level.
with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
<h2 style="margin:6px 0 2px 0;"><a href="https://github.com/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
            )
            # Updated by every callback with "N tasks" or "Showing X of N tasks".
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="")
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                # Starts with the top languages only; the checkbox above swaps in ALL_LANGS.
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                # Hidden until "Show tag checkboxes" is ticked.
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            gr.Markdown("Tip: use the filters and search together. Results update live.")
            gr.Image(
                value="measuring_model_size.png",
                label="",
                show_label=False,
                container=False,
                show_download_button=False
            )
        with gr.Column(scale=5):
            cards = gr.HTML()
    # Placeholder content; replaced below once the initial cards are rendered.
    cards.value = "<div style='padding:18px'>Loading tasks…</div>"
    # The two toggles also re-render results because they pass through the current filter state.
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
    # Any change to the search box or either checkbox group re-filters the cards.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    # Initial view: every task, rendered with no filters applied.
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()