Spaces: Running

Commit b615923
Parent(s): d799cb2

add principles and technical requirements mapping

Files changed:
- README.md +1 -1
- app.py +336 -140
- requirements.txt +2 -2
- results/01-ai/Yi-34B-Chat.json +87 -0
- results/Claude3Opus/result_Claude3Opus.json +87 -0
- results/Qwen/Qwen1.5-72B-Chat.json +87 -0
- results/google/gemma-2-9b.json +81 -0
- results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json +87 -0
- results/gpt-4-1106-preview/result_gpt-4-1106-preview.json +87 -0
- results/meta-llama/Llama-2-13b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-70b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-7b-chat-hf.json +87 -0
- results/mistralai/Mistral-7B-Instruct-v0.2.json +87 -0
- results/mistralai/Mixtral-8x7B-Instruct-v0.1.json +87 -0
- src/display/about.py +7 -8
- src/display/css_html_js.py +164 -0
- src/display/utils.py +1 -1
- src/envs.py +1 -3
- src/leaderboard/read_evals.py +18 -18
- src/populate.py +2 -2
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🥇
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 5.4.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED

@@ -1,14 +1,14 @@
+import functools
+from pathlib import Path
+
 import gradio as gr
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -25,23 +25,11 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-import time
-import requests
-
 
-restart = False
-while not restart:
-    try:
-        API.restart_space(repo_id=REPO_ID, token=TOKEN)
-    except requests.exceptions.ConnectionError as e:
-        print("Restart failed. Re-trying...")
-        time.sleep(30)
-        continue
-    restart = True
+EVAL_RESULTS_PATH = str(Path(__file__).resolve().parent / "results")
 
 
 try:
@@ -50,14 +38,8 @@ try:
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
-    restart_space()
-
-print(EVAL_RESULTS_PATH)
-snapshot_download(
-    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-)
-except Exception:
-    restart_space()
+    # restart_space()
+    pass
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
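With the hunk above, the Space stops downloading RESULTS_REPO at startup and instead reads the JSON files committed under results/ (added later in this diff). A minimal sketch of how those bundled files can be enumerated; the iter_result_files helper is illustrative, not code from this commit:

```python
import json
from pathlib import Path

RESULTS_DIR = Path(__file__).resolve().parent / "results"

def iter_result_files(results_dir: Path = RESULTS_DIR):
    """Yield (model_name, parsed report) for every bundled result file."""
    for path in sorted(results_dir.glob("**/*.json")):
        with path.open() as fh:
            report = json.load(fh)
        yield report["config"]["model_name"], report

if __name__ == "__main__":
    # List every model that ships with this commit.
    for name, _ in iter_result_files():
        print(name)
```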
@@ -83,6 +65,23 @@ def update_table(
     return df
 
 
+def update_principles_table(
+    df,
+    *args: list,
+) -> pd.DataFrame:
+    columns = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+    for shown_column in args:
+        if isinstance(shown_column, gr.components.CheckboxGroup):
+            columns.extend(shown_column.value)
+        else:
+            columns.extend(shown_column)
+
+    # dummy column for querying (not shown)
+    columns.append("model_name_for_query")
+    return df[columns]
+
+
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
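The update_principles_table helper added above accepts either live CheckboxGroup components (when the table is first built) or plain value lists (when Gradio invokes it as an event handler). A self-contained sketch of the same column-selection logic; NEVER_HIDDEN and select_columns are stand-ins, since AutoEvalColumn lives in src/display/utils.py and is not shown in this diff:

```python
import pandas as pd

# Stand-in for the AutoEvalColumn fields flagged never_hidden.
NEVER_HIDDEN = ["Model"]

def select_columns(df: pd.DataFrame, *shown) -> pd.DataFrame:
    columns = list(NEVER_HIDDEN)
    for group in shown:              # each arg: one CheckboxGroup's value list
        columns.extend(group)
    columns.append("model_name_for_query")  # hidden column kept for search
    return df[columns]

df = pd.DataFrame({
    "Model": ["A", "B"],
    "MMLU: Robustness": [0.72, 0.51],
    "Rule Following": [0.64, 0.58],
    "model_name_for_query": ["org/A", "org/B"],
})
print(select_columns(df, ["MMLU: Robustness"], ["Rule Following"]))
```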
@@ -139,112 +138,307 @@ def filter_models(
     return filtered_df
 
 
-
-
-    with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    search_bar = gr.Textbox(
-                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                        show_label=False,
-                        elem_id="search-bar",
-                    )
-                with gr.Row():
-                    shown_columns = gr.CheckboxGroup(
-                        choices=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden and not c.dummy
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-                with gr.Row():
-                    with gr.Column(min_width=250):
-                        # with gr.Box(elem_id="box-filter"):
-                        filter_columns_type = gr.CheckboxGroup(
-                            label="Model types",
-                            choices=[t.to_str() for t in ModelType],
-                            value=[t.to_str() for t in ModelType],
-                            interactive=True,
-                            elem_id="filter-columns-type",
-                        )
-                        # filter_columns_precision = gr.CheckboxGroup(
-                        #     label="Precision",
-                        #     choices=[i.value.name for i in Precision],
-                        #     value=[i.value.name for i in Precision],
-                        #     interactive=True,
-                        #     elem_id="filter-columns-precision",
-                        # )
-                        # filter_columns_size = gr.CheckboxGroup(
-                        #     label="Model sizes (in billions of parameters)",
-                        #     choices=list(NUMERIC_INTERVALS.keys()),
-                        #     value=list(NUMERIC_INTERVALS.keys()),
-                        #     interactive=True,
-                        #     elem_id="filter-columns-size",
-                        # )
-
-        with gr.Row():
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-        # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = gr.components.Dataframe(
-            value=original_df[COLS],
-            headers=COLS,
-            datatype=TYPES,
-            visible=False,
-        )
-        search_bar.submit(
-            update_table,
-            [
-                hidden_leaderboard_table_for_search,
-                shown_columns,
-                filter_columns_type,
-                search_bar,
-            ],
-            leaderboard_table,
-        )
-        for selector in [shown_columns, filter_columns_type]:
-            selector.change(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    # filter_columns_precision,
-                    # filter_columns_size,
-                    # deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-                queue=True,
-            )
+BENCHMARKS_PER_CATEGORY = {
+    "Robustness and Predictability": [
+        "MMLU: Robustness",
+        "BoolQ Contrast Set",
+        "IMDB Contrast Set",
+        "Monotonicity Checks",
+        "Self-Check Consistency",
+    ],
+    "Cyberattack Resilience": [
+        "Goal Hijacking and Prompt Leakage",
+        "Rule Following"
+    ],
+    "Training Data Suitability": [
+        "Toxicity of the Dataset",
+        "Bias of the Dataset"
+    ],
+    "No Copyright Infringement": [
+        "Copyrighted Material Memorization"
+    ],
+    "User Privacy Protection": [
+        "PII Extraction by Association"
+    ],
+    "Capabilities, Performance, and Limitations": [
+        "General Knowledge: MMLU",
+        "Reasoning: AI2 Reasoning Challenge",
+        "Common Sense Reasoning: HellaSwag",
+        "Truthfulness: TruthfulQA MC2",
+        "Coding: HumanEval"
+    ],
+    "Interpretability": ["Logit Calibration: BIG-Bench", "Self-Assessment: TriviaQA"],
+    "Disclosure of AI": ["Denying Human Presence"],
+    "Traceability": ["Watermark Reliability & Robustness"],
+    "Representation — Absence of Bias": ["Representation Bias: RedditBias", "Prejudiced Answers: BBQ", "Biased Completions: BOLD"],
+    "Fairness — Absence of Discrimination": ["Income Fairness: DecodingTrust", "Recommendation Consistency: FaiRLLM"],
+    "Harmful Content and Toxicity": ["Toxic Completions of Benign Text: RealToxicityPrompts", "Following Harmful Instructions: AdvBench"]
+}
+
+def _wrap_link(value: str, url: str) -> str:
+    return f"<a href={url} target='_blank'>{value}</a>"
+
+TEXT_PER_CATEGORY = {
+    "Robustness and Predictability": f"We evaluate the model on state-of-the-art benchmarks that measure its robustness under various input alterations [{_wrap_link('1', 'https://aclanthology.org/2020.findings-emnlp.117/')}], and the level of consistency in its answers [{_wrap_link('2', 'https://arxiv.org/abs/2306.09983')}, {_wrap_link('3', 'https://arxiv.org/abs/2305.15852')}].",
+    "Cyberattack Resilience": f"We consider the concrete threats concerning just the LLM in isolation, focusing on its resilience to jailbreaks and prompt injection attacks [{_wrap_link('1', 'https://arxiv.org/abs/2311.01011')}, {_wrap_link('2', 'https://arxiv.org/abs/2311.04235')}, {_wrap_link('3', 'https://arxiv.org/abs/2312.02119')}].",
+    "Training Data Suitability": "We evaluate the adequacy of the dataset [1], aiming to assess the potential of an LLM trained on this data to exhibit toxic or discriminatory behavior.",
+    "No Copyright Infringement": "We check if the model can be made to directly regurgitate content that is subject to the copyright of a third person.",
+    "User Privacy Protection": "We focus on cases of user privacy violation by the LLM itself, evaluating the model’s ability to recover personal identifiable information that may have been included in the training data.",
+    "Capabilities, Performance, and Limitations": "To provide an overarching view, we assess the capabilities and limitations of the AI system by evaluating its performance on a wide range of tasks. We evaluate the model on widespread research benchmarks covering general knowledge [1], reasoning [2,3], truthfulness [4], and coding ability [5].",
+    "Interpretability": "The large body of machine learning interpretability research is often not easily applicable to large language models. While more work in this direction is needed, we use the existing easily-applicable methods to evaluate the model’s ability to reason about its own correctness [1], and the degree to which the probabilities it outputs can be interpreted [3,4].",
+    "Disclosure of AI": "We require the language model to consistently deny that it is a human.",
+    "Traceability": "We require the presence of language model watermarking [1,2], and evaluate its viability, combining several important requirements that such schemes must satisfy to be practical.",
+    "Representation — Absence of Bias": "We evaluate the tendency of the LLM to produce biased outputs, on three popular bias benchmarks [1,2,3].",
+    "Fairness — Absence of Discrimination": "We evaluate the model’s tendency to behave in a discriminatory way by comparing its behavior on different protected groups, using prominent fairness benchmarks [1,2].",
+    "Harmful Content and Toxicity": "We evaluate the models’ tendency to produce harmful or toxic content, leveraging two recent evaluation tools, RealToxicityPrompts and AdvBench [1,2]."
+}
+
+CATEGORIES_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": ["Robustness and Predictability", "Cyberattack Resilience"],
+    "Privacy & Data Governance": ["Training Data Suitability", "No Copyright Infringement", "User Privacy Protection"],
+    "Transparency": ["Capabilities, Performance, and Limitations", "Interpretability", "Disclosure of AI", "Traceability"],
+    "Diversity, Non-discrimination & Fairness": ["Representation — Absence of Bias", "Fairness — Absence of Discrimination"],
+    "Social & Environmental Well-being": ["Harmful Content and Toxicity"]
+}
+
+ICON_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": "https://compl-ai.org/icon_technical_robustness_and_safety.svg",
+    "Privacy & Data Governance": "https://compl-ai.org/icon_privacy_and_data_governance.svg",
+    "Transparency": "https://compl-ai.org/icon_transparency.svg",
+    "Diversity, Non-discrimination & Fairness": "https://compl-ai.org/icon_diversity_fairness.svg",
+    "Social & Environmental Well-being": "https://compl-ai.org/icon_social_environmental.svg",
+}
+
+def generate_benchmarks(principle: str):
+    with gr.Row():
+        gr.HTML(f"""
+            <h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
+        """)
+
+    categories = CATEGORIES_PER_PRINCIPLE[principle]
+
+    with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
+        for category in categories:
+            with gr.Column():
+                gr.HTML(
+                    f"""
+                    <div style="padding: 10px 20px;">
+                        <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
+                        <p>{TEXT_PER_CATEGORY[category]}</p>
+                    </div>
+                    """
+                )
+
+    shown_columns = []
+    with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
+        for category in categories:
+            with gr.Column():
+                shown_column = gr.CheckboxGroup(
+                    show_label=False,
+                    choices=BENCHMARKS_PER_CATEGORY[category],
+                    value=BENCHMARKS_PER_CATEGORY[category],
+                    interactive=True,
+                    # elem_id="filter-columns-type",
+                )
+                shown_columns.append(shown_column)
+
+    with gr.Row():
+        df = update_principles_table(leaderboard_df, *shown_columns)
+        type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
+        datatypes = [type_per_column[name] for name in df.columns]
+        leaderboard_table = gr.components.Dataframe(
+            value=df,
+            headers=df.columns.tolist(),
+            datatype=datatypes,
+            elem_id="leaderboard-table",
+            interactive=False,
+            visible=True,
+        )
+
+    for shown_column in shown_columns:
+        shown_column.change(
+            fn=functools.partial(update_principles_table, leaderboard_df),
+            inputs=shown_columns,
+            outputs=leaderboard_table,
+            # queue=True,
+        )
+
+# Allows clicking on the full table column to trigger sorting
+custom_js = """
+function clickableTableHeaders() {
+    document.querySelectorAll(".table > thead > tr > th").forEach(th => {
+        th.addEventListener("click", () => {
+            const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+            if (sortButton) {
+                sortButton.click(); // Triggers the click event on the "sort-button" element
+            }
+        });
+    });
+
+    // Select all elements with the .table class
+    const tableElements = document.querySelectorAll('.table');
+
+    // Callback function to execute when mutations are observed
+    const mutationCallback = (mutationsList) => {
+        mutationsList.forEach((mutation) => {
+            if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
+                mutation.target.addEventListener("click", () => {
+                    const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+                    if (sortButton) {
+                        sortButton.click(); // Triggers the click event on the "sort-button" element
+                    }
+                });
+            }
+        });
+    };
+
+    // Options for the observer (which mutations to observe)
+    const observerOptions = {
+        childList: true, // Watch for additions/removals of child nodes
+        subtree: true // Watch for changes in descendants as well
+    };
+
+    // Create an instance of MutationObserver and pass in the callback function
+    const observer = new MutationObserver(mutationCallback);
+
+    // Observe each .table element
+    tableElements.forEach((tableElement) => {
+        observer.observe(tableElement, observerOptions);
+    });
+}
+"""
+
+demo = gr.Blocks(
+    css=custom_css,
+    theme=gr.themes.Default(
+        font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
+    ),
+    js=custom_js,
+)
+
+with demo:
+    gr.HTML(TITLE)
+
+    with gr.Row(elem_id="intro"):
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
+                <p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
+                <br/>
+                <a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
+                """
+            )
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
+                <p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
+                <br/>
+                <a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
+                """
+            )
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
+
+            for principle in CATEGORIES_PER_PRINCIPLE.keys():
+                generate_benchmarks(principle)
+
+            ###
+
+            # with gr.Row():
+            #     shown_columns = gr.CheckboxGroup(
+            #         choices=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if not c.hidden and not c.never_hidden and not c.dummy
+            #         ],
+            #         value=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if c.displayed_by_default and not c.hidden and not c.never_hidden
+            #         ],
+            #         label="Select columns to show",
+            #         elem_id="column-select",
+            #         interactive=True,
+            #     )
+            #
+            # with gr.Row():
+            #     # with gr.Box(elem_id="box-filter"):
+            #     filter_columns_type = gr.CheckboxGroup(
+            #         label="Model types",
+            #         choices=[t.to_str() for t in ModelType],
+            #         value=[t.to_str() for t in ModelType],
+            #         interactive=True,
+            #         elem_id="filter-columns-type",
+            #     )
+            #
+            # with gr.Row():
+            #     search_bar = gr.Textbox(
+            #         placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+            #         show_label=False,
+            #         elem_id="search-bar",
+            #     )
+            # # x = gr.Checkbox(show_label=False, label="foo")
+            #
+            # with gr.Row():
+            #     # print(shown_columns.value)
+            #     leaderboard_table = gr.components.Dataframe(
+            #         value=leaderboard_df[
+            #             [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+            #             + shown_columns.value
+            #         ],
+            #         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+            #         datatype=TYPES,
+            #         elem_id="leaderboard-table",
+            #         interactive=False,
+            #         visible=True,
+            #         # column_widths=["2%", "30%", "10%", "10%", "12%"]
+            #     )
+            #
+            # # Dummy leaderboard for handling the case when the user uses backspace key
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=original_df[COLS],
+            #     headers=COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         filter_columns_type,
+            #         # filter_columns_precision,
+            #         # filter_columns_size,
+            #         search_bar,
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [shown_columns, filter_columns_type,
+            # ]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             filter_columns_type,
+            #             # filter_columns_precision,
+            #             # filter_columns_size,
+            #             # deleted_models_visibility,
+            #             search_bar,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )
+
+        with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -309,22 +503,24 @@ with demo:
             ],
             submission_result,
         )
-        with gr.Row():
-            with gr.Accordion("📖 FAQ", open=False):
-                with gr.Column(min_width=250):
-                    gr.Markdown("""
-                    #### What does N/A score mean?
-
-                    An N/A score means that it was not possible to evaluate the benchmark for a given model.
-
-
-                    - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
-                    - The model API refuses to provide any answer,
-                    - We do not have access to the training data.
-
-                    """)
+
+        with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
+
+            with gr.Row():
+                # with gr.Accordion("📖 FAQ", open=True):
+                #     with gr.Column(min_width=250):
+                gr.Markdown("""
+                #### What does N/A score mean?
+
+                An N/A score means that it was not possible to evaluate the benchmark for a given model.
+
+                This can happen for multiple reasons, such as:
+
+                - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
+                - The model API refuses to provide any answer,
+                - We do not have access to the training data. """
+                )
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 citation_button = gr.Textbox(
@@ -335,7 +531,7 @@ with demo:
             show_copy_button=True,
         )
 
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
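The new Results tab wires every CheckboxGroup's change event to the table through functools.partial, which pre-binds the full leaderboard DataFrame so Gradio only has to pass the current checkbox values. A minimal self-contained sketch of that pattern; the toy data and component names are placeholders, not the Space's real columns:

```python
import functools

import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["A", "B"], "Score": [0.9, 0.7]})

def filter_cols(source: pd.DataFrame, *selected) -> pd.DataFrame:
    # Keep "Model" plus whatever columns are still checked.
    cols = ["Model"] + [c for group in selected for c in group]
    return source[cols]

with gr.Blocks() as demo:
    picker = gr.CheckboxGroup(choices=["Score"], value=["Score"])
    table = gr.Dataframe(value=df)
    # partial() bakes in the source frame; Gradio supplies the checkbox values.
    picker.change(
        fn=functools.partial(filter_cols, df),
        inputs=[picker],
        outputs=table,
    )

# demo.launch()  # uncomment to run locally
```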
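generate_benchmarks indexes ICON_PER_PRINCIPLE, CATEGORIES_PER_PRINCIPLE, TEXT_PER_CATEGORY, and BENCHMARKS_PER_CATEGORY by string keys, so the four tables must stay in sync or the app raises KeyError at startup. A small sanity check one could run against them; this helper is a suggestion, not part of the commit:

```python
def check_mappings(benchmarks, texts, categories_per_principle, icons):
    """Assert that every principle has an icon and every category has
    both a description and a benchmark list."""
    for principle, categories in categories_per_principle.items():
        assert principle in icons, f"missing icon for {principle!r}"
        for category in categories:
            assert category in benchmarks, f"no benchmarks for {category!r}"
            assert category in texts, f"no description for {category!r}"

# check_mappings(BENCHMARKS_PER_CATEGORY, TEXT_PER_CATEGORY,
#                CATEGORIES_PER_PRINCIPLE, ICON_PER_PRINCIPLE)
```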
+
}
|
| 211 |
+
|
| 212 |
+
def generate_benchmarks(principle: str):
|
| 213 |
+
with gr.Row():
|
| 214 |
+
gr.HTML(f"""
|
| 215 |
+
<h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
|
| 216 |
+
""")
|
| 217 |
|
| 218 |
+
categories = CATEGORIES_PER_PRINCIPLE[principle]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
+
with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
|
| 221 |
+
for category in categories:
|
| 222 |
+
with gr.Column():
|
| 223 |
+
gr.HTML(
|
| 224 |
+
f"""
|
| 225 |
+
<div style="padding: 10px 20px;">
|
| 226 |
+
<h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
|
| 227 |
+
<p>{TEXT_PER_CATEGORY[category]}</p>
|
| 228 |
+
</div>
|
| 229 |
+
"""
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
shown_columns = []
|
| 233 |
+
with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
|
| 234 |
+
for category in categories:
|
| 235 |
+
with gr.Column():
|
| 236 |
+
shown_column = gr.CheckboxGroup(
|
| 237 |
+
show_label=False,
|
| 238 |
+
choices=BENCHMARKS_PER_CATEGORY[category],
|
| 239 |
+
value=BENCHMARKS_PER_CATEGORY[category],
|
| 240 |
+
interactive=True,
|
| 241 |
+
# elem_id="filter-columns-type",
|
| 242 |
+
)
|
| 243 |
+
shown_columns.append(shown_column)
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
with gr.Row():
|
| 247 |
+
df = update_principles_table(leaderboard_df, *shown_columns)
|
| 248 |
+
type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
|
| 249 |
+
datatypes = [type_per_column[name] for name in df.columns]
|
| 250 |
+
leaderboard_table = gr.components.Dataframe(
|
| 251 |
+
value=df,
|
| 252 |
+
headers=df.columns.tolist(),
|
| 253 |
+
datatype=datatypes,
|
| 254 |
+
elem_id="leaderboard-table",
|
| 255 |
+
interactive=False,
|
| 256 |
+
visible=True,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
for shown_column in shown_columns:
|
| 260 |
+
shown_column.change(
|
| 261 |
+
fn=functools.partial(update_principles_table, leaderboard_df),
|
| 262 |
+
inputs=shown_columns,
|
| 263 |
+
outputs=leaderboard_table,
|
| 264 |
+
# queue=True,
|
| 265 |
)
|
| 266 |
|
| 267 |
+
# Allows clicking on the full table column to trigger sorting
|
| 268 |
+
custom_js = """
|
| 269 |
+
function clickableTableHeaders() {
|
| 270 |
+
document.querySelectorAll(".table > thead > tr > th").forEach(th => {
|
| 271 |
+
th.addEventListener("click", () => {
|
| 272 |
+
const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
|
| 273 |
+
if (sortButton) {
|
| 274 |
+
sortButton.click(); // Triggers the click event on the "sort-button" element
|
| 275 |
+
}
|
| 276 |
+
});
|
| 277 |
+
});
|
| 278 |
+
|
| 279 |
+
// Select all elements with the .table class
|
| 280 |
+
const tableElements = document.querySelectorAll('.table');
|
| 281 |
+
|
| 282 |
+
// Callback function to execute when mutations are observed
|
| 283 |
+
const mutationCallback = (mutationsList) => {
|
| 284 |
+
mutationsList.forEach((mutation) => {
|
| 285 |
+
if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
|
| 286 |
+
mutation.target.addEventListener("click", () => {
|
| 287 |
+
const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
|
| 288 |
+
if (sortButton) {
|
| 289 |
+
sortButton.click(); // Triggers the click event on the "sort-button" element
|
| 290 |
+
}
|
| 291 |
+
});
|
| 292 |
+
}
|
| 293 |
+
});
|
| 294 |
+
};
|
| 295 |
+
|
| 296 |
+
// Options for the observer (which mutations to observe)
|
| 297 |
+
const observerOptions = {
|
| 298 |
+
childList: true, // Watch for additions/removals of child nodes
|
| 299 |
+
subtree: true // Watch for changes in descendants as well
|
| 300 |
+
};
|
| 301 |
+
|
| 302 |
+
// Create an instance of MutationObserver and pass in the callback function
|
| 303 |
+
const observer = new MutationObserver(mutationCallback);
|
| 304 |
+
|
| 305 |
+
// Observe each .table element
|
| 306 |
+
tableElements.forEach((tableElement) => {
|
| 307 |
+
observer.observe(tableElement, observerOptions);
|
| 308 |
+
});
|
| 309 |
+
}
|
| 310 |
+
"""
|
| 311 |
+
|
| 312 |
+
demo = gr.Blocks(
|
| 313 |
+
css=custom_css,
|
| 314 |
+
theme=gr.themes.Default(
|
| 315 |
+
font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
|
| 316 |
+
),
|
| 317 |
+
js=custom_js,
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
with demo:
|
| 321 |
+
gr.HTML(TITLE)
|
| 322 |
+
|
| 323 |
+
with gr.Row(elem_id="intro"):
|
| 324 |
+
with gr.Column(scale=1, min_width=20, elem_classes="empty"):
|
| 325 |
+
pass
|
| 326 |
+
with gr.Column(scale=5):
|
| 327 |
+
gr.HTML(
|
| 328 |
+
"""
|
| 329 |
+
<h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
|
| 330 |
+
<p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
|
| 331 |
+
<br/>
|
| 332 |
+
<a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
|
| 333 |
+
"""
|
| 334 |
)
|
| 335 |
+
with gr.Column(scale=5):
|
| 336 |
+
gr.HTML(
|
| 337 |
+
"""
|
| 338 |
+
<h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
|
| 339 |
+
<p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
|
| 340 |
+
<br/>
|
| 341 |
+
<a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
|
| 342 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 343 |
)
|
| 344 |
+
with gr.Column(scale=1, min_width=20, elem_classes="empty"):
|
| 345 |
+
pass
|
| 346 |
+
|
| 347 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 348 |
+
with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
+
for principle in CATEGORIES_PER_PRINCIPLE.keys():
|
| 351 |
+
generate_benchmarks(principle)
|
| 352 |
+
|
| 353 |
+
###
|
| 354 |
+
|
| 355 |
+
# with gr.Row():
|
| 356 |
+
# shown_columns = gr.CheckboxGroup(
|
| 357 |
+
# choices=[
|
| 358 |
+
# c.name
|
| 359 |
+
# for c in fields(AutoEvalColumn)
|
| 360 |
+
# if not c.hidden and not c.never_hidden and not c.dummy
|
| 361 |
+
# ],
|
| 362 |
+
# value=[
|
| 363 |
+
# c.name
|
| 364 |
+
# for c in fields(AutoEvalColumn)
|
| 365 |
+
# if c.displayed_by_default and not c.hidden and not c.never_hidden
|
| 366 |
+
# ],
|
| 367 |
+
# label="Select columns to show",
|
| 368 |
+
# elem_id="column-select",
|
| 369 |
+
# interactive=True,
|
| 370 |
+
# )
|
| 371 |
+
#
|
| 372 |
+
# with gr.Row():
|
| 373 |
+
# # with gr.Box(elem_id="box-filter"):
|
| 374 |
+
# filter_columns_type = gr.CheckboxGroup(
|
| 375 |
+
# label="Model types",
|
| 376 |
+
# choices=[t.to_str() for t in ModelType],
|
| 377 |
+
# value=[t.to_str() for t in ModelType],
|
| 378 |
+
# interactive=True,
|
| 379 |
+
# elem_id="filter-columns-type",
|
| 380 |
+
# )
|
| 381 |
+
#
|
| 382 |
+
# with gr.Row():
|
| 383 |
+
# search_bar = gr.Textbox(
|
| 384 |
+
# placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
| 385 |
+
# show_label=False,
|
| 386 |
+
# elem_id="search-bar",
|
| 387 |
+
# )
|
| 388 |
+
# # x = gr.Checkbox(show_label=False, label="foo")
|
| 389 |
+
#
|
| 390 |
+
# with gr.Row():
|
| 391 |
+
# # print(shown_columns.value)
|
| 392 |
+
# leaderboard_table = gr.components.Dataframe(
|
| 393 |
+
# value=leaderboard_df[
|
| 394 |
+
# [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
| 395 |
+
# + shown_columns.value
|
| 396 |
+
# ],
|
| 397 |
+
# headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
| 398 |
+
# datatype=TYPES,
|
| 399 |
+
# elem_id="leaderboard-table",
|
| 400 |
+
# interactive=False,
|
| 401 |
+
# visible=True,
|
| 402 |
+
# # column_widths=["2%", "30%", "10%", "10%", "12%"]
|
| 403 |
+
# )
|
| 404 |
+
#
|
| 405 |
+
# # Dummy leaderboard for handling the case when the user uses backspace key
|
| 406 |
+
# hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
| 407 |
+
# value=original_df[COLS],
|
| 408 |
+
# headers=COLS,
|
| 409 |
+
# datatype=TYPES,
|
| 410 |
+
# visible=False,
|
| 411 |
+
# )
|
| 412 |
+
# search_bar.submit(
|
| 413 |
+
# update_table,
|
| 414 |
+
# [
|
| 415 |
+
# hidden_leaderboard_table_for_search,
|
| 416 |
+
# shown_columns,
|
| 417 |
+
# filter_columns_type,
|
| 418 |
+
# # filter_columns_precision,
|
| 419 |
+
# # filter_columns_size,
|
| 420 |
+
# search_bar,
|
| 421 |
+
# ],
|
| 422 |
+
# leaderboard_table,
|
| 423 |
+
# )
|
| 424 |
+
# for selector in [shown_columns, filter_columns_type,
|
| 425 |
+
# ]:
|
| 426 |
+
# selector.change(
|
| 427 |
+
# update_table,
|
| 428 |
+
# [
|
| 429 |
+
# hidden_leaderboard_table_for_search,
|
| 430 |
+
# shown_columns,
|
| 431 |
+
# filter_columns_type,
|
| 432 |
+
# # filter_columns_precision,
|
| 433 |
+
# # filter_columns_size,
|
| 434 |
+
# # deleted_models_visibility,
|
| 435 |
+
# search_bar,
|
| 436 |
+
# ],
|
| 437 |
+
# leaderboard_table,
|
| 438 |
+
# queue=True,
|
| 439 |
+
# )
|
| 440 |
+
|
| 441 |
+
with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
|
| 442 |
with gr.Column():
|
| 443 |
with gr.Row():
|
| 444 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
|
|
|
| 503 |
],
|
| 504 |
submission_result,
|
| 505 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
+
with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
+
with gr.Row():
|
| 510 |
+
# with gr.Accordion("📖 FAQ", open=True):
|
| 511 |
+
# with gr.Column(min_width=250):
|
| 512 |
+
gr.Markdown("""
|
| 513 |
+
#### What does N/A score mean?
|
| 514 |
+
|
| 515 |
+
An N/A score means that it was not possible to evaluate the benchmark for a given model.
|
| 516 |
+
|
| 517 |
+
This can happen for multiple reasons, such as:
|
| 518 |
+
|
| 519 |
+
- The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
|
| 520 |
+
- The model API refuses to provide any answer,
|
| 521 |
+
- We do not have access to the training data. """
|
| 522 |
+
)
|
| 523 |
|
|
|
|
| 524 |
with gr.Row():
|
| 525 |
with gr.Accordion("📙 Citation", open=False):
|
| 526 |
citation_button = gr.Textbox(
|
|
|
|
| 531 |
show_copy_button=True,
|
| 532 |
)
|
| 533 |
|
| 534 |
+
# scheduler = BackgroundScheduler()
|
| 535 |
+
# scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 536 |
+
# scheduler.start()
|
| 537 |
demo.queue(default_concurrency_limit=40).launch()
|
requirements.txt CHANGED

@@ -2,8 +2,8 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==
-gradio_client==
+gradio==5.4.0
+gradio_client==1.4.2
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
results/01-ai/Yi-34B-Chat.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "01-ai/Yi-34B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Yi-34B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9307063195147172},
    "bold": {"aggregate_score": 0.6834721511354611},
    "toxicity": {"aggregate_score": 0.921922202494338},
    "toxicity_advbench": {"aggregate_score": 0.9927596254738791},
    "forecasting_consistency": {"aggregate_score": 0.6688888888888889},
    "self_check_consistency": {"aggregate_score": 0.8031496062992126},
    "boolq_contrast_robustness": {"aggregate_score": 0.5666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.84},
    "calibration_big_bench": {"aggregate_score": 0.89125},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8039534592768672},
    "decoding_trust": {"aggregate_score": 1.0},
    "hellaswag": {"aggregate_score": 0.8403704441346346},
    "human_eval": {"aggregate_score": 0.32298136645962733},
    "instruction_goal_hijacking": {"aggregate_score": 0.538659793814433},
    "multiturn_goal_hijacking": {"aggregate_score": 0.5828559926386013},
    "reddit_bias": {"aggregate_score": 0.6196885060946251},
    "truthful_qa_mc2": {"aggregate_score": 0.5538416743593192},
    "mmlu": {"aggregate_score": 0.7496795328300812},
    "ai2_reasoning": {"aggregate_score": 0.6544368600682594},
    "human_deception": {"aggregate_score": 0.3561643835616438},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.23216608444613182},
    "mmlu_robustness": {"aggregate_score": 0.7225454545454546},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
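Each added result file has the same shape: a config block identifying the model and a flat results map from benchmark name to aggregate_score, with null marking benchmarks that could not be evaluated (see the FAQ tab added above). A sketch of reading one of these files and printing null as N/A; load_scores and fmt are illustrative helpers, not code from this repo:

```python
import json

def load_scores(path: str) -> dict:
    """Map benchmark name -> aggregate_score (None where evaluation failed)."""
    with open(path) as fh:
        report = json.load(fh)
    return {name: entry["aggregate_score"]
            for name, entry in report["results"].items()}

def fmt(score) -> str:
    return "N/A" if score is None else f"{score:.3f}"

scores = load_scores("results/01-ai/Yi-34B-Chat.json")
print(fmt(scores["bbq"]))           # -> 0.931
print(fmt(scores["watermarking"]))  # -> N/A
```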
results/Claude3Opus/result_Claude3Opus.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "Claude3Opus",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/claude-3-opus-20240229"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9660153175746249},
    "bold": {"aggregate_score": 0.757401370038459},
    "toxicity": {"aggregate_score": null},
    "toxicity_advbench": {"aggregate_score": 0.9932693207159621},
    "forecasting_consistency": {"aggregate_score": 0.7785714285714285},
    "self_check_consistency": {"aggregate_score": 0.8484107579462102},
    "boolq_contrast_robustness": {"aggregate_score": null},
    "imdb_contrast_robustness": {"aggregate_score": null},
    "calibration_big_bench": {"aggregate_score": null},
    "calibration_big_bench_i_know": {"aggregate_score": null},
    "decoding_trust": {"aggregate_score": 0.825},
    "hellaswag": {"aggregate_score": 0.954},
    "human_eval": {"aggregate_score": 0.848},
    "instruction_goal_hijacking": {"aggregate_score": 0.8402061855670103},
    "multiturn_goal_hijacking": {"aggregate_score": 0.7557223372440763},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.868},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 1.0},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.18613375956573336},
    "mmlu_robustness": {"aggregate_score": null},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/Qwen/Qwen1.5-72B-Chat.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "Qwen/Qwen1.5-72B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Qwen1.5-72B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9599664585022551},
    "bold": {"aggregate_score": 0.7200614975210675},
    "toxicity": {"aggregate_score": 0.9695524579515152},
    "toxicity_advbench": {"aggregate_score": 0.9976900071559403},
    "forecasting_consistency": {"aggregate_score": 0.6695121951219511},
    "self_check_consistency": {"aggregate_score": 0.8449367088607596},
    "boolq_contrast_robustness": {"aggregate_score": 0.8},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.78625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.42761166020267727},
    "decoding_trust": {"aggregate_score": 0.6800000000000002},
    "hellaswag": {"aggregate_score": 0.8662617008564031},
    "human_eval": {"aggregate_score": 0.5726708074534161},
    "instruction_goal_hijacking": {"aggregate_score": 0.45360824742268047},
    "multiturn_goal_hijacking": {"aggregate_score": 0.48560501495284103},
    "reddit_bias": {"aggregate_score": 0.8261042236875162},
    "truthful_qa_mc2": {"aggregate_score": 0.6389413688457826},
    "mmlu": {"aggregate_score": 0.7768836348098561},
    "ai2_reasoning": {"aggregate_score": 0.6825938566552902},
    "human_deception": {"aggregate_score": 0.726027397260274},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.06008018267012608},
    "mmlu_robustness": {"aggregate_score": 0.7525454545454544},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/google/gemma-2-9b.json ADDED

@@ -0,0 +1,81 @@
{
  "config": {
    "model_name": "google/gemma-2-9b",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Google-gemma-2-9b"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9839603777871013},
    "bold": {"aggregate_score": 0.7370530984096585},
    "toxicity": {"aggregate_score": 0.8414627564046196},
    "toxicity_advbench": {"aggregate_score": 0.9254440618284906},
    "forecasting_consistency": {"aggregate_score": 0.46973684210526323},
    "boolq_contrast_robustness": {"aggregate_score": 0.5833333333333334},
    "imdb_contrast_robustness": {"aggregate_score": 0.51},
    "calibration_big_bench": {"aggregate_score": 0.80375},
    "calibration_big_bench_i_know": {"aggregate_score": 0.013649443262854999},
    "decoding_trust": {"aggregate_score": 0.94},
    "hellaswag": {"aggregate_score": 0.35590519816769567},
    "human_eval": {"aggregate_score": 0.0},
    "instruction_goal_hijacking": {"aggregate_score": 0.4858247422680413},
    "multiturn_goal_hijacking": {"aggregate_score": 0.499626178974005},
    "reddit_bias": {"aggregate_score": 0.9823398039376345},
    "truthful_qa_mc2": {"aggregate_score": 0.4643375169376196},
    "mmlu": {"aggregate_score": 0.26356644352656317},
    "ai2_reasoning": {"aggregate_score": 0.2295221843003413},
    "human_deception": {"aggregate_score": 0.9863013698630136},
    "memorization": {"aggregate_score": 0.996},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.0},
    "mmlu_robustness": {"aggregate_score": 0.26527272727272727},
    "training_data_suitability": {"aggregate_score": null}
  }
}
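Note that google/gemma-2-9b.json adds 81 lines rather than 87: the self_check_consistency and watermarking entries are absent altogether instead of null. Consumers should therefore treat a missing key the same as a null score; a defensive accessor sketch (an assumption about how src/leaderboard/read_evals.py might handle it, not code from this diff):

```python
def aggregate_score(report: dict, benchmark: str):
    """Return a benchmark's score, treating an absent entry like a null one."""
    entry = report.get("results", {}).get(benchmark)
    return None if entry is None else entry.get("aggregate_score")
```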
results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "gpt-3.5-turbo-0125",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-3.5-turbo-0125"
  },
  "results": {
    "bbq": {"aggregate_score": 0.8798721047562755},
    "bold": {"aggregate_score": 0.7320262917581274},
    "toxicity": {"aggregate_score": 0.9388454625027057},
    "toxicity_advbench": {"aggregate_score": 0.9905250918854458},
    "forecasting_consistency": {"aggregate_score": 0.6739999999999999},
    "self_check_consistency": {"aggregate_score": 0.8208955223880597},
    "boolq_contrast_robustness": {"aggregate_score": 0.65},
    "imdb_contrast_robustness": {"aggregate_score": 0.545},
    "calibration_big_bench": {"aggregate_score": 0.95625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9075691229826097},
    "decoding_trust": {"aggregate_score": 0.835},
    "hellaswag": {"aggregate_score": 0.855},
    "human_eval": {"aggregate_score": 0.762},
    "instruction_goal_hijacking": {"aggregate_score": null},
    "multiturn_goal_hijacking": {"aggregate_score": 0.6549516908212559},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.6811707733941034},
    "ai2_reasoning": {"aggregate_score": 0.930976430976431},
    "human_deception": {"aggregate_score": 0.589041095890411},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.07798395569162107},
    "mmlu_robustness": {"aggregate_score": 0.6952727272727273},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/gpt-4-1106-preview/result_gpt-4-1106-preview.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "gpt-4-1106-preview",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-4-1106-preview"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9825415205108432},
    "bold": {"aggregate_score": 0.7385998936629464},
    "toxicity": {"aggregate_score": 0.969292480968859},
    "toxicity_advbench": {"aggregate_score": 0.996174184883085},
    "forecasting_consistency": {"aggregate_score": 0.7755813953488372},
    "self_check_consistency": {"aggregate_score": 0.8656716417910448},
    "boolq_contrast_robustness": {"aggregate_score": 0.8666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.985},
    "calibration_big_bench": {"aggregate_score": 1.0},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9538095833854623},
    "decoding_trust": {"aggregate_score": 0.8799999999999999},
    "hellaswag": {"aggregate_score": 0.953},
    "human_eval": {"aggregate_score": 0.8370000000000001},
    "instruction_goal_hijacking": {"aggregate_score": 0.6572164948453608},
    "multiturn_goal_hijacking": {"aggregate_score": 0.882740970784449},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.805},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 0.9726027397260274},
    "memorization": {"aggregate_score": 0.998},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.12940239570245562},
    "mmlu_robustness": {"aggregate_score": 0.8049090909090908},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-13b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-13b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-13b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9296858841579962},
    "bold": {"aggregate_score": 0.7190075716540155},
    "toxicity": {"aggregate_score": 0.963705740028952},
    "toxicity_advbench": {"aggregate_score": 0.9990298427566393},
    "forecasting_consistency": {"aggregate_score": 0.5722222222222222},
    "self_check_consistency": {"aggregate_score": 0.7451923076923077},
    "boolq_contrast_robustness": {"aggregate_score": 0.25},
    "imdb_contrast_robustness": {"aggregate_score": 0.4},
    "calibration_big_bench": {"aggregate_score": 0.84625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7748780182659827},
    "decoding_trust": {"aggregate_score": 0.91},
    "hellaswag": {"aggregate_score": 0.8240390360485959},
    "human_eval": {"aggregate_score": 0.20621118012422357},
    "instruction_goal_hijacking": {"aggregate_score": 0.41752577319587625},
    "multiturn_goal_hijacking": {"aggregate_score": 0.3651598803772717},
    "reddit_bias": {"aggregate_score": 0.7413109757904534},
    "truthful_qa_mc2": {"aggregate_score": 0.43962155328662317},
    "mmlu": {"aggregate_score": 0.5351089588377724},
    "ai2_reasoning": {"aggregate_score": 0.5938566552901023},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.14127747715247715},
    "mmlu_robustness": {"aggregate_score": 0.5076363636363636},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-70b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-70b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-70b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9498141943726044},
    "bold": {"aggregate_score": 0.7252454731129678},
    "toxicity": {"aggregate_score": 0.9409038541427155},
    "toxicity_advbench": {"aggregate_score": 0.9988042732774585},
    "forecasting_consistency": {"aggregate_score": 0.7271428571428571},
    "self_check_consistency": {"aggregate_score": 0.7466666666666666},
    "boolq_contrast_robustness": {"aggregate_score": 0.7166666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.42},
    "calibration_big_bench": {"aggregate_score": 0.8825},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8323999999999999},
    "decoding_trust": {"aggregate_score": 0.85},
    "hellaswag": {"aggregate_score": 0.8578968333001394},
    "human_eval": {"aggregate_score": 0.31242236024844716},
    "instruction_goal_hijacking": {"aggregate_score": 0.4278350515463918},
    "multiturn_goal_hijacking": {"aggregate_score": 0.38218886588451806},
    "reddit_bias": {"aggregate_score": 0.352451665752182},
    "truthful_qa_mc2": {"aggregate_score": 0.5276263233531145},
    "mmlu": {"aggregate_score": 0.6320324740065518},
    "ai2_reasoning": {"aggregate_score": 0.6476109215017065},
    "human_deception": {"aggregate_score": 0.8904109589041096},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.41367733201077794},
    "mmlu_robustness": {"aggregate_score": 0.6018181818181818},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-7b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-7b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-7b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9548491021218211},
    "bold": {"aggregate_score": 0.6798469370104644},
    "toxicity": {"aggregate_score": 0.9646512368479587},
    "toxicity_advbench": {"aggregate_score": 0.9990993229758505},
    "forecasting_consistency": {"aggregate_score": 0.6043478260869565},
    "self_check_consistency": {"aggregate_score": 0.6666666666666667},
    "boolq_contrast_robustness": {"aggregate_score": 0.2833333333333333},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.865},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7370949580883273},
    "decoding_trust": {"aggregate_score": 0.9299999999999999},
    "hellaswag": {"aggregate_score": 0.7868950408285202},
    "human_eval": {"aggregate_score": 0.14596273291925466},
    "instruction_goal_hijacking": {"aggregate_score": 0.5141752577319587},
    "multiturn_goal_hijacking": {"aggregate_score": 0.26991028295376124},
    "reddit_bias": {"aggregate_score": 0.32916275270076745},
    "truthful_qa_mc2": {"aggregate_score": 0.4532195083848858},
    "mmlu": {"aggregate_score": 0.4721549636803874},
    "ai2_reasoning": {"aggregate_score": 0.5452218430034129},
    "human_deception": {"aggregate_score": 0.9315068493150684},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.08752953475948229},
    "mmlu_robustness": {"aggregate_score": 0.44909090909090915},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/mistralai/Mistral-7B-Instruct-v0.2.json
ADDED
@@ -0,0 +1,87 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.2"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9040460470649646
+    },
+    "bold": {
+      "aggregate_score": 0.747184860969805
+    },
+    "toxicity": {
+      "aggregate_score": 0.9607862783311598
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9940165055485872
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.5797872340425532
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6974789915966386
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.2833333333333333
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.12
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.93375
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.6862379582134368
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8477394941246763
+    },
+    "human_eval": {
+      "aggregate_score": 0.4018633540372671
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.3118556701030928
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.23205659075224297
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.659153609956636
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.6681153383230657
+    },
+    "mmlu": {
+      "aggregate_score": 0.5910838911835921
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6407849829351536
+    },
+    "human_deception": {
+      "aggregate_score": 0.9863013698630136
+    },
+    "memorization": {
+      "aggregate_score": 0.988
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.019112659939765738
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.5776363636363636
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    },
+    "watermarking": {
+      "aggregate_score": null
+    }
+  }
+}
results/mistralai/Mixtral-8x7B-Instruct-v0.1.json
ADDED
@@ -0,0 +1,87 @@
+{
+  "config": {
+    "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mixtral-8x7B-Instruct-v0.1"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.949723812859033
+    },
+    "bold": {
+      "aggregate_score": 0.734901669263283
+    },
+    "toxicity": {
+      "aggregate_score": 0.9581340294360416
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9932040768299395
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6377777777777777
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.7877094972067039
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.35
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.47
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.9037499999999999
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.8543725760040035
+    },
+    "decoding_trust": {
+      "aggregate_score": 0.9299999999999999
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8755228042222665
+    },
+    "human_eval": {
+      "aggregate_score": 0.475776397515528
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.375
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.2561249137336094
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5429049297532
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.6458557121081614
+    },
+    "mmlu": {
+      "aggregate_score": 0.7031761857285287
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.7090443686006825
+    },
+    "human_deception": {
+      "aggregate_score": 0.8904109589041096
+    },
+    "memorization": {
+      "aggregate_score": 0.983
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.05770831155558887
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.6941818181818181
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    },
+    "watermarking": {
+      "aggregate_score": null
+    }
+  }
+}
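
Each of the ADDED results files above follows the same schema: a "config" block identifying the model (name, revision, and a link to the full COMPL-AI report) and a "results" block mapping each benchmark key to a single "aggregate_score" in [0, 1], with null marking benchmarks that were not run (here training_data_suitability and watermarking). A minimal sketch of how such a file can be consumed, assuming it is read from the results/ directory as committed above; load_aggregate_scores is a hypothetical helper, not part of this repo:

import json

def load_aggregate_scores(path: str) -> dict:
    # Returns {benchmark: score}, dropping benchmarks whose score is null.
    with open(path) as fp:
        data = json.load(fp)
    return {
        name: entry["aggregate_score"]
        for name, entry in data["results"].items()
        if entry["aggregate_score"] is not None
    }

scores = load_aggregate_scores("results/mistralai/Mixtral-8x7B-Instruct-v0.1.json")
print(len(scores), "scored benchmarks; mean", round(sum(scores.values()) / len(scores), 3))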
src/display/about.py
CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
     task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
     task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
     task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
-    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks…
+    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
     task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
     task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
     task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
@@ -24,8 +24,8 @@ class Tasks(Enum):
     task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
     task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
     task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
-    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage…
-    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following…
+    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
+    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
     task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
     task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
     task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
@@ -35,8 +35,10 @@ class Tasks(Enum):
     task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
     task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
     task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
-    task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
-    …
+    # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
+    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
+    task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
+    task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")


@@ -44,9 +46,6 @@
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>"""

-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """<p style="font-size: 16px;">COMPL-AI is an open-source compliance-centered evaluation framework for Generative AI models. It includes the ability to evaluate the regulatory technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open-source and open to community contributions. For more information, please visit <a href="https://compl-ai.org" target="_blank">compl-ai.org</a>.</p>"""
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 """
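
The Tasks enum above is what ties the benchmark keys in the results JSONs to the leaderboard columns. A short sketch of the mechanism, assuming Task is the simple (benchmark, metric, col_name) record implied by the calls above; its actual definition lives elsewhere in the repo:

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str   # key in each results JSON, e.g. "watermarking"
    metric: str      # field to read from that entry, always "aggregate_score" here
    col_name: str    # human-readable column label

class Tasks(Enum):
    task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")

# Columns are derived from the enum members, which is why adding task25/task26
# above is enough to surface the new dataset-level checks in the table.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]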
src/display/css_html_js.py
CHANGED
@@ -1,4 +1,11 @@
 custom_css = """
+
+:root {
+  --block-radius: 0px !important;
+  --table-radius: 0px !important;
+  --input-radius: 0px !important;
+}
+
 /* Hides the final AutoEvalColumn */
 #llm-benchmark-tab-table table td:last-child,
 #llm-benchmark-tab-table table th:last-child {
@@ -21,6 +28,8 @@ table {
 /* Full width space */
 .gradio-container {
   max-width: 95%!important;
+  font-family: Open Sans,sans-serif;
+  line-height: 1.75em !important;
 }

 /* Text style and margins */
@@ -51,6 +60,14 @@ table {
 .tab-buttons button {
   font-size: 20px;
 }
+.tab-buttons {
+  padding-top: 40px;
+}
+
+/* Center Tabs */
+.tab-buttons > div > div:nth-child(2) {
+  justify-content: center;
+}

 /* Filters style */
 #filter_type{
@@ -86,6 +103,153 @@ table {
 border: 0
 }

+#intro {
+  padding: 40px 0;
+  border: 1px solid var(--border-color-primary);
+}
+
+#intro > div {
+  padding-left: 2em;
+  padding-right: 2em;
+  min-width: 0px !important;
+}
+
+.image_header {
+  display: flex;
+  gap: 15px;
+  align-items: center;
+}
+
+
+p {
+  font-weight: 400;
+  font-style: normal;
+  font-size: 14px;
+  line-height: 1.75em !important;
+}
+
+.button {
+  border: 1px solid #174DA3;
+  font-family: IBM Plex Mono,monospace;
+  background: none;
+  padding: 5px 15px;
+  color: #174DA3 !important;
+  position: relative;
+  font-size: 14px;
+  font-weight: 500;
+  transition: background-color .15s ease;
+  display: inline-flex;
+  align-items: center;
+  text-decoration: none !important;
+  line-height: 1.75em !important;
+}
+
+.button:hover {
+  cursor: pointer;
+  background: #EBEEF4;
+}
+
+#llm-benchmark-tab-table-button {
+  border-top-right-radius: unset !important;
+  border-top-left-radius: unset !important;
+  font-size: 18px !important;
+  font-weight: 500 !important;
+}
+
+label {
+  background: unset !important;
+  border-radius: 0 !important;
+  box-shadow: unset !important;
+}
+
+label > input {
+  border-radius: 0 !important;
+}
+
+form {
+  border-radius: 0 !important;
+}
+
+.principle_header {
+  padding: 10px 20px;
+  background-color: #EBEEF4;
+  border: 1px solid var(--border-color-primary);
+}
+
+.technical_requirements {
+  margin-top: -17px;
+  gap: 0px;
+  align-items: stretch;
+}
+
+.technical_requirements > div {
+  gap: 0px;
+
+}
+
+.technical_requirements > div > div.form {
+  border: unset !important;
+}
+
+.border_mid > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+}
+
+.border_bot > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+  border-bottom: 1px solid var(--border-color-primary);
+}
+
+@media only screen and (max-width: 1200px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }

+}
+
+@media only screen and (max-width: 800px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }
+
+  #intro {
+    flex-direction: column;
+    gap: 48px;
+  }
+}
+
+.principle_icon {
+  max-height:24px;
+}
+
+.github_icon {
+  max-height:24px;
+  padding-right: 1em;
+}
+
+@media (prefers-color-scheme: dark) {
+  .principle_header {
+    background-color: var(--block-background-fill);
+  }
+
+  .button {
+    border: 1px solid var(--color-accent);
+    color: var(--color-accent) !important;
+  }
+
+  .principle_icon {
+    filter: brightness(2);
+  }
+
+  .github_icon {
+    filter: brightness(2);
+  }
+}
+
 """

 get_window_url_params = """
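
All of these rules ship as a single Python string, so nothing takes effect until the string is handed to Gradio. A minimal sketch of the wiring, following the usual pattern for custom-styled Spaces (the gr.HTML call and its markup are illustrative only): element ids such as #intro and classes such as .image_header are attached via elem_id / elem_classes on individual components, and the @media (prefers-color-scheme: dark) block above swaps the hard-coded light colors (#174DA3, #EBEEF4) for theme variables.

import gradio as gr
from src.display.css_html_js import custom_css

# The stylesheet is applied once, at Blocks construction time.
demo = gr.Blocks(css=custom_css)
with demo:
    # This component picks up the #intro border/padding rules defined above.
    gr.HTML("<div class='image_header'>...</div>", elem_id="intro")
demo.launch()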
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
 ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
-["model_report", ColumnContent, ColumnContent("…
+["model_report", ColumnContent, ColumnContent("Report", "markdown", True, never_hidden=True)]
 ]
 # Init
 # Scores
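
The list being edited here feeds the table schema: each entry pairs a field name with a ColumnContent descriptor. Judging from the positional and keyword arguments used above, ColumnContent is presumably a small dataclass along these lines; this is a sketch inferred from the call sites, not the actual definition in utils.py:

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # header text shown in the table, e.g. "Report"
    type: str                   # cell renderer: "str", "markdown", "number", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False  # keeps T / Model / Report pinned in the column picker

# Mirrors the new entry added in the diff above.
report_col = ColumnContent("Report", "markdown", True, never_hidden=True)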
src/envs.py
CHANGED
@@ -6,14 +6,12 @@ from huggingface_hub import HfApi
 TOKEN = os.environ.get("TOKEN", None)

 OWNER = "latticeflow"
-REPO_ID = f"{OWNER}/compl-ai-…
+# REPO_ID = f"{OWNER}/compl-ai-leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"

 CACHE_PATH = os.getenv("HF_HOME", ".")

 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "requests")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")

 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED
@@ -12,7 +12,7 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, Weigh
 from src.submission.check_validity import is_model_on_hub

 def report_hyperlink(link):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗…
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗 Report</a>' if link else "N/A"

 @dataclass
 class EvalResult:
@@ -40,7 +40,7 @@ class EvalResult:
             data = json.load(fp)

         config = data.get("config")
-        print(json_filepath)
+        # print(json_filepath)
         # Precision
         # precision = Precision.from_str(config.get("model_dtype"))

@@ -76,12 +76,12 @@ class EvalResult:

             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
-                print('skip', full_model)
+                # print('skip', full_model)
                 results[task.benchmark] = None
                 continue

-            print(task)
-            print(accs)
+            # print(task)
+            # print(accs)
             mean_acc = np.mean(accs)  # * 100.0
             results[task.benchmark] = round(mean_acc, 2)

@@ -108,8 +108,8 @@ class EvalResult:
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            print(f"Read Request from {request_file}")
-            print(request)
+            # print(f"Read Request from {request_file}")
+            # print(request)
             # self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
             # self.model_type = ModelType.from_str("open" if self.still_on_hub else "closed")
             self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
@@ -119,7 +119,7 @@ class EvalResult:
             self.num_params = request.get("params", None)
             self.date = request.get("submitted_time", "")
         except Exception as e:
-            print(e)
+            # print(e)
             self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
             print(f"Could not find request file ({requests_path}) for {self.org}/{self.model}")

@@ -158,9 +158,9 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
         requests_path,
         f"**/request_{model_name}*_eval_request*.json"
     )
-    print(f"Looking up request file(s) with pattern {request_files}")
+    # print(f"Looking up request file(s) with pattern {request_files}")
     request_files = glob.glob(request_files, recursive=True)
-    print(f"Found request file(s) {request_files}")
+    # print(f"Found request file(s) {request_files}")

     # Select correct request file (precision)
     request_file = ""
@@ -174,7 +174,7 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
                 # and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
-                print(f"Selected {request_file} for model metadata")
+                # print(f"Selected {request_file} for model metadata")
     return request_file


@@ -200,10 +200,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        print()
-        print('eval result')
-        print(eval_result)
-        print()
+        # print()
+        # print('eval result')
+        # print(eval_result)
+        # print()
         eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
@@ -217,9 +217,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu

     for v in eval_results.values():
         try:
-            print()
-            print(v)
-            print()
+            # print()
+            # print(v)
+            # print()
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
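
Beyond silencing the debug prints, the hunk at line 76 is the part worth reading: a benchmark column is filled only when every matching score is present, so the null aggregate scores in the results files above (training_data_suitability, watermarking) propagate as gaps rather than zeros. The same behavior restated as a standalone sketch with inline sample data:

import numpy as np

results = {"mmlu": {"aggregate_score": 0.70}, "watermarking": {"aggregate_score": None}}

def score_for(benchmark: str, metric: str = "aggregate_score"):
    accs = np.array([v.get(metric) for k, v in results.items() if k == benchmark])
    if accs.size == 0 or any(acc is None for acc in accs):
        return None  # missing or null score -> no value, not 0.0
    return round(float(np.mean(accs)), 2)

assert score_for("mmlu") == 0.70
assert score_for("watermarking") is None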
src/populate.py
CHANGED
@@ -11,7 +11,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-    print(all_data_json)
+    # print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -40,7 +40,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
     for sub_entry in sub_entries:
         file_path = os.path.join(save_path, entry, sub_entry)
-        print(file_path)
+        # print(file_path)
         with open(file_path) as fp:
             data = json.load(fp)
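
For context on what the now-silenced print was dumping: get_leaderboard_df flattens every EvalResult into a record, builds a DataFrame, and rounds scores to two decimals before display. A small illustration with made-up records (column name borrowed from the Tasks labels above):

import pandas as pd

records = [
    {"Model": "Mixtral-8x7B-Instruct-v0.1", "General Knowledge: MMLU": 0.7031761857285287},
    {"Model": "Mistral-7B-Instruct-v0.2", "General Knowledge: MMLU": 0.5910838911835921},
]
df = pd.DataFrame.from_records(records).round(decimals=2)
print(df)  # MMLU shows as 0.70 and 0.59 after rounding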