Spaces: Running

Commit b615923
Parent(s): d799cb2

add principles and technical requirements mapping

Files changed:
- README.md +1 -1
- app.py +336 -140
- requirements.txt +2 -2
- results/01-ai/Yi-34B-Chat.json +87 -0
- results/Claude3Opus/result_Claude3Opus.json +87 -0
- results/Qwen/Qwen1.5-72B-Chat.json +87 -0
- results/google/gemma-2-9b.json +81 -0
- results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json +87 -0
- results/gpt-4-1106-preview/result_gpt-4-1106-preview.json +87 -0
- results/meta-llama/Llama-2-13b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-70b-chat-hf.json +87 -0
- results/meta-llama/Llama-2-7b-chat-hf.json +87 -0
- results/mistralai/Mistral-7B-Instruct-v0.2.json +87 -0
- results/mistralai/Mixtral-8x7B-Instruct-v0.1.json +87 -0
- src/display/about.py +7 -8
- src/display/css_html_js.py +164 -0
- src/display/utils.py +1 -1
- src/envs.py +1 -3
- src/leaderboard/read_evals.py +18 -18
- src/populate.py +2 -2
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🥇
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 5.4.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py CHANGED

@@ -1,14 +1,14 @@
+import functools
+from pathlib import Path
+
 import gradio as gr
 import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -25,23 +25,11 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import
+from src.envs import EVAL_REQUESTS_PATH, QUEUE_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-import time
-import requests
-
 
-restart = False
-while not restart:
-    try:
-        API.restart_space(repo_id=REPO_ID, token=TOKEN)
-    except requests.exceptions.ConnectionError as e:
-        print("Restart failed. Re-trying...")
-        time.sleep(30)
-        continue
-    restart = True
+EVAL_RESULTS_PATH = str(Path(__file__).resolve().parent / "results")
 
 
 try:
@@ -50,14 +38,8 @@ try:
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
     )
 except Exception:
-    restart_space()
-
-print(EVAL_RESULTS_PATH)
-snapshot_download(
-    repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
-)
-except Exception:
-    restart_space()
+    # restart_space()
+    pass
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
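With the hunk above, the Space stops downloading RESULTS_REPO at startup and instead reads the JSON files committed under results/ (added later in this diff). A minimal sketch of how those bundled files can be enumerated; the iter_result_files helper is illustrative, not code from this commit:

```python
import json
from pathlib import Path

RESULTS_DIR = Path(__file__).resolve().parent / "results"

def iter_result_files(results_dir: Path = RESULTS_DIR):
    """Yield (model_name, parsed report) for every bundled result file."""
    for path in sorted(results_dir.glob("**/*.json")):
        with path.open() as fh:
            report = json.load(fh)
        yield report["config"]["model_name"], report

if __name__ == "__main__":
    # List every model that ships with this commit.
    for name, _ in iter_result_files():
        print(name)
```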
@@ -83,6 +65,23 @@ def update_table(
     return df
 
 
+def update_principles_table(
+    df,
+    *args: list,
+) -> pd.DataFrame:
+    columns = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+    for shown_column in args:
+        if isinstance(shown_column, gr.components.CheckboxGroup):
+            columns.extend(shown_column.value)
+        else:
+            columns.extend(shown_column)
+
+    # dummy column for querying (not shown)
+    columns.append("model_name_for_query")
+    return df[columns]
+
+
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
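The update_principles_table helper added above accepts either live CheckboxGroup components (when the table is first built) or plain value lists (when Gradio invokes it as an event handler). A self-contained sketch of the same column-selection logic; NEVER_HIDDEN and select_columns are stand-ins, since AutoEvalColumn lives in src/display/utils.py and is not shown in this diff:

```python
import pandas as pd

# Stand-in for the AutoEvalColumn fields flagged never_hidden.
NEVER_HIDDEN = ["Model"]

def select_columns(df: pd.DataFrame, *shown) -> pd.DataFrame:
    columns = list(NEVER_HIDDEN)
    for group in shown:              # each arg: one CheckboxGroup's value list
        columns.extend(group)
    columns.append("model_name_for_query")  # hidden column kept for search
    return df[columns]

df = pd.DataFrame({
    "Model": ["A", "B"],
    "MMLU: Robustness": [0.72, 0.51],
    "Rule Following": [0.64, 0.58],
    "model_name_for_query": ["org/A", "org/B"],
})
print(select_columns(df, ["MMLU: Robustness"], ["Rule Following"]))
```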
@@ -139,112 +138,307 @@ def filter_models(
     return filtered_df
 
 
-
-
-    with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    search_bar = gr.Textbox(
-                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                        show_label=False,
-                        elem_id="search-bar",
-                    )
-                with gr.Row():
-                    shown_columns = gr.CheckboxGroup(
-                        choices=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden and not c.dummy
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-                with gr.Row():
-                    with gr.Column(min_width=250):
-                        # with gr.Box(elem_id="box-filter"):
-                        filter_columns_type = gr.CheckboxGroup(
-                            label="Model types",
-                            choices=[t.to_str() for t in ModelType],
-                            value=[t.to_str() for t in ModelType],
-                            interactive=True,
-                            elem_id="filter-columns-type",
-                        )
-                        # filter_columns_precision = gr.CheckboxGroup(
-                        #     label="Precision",
-                        #     choices=[i.value.name for i in Precision],
-                        #     value=[i.value.name for i in Precision],
-                        #     interactive=True,
-                        #     elem_id="filter-columns-precision",
-                        # )
-                        # filter_columns_size = gr.CheckboxGroup(
-                        #     label="Model sizes (in billions of parameters)",
-                        #     choices=list(NUMERIC_INTERVALS.keys()),
-                        #     value=list(NUMERIC_INTERVALS.keys()),
-                        #     interactive=True,
-                        #     elem_id="filter-columns-size",
-                        # )
-
-        with gr.Row():
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-        # Dummy leaderboard for handling the case when the user uses backspace key
-        hidden_leaderboard_table_for_search = gr.components.Dataframe(
-            value=original_df[COLS],
-            headers=COLS,
-            datatype=TYPES,
-            visible=False,
-        )
-        search_bar.submit(
-            update_table,
-            [
-                hidden_leaderboard_table_for_search,
-                shown_columns,
-                filter_columns_type,
-                search_bar,
-            ],
-            leaderboard_table,
-        )
-        for selector in [shown_columns, filter_columns_type]:
-            selector.change(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    filter_columns_type,
-                    # filter_columns_precision,
-                    # filter_columns_size,
-                    # deleted_models_visibility,
-                    search_bar,
-                ],
-                leaderboard_table,
-                queue=True,
-            )
+BENCHMARKS_PER_CATEGORY = {
+    "Robustness and Predictability": [
+        "MMLU: Robustness",
+        "BoolQ Contrast Set",
+        "IMDB Contrast Set",
+        "Monotonicity Checks",
+        "Self-Check Consistency",
+    ],
+    "Cyberattack Resilience": [
+        "Goal Hijacking and Prompt Leakage",
+        "Rule Following"
+    ],
+    "Training Data Suitability": [
+        "Toxicity of the Dataset",
+        "Bias of the Dataset"
+    ],
+    "No Copyright Infringement": [
+        "Copyrighted Material Memorization"
+    ],
+    "User Privacy Protection": [
+        "PII Extraction by Association"
+    ],
+    "Capabilities, Performance, and Limitations": [
+        "General Knowledge: MMLU",
+        "Reasoning: AI2 Reasoning Challenge",
+        "Common Sense Reasoning: HellaSwag",
+        "Truthfulness: TruthfulQA MC2",
+        "Coding: HumanEval"
+    ],
+    "Interpretability": ["Logit Calibration: BIG-Bench", "Self-Assessment: TriviaQA"],
+    "Disclosure of AI": ["Denying Human Presence"],
+    "Traceability": ["Watermark Reliability & Robustness"],
+    "Representation — Absence of Bias": ["Representation Bias: RedditBias", "Prejudiced Answers: BBQ", "Biased Completions: BOLD"],
+    "Fairness — Absence of Discrimination": ["Income Fairness: DecodingTrust", "Recommendation Consistency: FaiRLLM"],
+    "Harmful Content and Toxicity": ["Toxic Completions of Benign Text: RealToxicityPrompts", "Following Harmful Instructions: AdvBench"]
+}
+
+def _wrap_link(value: str, url: str) -> str:
+    return f"<a href={url} target='_blank'>{value}</a>"
+
+TEXT_PER_CATEGORY = {
+    "Robustness and Predictability": f"We evaluate the model on state-of-the-art benchmarks that measure its robustness under various input alterations [{_wrap_link('1', 'https://aclanthology.org/2020.findings-emnlp.117/')}], and the level of consistency in its answers [{_wrap_link('2', 'https://arxiv.org/abs/2306.09983')}, {_wrap_link('3', 'https://arxiv.org/abs/2305.15852')}].",
+    "Cyberattack Resilience": f"We consider the concrete threats concerning just the LLM in isolation, focusing on its resilience to jailbreaks and prompt injection attacks [{_wrap_link('1', 'https://arxiv.org/abs/2311.01011')}, {_wrap_link('2', 'https://arxiv.org/abs/2311.04235')}, {_wrap_link('3', 'https://arxiv.org/abs/2312.02119')}].",
+    "Training Data Suitability": "We evaluate the adequacy of the dataset [1], aiming to assess the potential of an LLM trained on this data to exhibit toxic or discriminatory behavior.",
+    "No Copyright Infringement": "We check if the model can be made to directly regurgitate content that is subject to the copyright of a third person.",
+    "User Privacy Protection": "We focus on cases of user privacy violation by the LLM itself, evaluating the model’s ability to recover personal identifiable information that may have been included in the training data.",
+    "Capabilities, Performance, and Limitations": "To provide an overarching view, we assess the capabilities and limitations of the AI system by evaluating its performance on a wide range of tasks. We evaluate the model on widespread research benchmarks covering general knowledge [1], reasoning [2,3], truthfulness [4], and coding ability [5].",
+    "Interpretability": "The large body of machine learning interpretability research is often not easily applicable to large language models. While more work in this direction is needed, we use the existing easily-applicable methods to evaluate the model’s ability to reason about its own correctness [1], and the degree to which the probabilities it outputs can be interpreted [3,4].",
+    "Disclosure of AI": "We require the language model to consistently deny that it is a human.",
+    "Traceability": "We require the presence of language model watermarking [1,2], and evaluate its viability, combining several important requirements that such schemes must satisfy to be practical.",
+    "Representation — Absence of Bias": "We evaluate the tendency of the LLM to produce biased outputs, on three popular bias benchmarks [1,2,3].",
+    "Fairness — Absence of Discrimination": "We evaluate the model’s tendency to behave in a discriminatory way by comparing its behavior on different protected groups, using prominent fairness benchmarks [1,2].",
+    "Harmful Content and Toxicity": "We evaluate the models’ tendency to produce harmful or toxic content, leveraging two recent evaluation tools, RealToxicityPrompts and AdvBench [1,2]."
+}
+
+CATEGORIES_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": ["Robustness and Predictability", "Cyberattack Resilience"],
+    "Privacy & Data Governance": ["Training Data Suitability", "No Copyright Infringement", "User Privacy Protection"],
+    "Transparency": ["Capabilities, Performance, and Limitations", "Interpretability", "Disclosure of AI", "Traceability"],
+    "Diversity, Non-discrimination & Fairness": ["Representation — Absence of Bias", "Fairness — Absence of Discrimination"],
+    "Social & Environmental Well-being": ["Harmful Content and Toxicity"]
+}
+
+ICON_PER_PRINCIPLE = {
+    "Technical Robustness and Safety": "https://compl-ai.org/icon_technical_robustness_and_safety.svg",
+    "Privacy & Data Governance": "https://compl-ai.org/icon_privacy_and_data_governance.svg",
+    "Transparency": "https://compl-ai.org/icon_transparency.svg",
+    "Diversity, Non-discrimination & Fairness": "https://compl-ai.org/icon_diversity_fairness.svg",
+    "Social & Environmental Well-being": "https://compl-ai.org/icon_social_environmental.svg",
+}
+
+def generate_benchmarks(principle: str):
+    with gr.Row():
+        gr.HTML(f"""
+            <h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
+        """)
+
+    categories = CATEGORIES_PER_PRINCIPLE[principle]
+
+    with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
+        for category in categories:
+            with gr.Column():
+                gr.HTML(
+                    f"""
+                    <div style="padding: 10px 20px;">
+                        <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
+                        <p>{TEXT_PER_CATEGORY[category]}</p>
+                    </div>
+                    """
+                )
+
+    shown_columns = []
+    with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
+        for category in categories:
+            with gr.Column():
+                shown_column = gr.CheckboxGroup(
+                    show_label=False,
+                    choices=BENCHMARKS_PER_CATEGORY[category],
+                    value=BENCHMARKS_PER_CATEGORY[category],
+                    interactive=True,
+                    # elem_id="filter-columns-type",
+                )
+                shown_columns.append(shown_column)
+
+    with gr.Row():
+        df = update_principles_table(leaderboard_df, *shown_columns)
+        type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
+        datatypes = [type_per_column[name] for name in df.columns]
+        leaderboard_table = gr.components.Dataframe(
+            value=df,
+            headers=df.columns.tolist(),
+            datatype=datatypes,
+            elem_id="leaderboard-table",
+            interactive=False,
+            visible=True,
+        )
+
+    for shown_column in shown_columns:
+        shown_column.change(
+            fn=functools.partial(update_principles_table, leaderboard_df),
+            inputs=shown_columns,
+            outputs=leaderboard_table,
+            # queue=True,
+        )
+
+# Allows clicking on the full table column to trigger sorting
+custom_js = """
+function clickableTableHeaders() {
+    document.querySelectorAll(".table > thead > tr > th").forEach(th => {
+        th.addEventListener("click", () => {
+            const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+            if (sortButton) {
+                sortButton.click(); // Triggers the click event on the "sort-button" element
+            }
+        });
+    });
+
+    // Select all elements with the .table class
+    const tableElements = document.querySelectorAll('.table');
+
+    // Callback function to execute when mutations are observed
+    const mutationCallback = (mutationsList) => {
+        mutationsList.forEach((mutation) => {
+            if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
+                mutation.target.addEventListener("click", () => {
+                    const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
+                    if (sortButton) {
+                        sortButton.click(); // Triggers the click event on the "sort-button" element
+                    }
+                });
+            }
+        });
+    };
+
+    // Options for the observer (which mutations to observe)
+    const observerOptions = {
+        childList: true, // Watch for additions/removals of child nodes
+        subtree: true // Watch for changes in descendants as well
+    };
+
+    // Create an instance of MutationObserver and pass in the callback function
+    const observer = new MutationObserver(mutationCallback);
+
+    // Observe each .table element
+    tableElements.forEach((tableElement) => {
+        observer.observe(tableElement, observerOptions);
+    });
+}
+"""
+
+demo = gr.Blocks(
+    css=custom_css,
+    theme=gr.themes.Default(
+        font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
+    ),
+    js=custom_js,
+)
+
+with demo:
+    gr.HTML(TITLE)
+
+    with gr.Row(elem_id="intro"):
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
+                <p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
+                <br/>
+                <a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
+                """
+            )
+        with gr.Column(scale=5):
+            gr.HTML(
+                """
+                <h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
+                <p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
+                <br/>
+                <a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
+                """
+            )
+        with gr.Column(scale=1, min_width=20, elem_classes="empty"):
+            pass
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
+
+            for principle in CATEGORIES_PER_PRINCIPLE.keys():
+                generate_benchmarks(principle)
+
+            ###
+
+            # with gr.Row():
+            #     shown_columns = gr.CheckboxGroup(
+            #         choices=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if not c.hidden and not c.never_hidden and not c.dummy
+            #         ],
+            #         value=[
+            #             c.name
+            #             for c in fields(AutoEvalColumn)
+            #             if c.displayed_by_default and not c.hidden and not c.never_hidden
+            #         ],
+            #         label="Select columns to show",
+            #         elem_id="column-select",
+            #         interactive=True,
+            #     )
+            #
+            # with gr.Row():
+            #     # with gr.Box(elem_id="box-filter"):
+            #     filter_columns_type = gr.CheckboxGroup(
+            #         label="Model types",
+            #         choices=[t.to_str() for t in ModelType],
+            #         value=[t.to_str() for t in ModelType],
+            #         interactive=True,
+            #         elem_id="filter-columns-type",
+            #     )
+            #
+            # with gr.Row():
+            #     search_bar = gr.Textbox(
+            #         placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+            #         show_label=False,
+            #         elem_id="search-bar",
+            #     )
+            # # x = gr.Checkbox(show_label=False, label="foo")
+            #
+            # with gr.Row():
+            #     # print(shown_columns.value)
+            #     leaderboard_table = gr.components.Dataframe(
+            #         value=leaderboard_df[
+            #             [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+            #             + shown_columns.value
+            #         ],
+            #         headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+            #         datatype=TYPES,
+            #         elem_id="leaderboard-table",
+            #         interactive=False,
+            #         visible=True,
+            #         # column_widths=["2%", "30%", "10%", "10%", "12%"]
+            #     )
+            #
+            # # Dummy leaderboard for handling the case when the user uses backspace key
+            # hidden_leaderboard_table_for_search = gr.components.Dataframe(
+            #     value=original_df[COLS],
+            #     headers=COLS,
+            #     datatype=TYPES,
+            #     visible=False,
+            # )
+            # search_bar.submit(
+            #     update_table,
+            #     [
+            #         hidden_leaderboard_table_for_search,
+            #         shown_columns,
+            #         filter_columns_type,
+            #         # filter_columns_precision,
+            #         # filter_columns_size,
+            #         search_bar,
+            #     ],
+            #     leaderboard_table,
+            # )
+            # for selector in [shown_columns, filter_columns_type,
+            # ]:
+            #     selector.change(
+            #         update_table,
+            #         [
+            #             hidden_leaderboard_table_for_search,
+            #             shown_columns,
+            #             filter_columns_type,
+            #             # filter_columns_precision,
+            #             # filter_columns_size,
+            #             # deleted_models_visibility,
+            #             search_bar,
+            #         ],
+            #         leaderboard_table,
+            #         queue=True,
+            #     )
+
+        with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -309,22 +503,24 @@ with demo:
             ],
             submission_result,
         )
-        with gr.Row():
-            with gr.Accordion("📖 FAQ", open=False):
-                with gr.Column(min_width=250):
-                    gr.Markdown("""
-                    #### What does N/A score mean?
-
-                    An N/A score means that it was not possible to evaluate the benchmark for a given model.
-
-
-                    - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
-                    - The model API refuses to provide any answer,
-                    - We do not have access to the training data.
-
-                    """)
+
+        with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
+
+            with gr.Row():
+                # with gr.Accordion("📖 FAQ", open=True):
+                #     with gr.Column(min_width=250):
+                gr.Markdown("""
+                #### What does N/A score mean?
+
+                An N/A score means that it was not possible to evaluate the benchmark for a given model.
+
+                This can happen for multiple reasons, such as:
+
+                - The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
+                - The model API refuses to provide any answer,
+                - We do not have access to the training data. """
+                )
+
         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
                 citation_button = gr.Textbox(
@@ -335,7 +531,7 @@ with demo:
             show_copy_button=True,
         )
 
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
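The new Results tab wires every CheckboxGroup's change event to the table through functools.partial, which pre-binds the full leaderboard DataFrame so Gradio only has to pass the current checkbox values. A minimal self-contained sketch of that pattern; the toy data and component names are placeholders, not the Space's real columns:

```python
import functools

import gradio as gr
import pandas as pd

df = pd.DataFrame({"Model": ["A", "B"], "Score": [0.9, 0.7]})

def filter_cols(source: pd.DataFrame, *selected) -> pd.DataFrame:
    # Keep "Model" plus whatever columns are still checked.
    cols = ["Model"] + [c for group in selected for c in group]
    return source[cols]

with gr.Blocks() as demo:
    picker = gr.CheckboxGroup(choices=["Score"], value=["Score"])
    table = gr.Dataframe(value=df)
    # partial() bakes in the source frame; Gradio supplies the checkbox values.
    picker.change(
        fn=functools.partial(filter_cols, df),
        inputs=[picker],
        outputs=table,
    )

# demo.launch()  # uncomment to run locally
```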
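generate_benchmarks indexes ICON_PER_PRINCIPLE, CATEGORIES_PER_PRINCIPLE, TEXT_PER_CATEGORY, and BENCHMARKS_PER_CATEGORY by string keys, so the four tables must stay in sync or the app raises KeyError at startup. A small sanity check one could run against them; this helper is a suggestion, not part of the commit:

```python
def check_mappings(benchmarks, texts, categories_per_principle, icons):
    """Assert that every principle has an icon and every category has
    both a description and a benchmark list."""
    for principle, categories in categories_per_principle.items():
        assert principle in icons, f"missing icon for {principle!r}"
        for category in categories:
            assert category in benchmarks, f"no benchmarks for {category!r}"
            assert category in texts, f"no description for {category!r}"

# check_mappings(BENCHMARKS_PER_CATEGORY, TEXT_PER_CATEGORY,
#                CATEGORIES_PER_PRINCIPLE, ICON_PER_PRINCIPLE)
```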
+
}
|
| 211 |
+
|
| 212 |
+
def generate_benchmarks(principle: str):
|
| 213 |
+
with gr.Row():
|
| 214 |
+
gr.HTML(f"""
|
| 215 |
+
<h3 class="image_header principle_header"><img src="{ICON_PER_PRINCIPLE[principle]}" class="principle_icon"/>EU AI Act Principle: {principle}</h3>
|
| 216 |
+
""")
|
| 217 |
|
| 218 |
+
categories = CATEGORIES_PER_PRINCIPLE[principle]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
+
with gr.Row(elem_classes=["technical_requirements", "border_mid"]):
|
| 221 |
+
for category in categories:
|
| 222 |
+
with gr.Column():
|
| 223 |
+
gr.HTML(
|
| 224 |
+
f"""
|
| 225 |
+
<div style="padding: 10px 20px;">
|
| 226 |
+
<h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />{category}</h3>
|
| 227 |
+
<p>{TEXT_PER_CATEGORY[category]}</p>
|
| 228 |
+
</div>
|
| 229 |
+
"""
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
shown_columns = []
|
| 233 |
+
with gr.Row(elem_classes=["technical_requirements", "border_bot"]):
|
| 234 |
+
for category in categories:
|
| 235 |
+
with gr.Column():
|
| 236 |
+
shown_column = gr.CheckboxGroup(
|
| 237 |
+
show_label=False,
|
| 238 |
+
choices=BENCHMARKS_PER_CATEGORY[category],
|
| 239 |
+
value=BENCHMARKS_PER_CATEGORY[category],
|
| 240 |
+
interactive=True,
|
| 241 |
+
# elem_id="filter-columns-type",
|
| 242 |
+
)
|
| 243 |
+
shown_columns.append(shown_column)
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
with gr.Row():
|
| 247 |
+
df = update_principles_table(leaderboard_df, *shown_columns)
|
| 248 |
+
type_per_column = {c.name: c.type for c in fields(AutoEvalColumn)}
|
| 249 |
+
datatypes = [type_per_column[name] for name in df.columns]
|
| 250 |
+
leaderboard_table = gr.components.Dataframe(
|
| 251 |
+
value=df,
|
| 252 |
+
headers=df.columns.tolist(),
|
| 253 |
+
datatype=datatypes,
|
| 254 |
+
elem_id="leaderboard-table",
|
| 255 |
+
interactive=False,
|
| 256 |
+
visible=True,
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
for shown_column in shown_columns:
|
| 260 |
+
shown_column.change(
|
| 261 |
+
fn=functools.partial(update_principles_table, leaderboard_df),
|
| 262 |
+
inputs=shown_columns,
|
| 263 |
+
outputs=leaderboard_table,
|
| 264 |
+
# queue=True,
|
| 265 |
)
|
| 266 |
|
| 267 |
+
# Allows clicking on the full table column to trigger sorting
|
| 268 |
+
custom_js = """
|
| 269 |
+
function clickableTableHeaders() {
|
| 270 |
+
document.querySelectorAll(".table > thead > tr > th").forEach(th => {
|
| 271 |
+
th.addEventListener("click", () => {
|
| 272 |
+
const sortButton = th.querySelector(".sort-button"); // Selects the first child with class "sort-button"
|
| 273 |
+
if (sortButton) {
|
| 274 |
+
sortButton.click(); // Triggers the click event on the "sort-button" element
|
| 275 |
+
}
|
| 276 |
+
});
|
| 277 |
+
});
|
| 278 |
+
|
| 279 |
+
// Select all elements with the .table class
|
| 280 |
+
const tableElements = document.querySelectorAll('.table');
|
| 281 |
+
|
| 282 |
+
// Callback function to execute when mutations are observed
|
| 283 |
+
const mutationCallback = (mutationsList) => {
|
| 284 |
+
mutationsList.forEach((mutation) => {
|
| 285 |
+
if (mutation.target.nodeName == "TH" && mutation.addedNodes.length > 0) {
|
| 286 |
+
mutation.target.addEventListener("click", () => {
|
| 287 |
+
const sortButton = mutation.target.querySelector(".sort-button"); // Selects the first child with class "sort-button"
|
| 288 |
+
if (sortButton) {
|
| 289 |
+
sortButton.click(); // Triggers the click event on the "sort-button" element
|
| 290 |
+
}
|
| 291 |
+
});
|
| 292 |
+
}
|
| 293 |
+
});
|
| 294 |
+
};
|
| 295 |
+
|
| 296 |
+
// Options for the observer (which mutations to observe)
|
| 297 |
+
const observerOptions = {
|
| 298 |
+
childList: true, // Watch for additions/removals of child nodes
|
| 299 |
+
subtree: true // Watch for changes in descendants as well
|
| 300 |
+
};
|
| 301 |
+
|
| 302 |
+
// Create an instance of MutationObserver and pass in the callback function
|
| 303 |
+
const observer = new MutationObserver(mutationCallback);
|
| 304 |
+
|
| 305 |
+
// Observe each .table element
|
| 306 |
+
tableElements.forEach((tableElement) => {
|
| 307 |
+
observer.observe(tableElement, observerOptions);
|
| 308 |
+
});
|
| 309 |
+
}
|
| 310 |
+
"""
|
| 311 |
+
|
| 312 |
+
demo = gr.Blocks(
|
| 313 |
+
css=custom_css,
|
| 314 |
+
theme=gr.themes.Default(
|
| 315 |
+
font=gr.themes.GoogleFont("Open Sans", weights=(400, 500, 600))
|
| 316 |
+
),
|
| 317 |
+
js=custom_js,
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
with demo:
|
| 321 |
+
gr.HTML(TITLE)
|
| 322 |
+
|
| 323 |
+
with gr.Row(elem_id="intro"):
|
| 324 |
+
with gr.Column(scale=1, min_width=20, elem_classes="empty"):
|
| 325 |
+
pass
|
| 326 |
+
with gr.Column(scale=5):
|
| 327 |
+
gr.HTML(
|
| 328 |
+
"""
|
| 329 |
+
<h3 class="image_header"><img src="https://compl-ai.org/hex.svg" style="max-height:24px;" />Technical Interpretation of the EU AI Act</h3>
|
| 330 |
+
<p>We have interpreted the high-level regulatory requirements of the EU AI Act as concrete technical requirements. We further group requirements within six EU AI Act principles and label them as GPAI, GPAI+SR (Systemic Risk), and HR (High-Risk).</p>
|
| 331 |
+
<br/>
|
| 332 |
+
<a href="https://compl-ai.org/interpretation" class="button" target="_blank">Explore the Interpretation</a>
|
| 333 |
+
"""
|
| 334 |
)
|
| 335 |
+
with gr.Column(scale=5):
|
| 336 |
+
gr.HTML(
|
| 337 |
+
"""
|
| 338 |
+
<h3 class="image_header"><img src="https://compl-ai.org/checkmark.png" style="max-height:24px;" />Open-Source Benchmarking Suite</h3>
|
| 339 |
+
<p>The framework includes the ability to evaluate the technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open to community contributions.</p>
|
| 340 |
+
<br/>
|
| 341 |
+
<a href="https://github.com/compl-ai/compl-ai" class="button" target="_blank"><img src="https://compl-ai.org/icons/github-mark.svg" class="github_icon">GitHub Repo</a>
|
| 342 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 343 |
)
|
| 344 |
+
with gr.Column(scale=1, min_width=20, elem_classes="empty"):
|
| 345 |
+
pass
|
| 346 |
+
|
| 347 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 348 |
+
with gr.TabItem("🏅 Results", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
+
for principle in CATEGORIES_PER_PRINCIPLE.keys():
|
| 351 |
+
generate_benchmarks(principle)
|
| 352 |
+
|
| 353 |
+
###
|
| 354 |
+
|
| 355 |
+
# with gr.Row():
|
| 356 |
+
# shown_columns = gr.CheckboxGroup(
|
| 357 |
+
# choices=[
|
| 358 |
+
# c.name
|
| 359 |
+
# for c in fields(AutoEvalColumn)
|
| 360 |
+
# if not c.hidden and not c.never_hidden and not c.dummy
|
| 361 |
+
# ],
|
| 362 |
+
# value=[
|
| 363 |
+
# c.name
|
| 364 |
+
# for c in fields(AutoEvalColumn)
|
| 365 |
+
# if c.displayed_by_default and not c.hidden and not c.never_hidden
|
| 366 |
+
# ],
|
| 367 |
+
# label="Select columns to show",
|
| 368 |
+
# elem_id="column-select",
|
| 369 |
+
# interactive=True,
|
| 370 |
+
# )
|
| 371 |
+
#
|
| 372 |
+
# with gr.Row():
|
| 373 |
+
# # with gr.Box(elem_id="box-filter"):
|
| 374 |
+
# filter_columns_type = gr.CheckboxGroup(
|
| 375 |
+
# label="Model types",
|
| 376 |
+
# choices=[t.to_str() for t in ModelType],
|
| 377 |
+
# value=[t.to_str() for t in ModelType],
|
| 378 |
+
# interactive=True,
|
| 379 |
+
# elem_id="filter-columns-type",
|
| 380 |
+
# )
|
| 381 |
+
#
|
| 382 |
+
# with gr.Row():
|
| 383 |
+
# search_bar = gr.Textbox(
|
| 384 |
+
# placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
| 385 |
+
# show_label=False,
|
| 386 |
+
# elem_id="search-bar",
|
| 387 |
+
# )
|
| 388 |
+
# # x = gr.Checkbox(show_label=False, label="foo")
|
| 389 |
+
#
|
| 390 |
+
# with gr.Row():
|
| 391 |
+
# # print(shown_columns.value)
|
| 392 |
+
# leaderboard_table = gr.components.Dataframe(
|
| 393 |
+
# value=leaderboard_df[
|
| 394 |
+
# [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
| 395 |
+
# + shown_columns.value
|
| 396 |
+
# ],
|
| 397 |
+
# headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
| 398 |
+
# datatype=TYPES,
|
| 399 |
+
# elem_id="leaderboard-table",
|
| 400 |
+
# interactive=False,
|
| 401 |
+
# visible=True,
|
| 402 |
+
# # column_widths=["2%", "30%", "10%", "10%", "12%"]
|
| 403 |
+
# )
|
| 404 |
+
#
|
| 405 |
+
# # Dummy leaderboard for handling the case when the user uses backspace key
|
| 406 |
+
# hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
| 407 |
+
# value=original_df[COLS],
|
| 408 |
+
# headers=COLS,
|
| 409 |
+
# datatype=TYPES,
|
| 410 |
+
# visible=False,
|
| 411 |
+
# )
|
| 412 |
+
# search_bar.submit(
|
| 413 |
+
# update_table,
|
| 414 |
+
# [
|
| 415 |
+
# hidden_leaderboard_table_for_search,
|
| 416 |
+
# shown_columns,
|
| 417 |
+
# filter_columns_type,
|
| 418 |
+
# # filter_columns_precision,
|
| 419 |
+
# # filter_columns_size,
|
| 420 |
+
# search_bar,
|
| 421 |
+
# ],
|
| 422 |
+
# leaderboard_table,
|
| 423 |
+
# )
|
| 424 |
+
# for selector in [shown_columns, filter_columns_type,
|
| 425 |
+
# ]:
|
| 426 |
+
# selector.change(
|
| 427 |
+
# update_table,
|
| 428 |
+
# [
|
| 429 |
+
# hidden_leaderboard_table_for_search,
|
| 430 |
+
# shown_columns,
|
| 431 |
+
# filter_columns_type,
|
| 432 |
+
# # filter_columns_precision,
|
| 433 |
+
# # filter_columns_size,
|
| 434 |
+
# # deleted_models_visibility,
|
| 435 |
+
# search_bar,
|
| 436 |
+
# ],
|
| 437 |
+
# leaderboard_table,
|
| 438 |
+
# queue=True,
|
| 439 |
+
# )
|
| 440 |
+
|
| 441 |
+
with gr.TabItem("🚀 Request Evaluation ", elem_id="llm-benchmark-tab-table", id=3):
|
| 442 |
with gr.Column():
|
| 443 |
with gr.Row():
|
| 444 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
|
|
|
| 503 |
],
|
| 504 |
submission_result,
|
| 505 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
+
with gr.TabItem("📖 FAQ ", elem_id="llm-benchmark-tab-table", id=4):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
+
with gr.Row():
|
| 510 |
+
# with gr.Accordion("📖 FAQ", open=True):
|
| 511 |
+
# with gr.Column(min_width=250):
|
| 512 |
+
gr.Markdown("""
|
| 513 |
+
#### What does N/A score mean?
|
| 514 |
+
|
| 515 |
+
An N/A score means that it was not possible to evaluate the benchmark for a given model.
|
| 516 |
+
|
| 517 |
+
This can happen for multiple reasons, such as:
|
| 518 |
+
|
| 519 |
+
- The benchmark requires access to model logits, but the model API doesn't provide them (or only provides them for specific strings),
|
| 520 |
+
- The model API refuses to provide any answer,
|
| 521 |
+
- We do not have access to the training data. """
|
| 522 |
+
)
|
| 523 |
|
|
|
|
| 524 |
with gr.Row():
|
| 525 |
with gr.Accordion("📙 Citation", open=False):
|
| 526 |
citation_button = gr.Textbox(
|
|
|
|
| 531 |
show_copy_button=True,
|
| 532 |
)
|
| 533 |
|
| 534 |
+
# scheduler = BackgroundScheduler()
|
| 535 |
+
# scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 536 |
+
# scheduler.start()
|
| 537 |
demo.queue(default_concurrency_limit=40).launch()
|
requirements.txt CHANGED

@@ -2,8 +2,8 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==
-gradio_client==
+gradio==5.4.0
+gradio_client==1.4.2
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
results/01-ai/Yi-34B-Chat.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "01-ai/Yi-34B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Yi-34B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9307063195147172},
    "bold": {"aggregate_score": 0.6834721511354611},
    "toxicity": {"aggregate_score": 0.921922202494338},
    "toxicity_advbench": {"aggregate_score": 0.9927596254738791},
    "forecasting_consistency": {"aggregate_score": 0.6688888888888889},
    "self_check_consistency": {"aggregate_score": 0.8031496062992126},
    "boolq_contrast_robustness": {"aggregate_score": 0.5666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.84},
    "calibration_big_bench": {"aggregate_score": 0.89125},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8039534592768672},
    "decoding_trust": {"aggregate_score": 1.0},
    "hellaswag": {"aggregate_score": 0.8403704441346346},
    "human_eval": {"aggregate_score": 0.32298136645962733},
    "instruction_goal_hijacking": {"aggregate_score": 0.538659793814433},
    "multiturn_goal_hijacking": {"aggregate_score": 0.5828559926386013},
    "reddit_bias": {"aggregate_score": 0.6196885060946251},
    "truthful_qa_mc2": {"aggregate_score": 0.5538416743593192},
    "mmlu": {"aggregate_score": 0.7496795328300812},
    "ai2_reasoning": {"aggregate_score": 0.6544368600682594},
    "human_deception": {"aggregate_score": 0.3561643835616438},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.23216608444613182},
    "mmlu_robustness": {"aggregate_score": 0.7225454545454546},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
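Each added result file has the same shape: a config block identifying the model and a flat results map from benchmark name to aggregate_score, with null marking benchmarks that could not be evaluated (see the FAQ tab added above). A sketch of reading one of these files and printing null as N/A; load_scores and fmt are illustrative helpers, not code from this repo:

```python
import json

def load_scores(path: str) -> dict:
    """Map benchmark name -> aggregate_score (None where evaluation failed)."""
    with open(path) as fh:
        report = json.load(fh)
    return {name: entry["aggregate_score"]
            for name, entry in report["results"].items()}

def fmt(score) -> str:
    return "N/A" if score is None else f"{score:.3f}"

scores = load_scores("results/01-ai/Yi-34B-Chat.json")
print(fmt(scores["bbq"]))           # -> 0.931
print(fmt(scores["watermarking"]))  # -> N/A
```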
results/Claude3Opus/result_Claude3Opus.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "Claude3Opus",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/claude-3-opus-20240229"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9660153175746249},
    "bold": {"aggregate_score": 0.757401370038459},
    "toxicity": {"aggregate_score": null},
    "toxicity_advbench": {"aggregate_score": 0.9932693207159621},
    "forecasting_consistency": {"aggregate_score": 0.7785714285714285},
    "self_check_consistency": {"aggregate_score": 0.8484107579462102},
    "boolq_contrast_robustness": {"aggregate_score": null},
    "imdb_contrast_robustness": {"aggregate_score": null},
    "calibration_big_bench": {"aggregate_score": null},
    "calibration_big_bench_i_know": {"aggregate_score": null},
    "decoding_trust": {"aggregate_score": 0.825},
    "hellaswag": {"aggregate_score": 0.954},
    "human_eval": {"aggregate_score": 0.848},
    "instruction_goal_hijacking": {"aggregate_score": 0.8402061855670103},
    "multiturn_goal_hijacking": {"aggregate_score": 0.7557223372440763},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.868},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 1.0},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.18613375956573336},
    "mmlu_robustness": {"aggregate_score": null},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/Qwen/Qwen1.5-72B-Chat.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "Qwen/Qwen1.5-72B-Chat",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Qwen1.5-72B-Chat"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9599664585022551},
    "bold": {"aggregate_score": 0.7200614975210675},
    "toxicity": {"aggregate_score": 0.9695524579515152},
    "toxicity_advbench": {"aggregate_score": 0.9976900071559403},
    "forecasting_consistency": {"aggregate_score": 0.6695121951219511},
    "self_check_consistency": {"aggregate_score": 0.8449367088607596},
    "boolq_contrast_robustness": {"aggregate_score": 0.8},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.78625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.42761166020267727},
    "decoding_trust": {"aggregate_score": 0.6800000000000002},
    "hellaswag": {"aggregate_score": 0.8662617008564031},
    "human_eval": {"aggregate_score": 0.5726708074534161},
    "instruction_goal_hijacking": {"aggregate_score": 0.45360824742268047},
    "multiturn_goal_hijacking": {"aggregate_score": 0.48560501495284103},
    "reddit_bias": {"aggregate_score": 0.8261042236875162},
    "truthful_qa_mc2": {"aggregate_score": 0.6389413688457826},
    "mmlu": {"aggregate_score": 0.7768836348098561},
    "ai2_reasoning": {"aggregate_score": 0.6825938566552902},
    "human_deception": {"aggregate_score": 0.726027397260274},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.06008018267012608},
    "mmlu_robustness": {"aggregate_score": 0.7525454545454544},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/google/gemma-2-9b.json ADDED

@@ -0,0 +1,81 @@
{
  "config": {
    "model_name": "google/gemma-2-9b",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Google-gemma-2-9b"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9839603777871013},
    "bold": {"aggregate_score": 0.7370530984096585},
    "toxicity": {"aggregate_score": 0.8414627564046196},
    "toxicity_advbench": {"aggregate_score": 0.9254440618284906},
    "forecasting_consistency": {"aggregate_score": 0.46973684210526323},
    "boolq_contrast_robustness": {"aggregate_score": 0.5833333333333334},
    "imdb_contrast_robustness": {"aggregate_score": 0.51},
    "calibration_big_bench": {"aggregate_score": 0.80375},
    "calibration_big_bench_i_know": {"aggregate_score": 0.013649443262854999},
    "decoding_trust": {"aggregate_score": 0.94},
    "hellaswag": {"aggregate_score": 0.35590519816769567},
    "human_eval": {"aggregate_score": 0.0},
    "instruction_goal_hijacking": {"aggregate_score": 0.4858247422680413},
    "multiturn_goal_hijacking": {"aggregate_score": 0.499626178974005},
    "reddit_bias": {"aggregate_score": 0.9823398039376345},
    "truthful_qa_mc2": {"aggregate_score": 0.4643375169376196},
    "mmlu": {"aggregate_score": 0.26356644352656317},
    "ai2_reasoning": {"aggregate_score": 0.2295221843003413},
    "human_deception": {"aggregate_score": 0.9863013698630136},
    "memorization": {"aggregate_score": 0.996},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.0},
    "mmlu_robustness": {"aggregate_score": 0.26527272727272727},
    "training_data_suitability": {"aggregate_score": null}
  }
}
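Note that google/gemma-2-9b.json adds 81 lines rather than 87: the self_check_consistency and watermarking entries are absent altogether instead of null. Consumers should therefore treat a missing key the same as a null score; a defensive accessor sketch (an assumption about how src/leaderboard/read_evals.py might handle it, not code from this diff):

```python
def aggregate_score(report: dict, benchmark: str):
    """Return a benchmark's score, treating an absent entry like a null one."""
    entry = report.get("results", {}).get(benchmark)
    return None if entry is None else entry.get("aggregate_score")
```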
results/gpt-3.5-turbo-0125/result_gpt-3.5-turbo-0125.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "gpt-3.5-turbo-0125",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-3.5-turbo-0125"
  },
  "results": {
    "bbq": {"aggregate_score": 0.8798721047562755},
    "bold": {"aggregate_score": 0.7320262917581274},
    "toxicity": {"aggregate_score": 0.9388454625027057},
    "toxicity_advbench": {"aggregate_score": 0.9905250918854458},
    "forecasting_consistency": {"aggregate_score": 0.6739999999999999},
    "self_check_consistency": {"aggregate_score": 0.8208955223880597},
    "boolq_contrast_robustness": {"aggregate_score": 0.65},
    "imdb_contrast_robustness": {"aggregate_score": 0.545},
    "calibration_big_bench": {"aggregate_score": 0.95625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9075691229826097},
    "decoding_trust": {"aggregate_score": 0.835},
    "hellaswag": {"aggregate_score": 0.855},
    "human_eval": {"aggregate_score": 0.762},
    "instruction_goal_hijacking": {"aggregate_score": null},
    "multiturn_goal_hijacking": {"aggregate_score": 0.6549516908212559},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.6811707733941034},
    "ai2_reasoning": {"aggregate_score": 0.930976430976431},
    "human_deception": {"aggregate_score": 0.589041095890411},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.07798395569162107},
    "mmlu_robustness": {"aggregate_score": 0.6952727272727273},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/gpt-4-1106-preview/result_gpt-4-1106-preview.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "gpt-4-1106-preview",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/gpt-4-1106-preview"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9825415205108432},
    "bold": {"aggregate_score": 0.7385998936629464},
    "toxicity": {"aggregate_score": 0.969292480968859},
    "toxicity_advbench": {"aggregate_score": 0.996174184883085},
    "forecasting_consistency": {"aggregate_score": 0.7755813953488372},
    "self_check_consistency": {"aggregate_score": 0.8656716417910448},
    "boolq_contrast_robustness": {"aggregate_score": 0.8666666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.985},
    "calibration_big_bench": {"aggregate_score": 1.0},
    "calibration_big_bench_i_know": {"aggregate_score": 0.9538095833854623},
    "decoding_trust": {"aggregate_score": 0.8799999999999999},
    "hellaswag": {"aggregate_score": 0.953},
    "human_eval": {"aggregate_score": 0.8370000000000001},
    "instruction_goal_hijacking": {"aggregate_score": 0.6572164948453608},
    "multiturn_goal_hijacking": {"aggregate_score": 0.882740970784449},
    "reddit_bias": {"aggregate_score": null},
    "truthful_qa_mc2": {"aggregate_score": null},
    "mmlu": {"aggregate_score": 0.805},
    "ai2_reasoning": {"aggregate_score": 0.964},
    "human_deception": {"aggregate_score": 0.9726027397260274},
    "memorization": {"aggregate_score": 0.998},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.12940239570245562},
    "mmlu_robustness": {"aggregate_score": 0.8049090909090908},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-13b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-13b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-13b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9296858841579962},
    "bold": {"aggregate_score": 0.7190075716540155},
    "toxicity": {"aggregate_score": 0.963705740028952},
    "toxicity_advbench": {"aggregate_score": 0.9990298427566393},
    "forecasting_consistency": {"aggregate_score": 0.5722222222222222},
    "self_check_consistency": {"aggregate_score": 0.7451923076923077},
    "boolq_contrast_robustness": {"aggregate_score": 0.25},
    "imdb_contrast_robustness": {"aggregate_score": 0.4},
    "calibration_big_bench": {"aggregate_score": 0.84625},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7748780182659827},
    "decoding_trust": {"aggregate_score": 0.91},
    "hellaswag": {"aggregate_score": 0.8240390360485959},
    "human_eval": {"aggregate_score": 0.20621118012422357},
    "instruction_goal_hijacking": {"aggregate_score": 0.41752577319587625},
    "multiturn_goal_hijacking": {"aggregate_score": 0.3651598803772717},
    "reddit_bias": {"aggregate_score": 0.7413109757904534},
    "truthful_qa_mc2": {"aggregate_score": 0.43962155328662317},
    "mmlu": {"aggregate_score": 0.5351089588377724},
    "ai2_reasoning": {"aggregate_score": 0.5938566552901023},
    "human_deception": {"aggregate_score": 1.0},
    "memorization": {"aggregate_score": 0.989},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.14127747715247715},
    "mmlu_robustness": {"aggregate_score": 0.5076363636363636},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-70b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-70b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-70b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9498141943726044},
    "bold": {"aggregate_score": 0.7252454731129678},
    "toxicity": {"aggregate_score": 0.9409038541427155},
    "toxicity_advbench": {"aggregate_score": 0.9988042732774585},
    "forecasting_consistency": {"aggregate_score": 0.7271428571428571},
    "self_check_consistency": {"aggregate_score": 0.7466666666666666},
    "boolq_contrast_robustness": {"aggregate_score": 0.7166666666666667},
    "imdb_contrast_robustness": {"aggregate_score": 0.42},
    "calibration_big_bench": {"aggregate_score": 0.8825},
    "calibration_big_bench_i_know": {"aggregate_score": 0.8323999999999999},
    "decoding_trust": {"aggregate_score": 0.85},
    "hellaswag": {"aggregate_score": 0.8578968333001394},
    "human_eval": {"aggregate_score": 0.31242236024844716},
    "instruction_goal_hijacking": {"aggregate_score": 0.4278350515463918},
    "multiturn_goal_hijacking": {"aggregate_score": 0.38218886588451806},
    "reddit_bias": {"aggregate_score": 0.352451665752182},
    "truthful_qa_mc2": {"aggregate_score": 0.5276263233531145},
    "mmlu": {"aggregate_score": 0.6320324740065518},
    "ai2_reasoning": {"aggregate_score": 0.6476109215017065},
    "human_deception": {"aggregate_score": 0.8904109589041096},
    "memorization": {"aggregate_score": 0.988},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.41367733201077794},
    "mmlu_robustness": {"aggregate_score": 0.6018181818181818},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/meta-llama/Llama-2-7b-chat-hf.json ADDED

@@ -0,0 +1,87 @@
{
  "config": {
    "model_name": "meta-llama/Llama-2-7b-chat-hf",
    "model_sha": "main",
    "model_report": "https://compl-ai.org/evaluations/Llama-2-7b-chat-hf"
  },
  "results": {
    "bbq": {"aggregate_score": 0.9548491021218211},
    "bold": {"aggregate_score": 0.6798469370104644},
    "toxicity": {"aggregate_score": 0.9646512368479587},
    "toxicity_advbench": {"aggregate_score": 0.9990993229758505},
    "forecasting_consistency": {"aggregate_score": 0.6043478260869565},
    "self_check_consistency": {"aggregate_score": 0.6666666666666667},
    "boolq_contrast_robustness": {"aggregate_score": 0.2833333333333333},
    "imdb_contrast_robustness": {"aggregate_score": 0.48},
    "calibration_big_bench": {"aggregate_score": 0.865},
    "calibration_big_bench_i_know": {"aggregate_score": 0.7370949580883273},
    "decoding_trust": {"aggregate_score": 0.9299999999999999},
    "hellaswag": {"aggregate_score": 0.7868950408285202},
    "human_eval": {"aggregate_score": 0.14596273291925466},
    "instruction_goal_hijacking": {"aggregate_score": 0.5141752577319587},
    "multiturn_goal_hijacking": {"aggregate_score": 0.26991028295376124},
    "reddit_bias": {"aggregate_score": 0.32916275270076745},
    "truthful_qa_mc2": {"aggregate_score": 0.4532195083848858},
    "mmlu": {"aggregate_score": 0.4721549636803874},
    "ai2_reasoning": {"aggregate_score": 0.5452218430034129},
    "human_deception": {"aggregate_score": 0.9315068493150684},
    "memorization": {"aggregate_score": 0.992},
    "privacy": {"aggregate_score": 1.0},
    "fairllm": {"aggregate_score": 0.08752953475948229},
    "mmlu_robustness": {"aggregate_score": 0.44909090909090915},
    "training_data_suitability": {"aggregate_score": null},
    "watermarking": {"aggregate_score": null}
  }
}
results/mistralai/Mistral-7B-Instruct-v0.2.json
ADDED
@@ -0,0 +1,87 @@
+{
+  "config": {
+    "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.2"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.9040460470649646
+    },
+    "bold": {
+      "aggregate_score": 0.747184860969805
+    },
+    "toxicity": {
+      "aggregate_score": 0.9607862783311598
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9940165055485872
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.5797872340425532
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.6974789915966386
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.2833333333333333
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.12
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.93375
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.6862379582134368
+    },
+    "decoding_trust": {
+      "aggregate_score": 1.0
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8477394941246763
+    },
+    "human_eval": {
+      "aggregate_score": 0.4018633540372671
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.3118556701030928
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.23205659075224297
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.659153609956636
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.6681153383230657
+    },
+    "mmlu": {
+      "aggregate_score": 0.5910838911835921
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.6407849829351536
+    },
+    "human_deception": {
+      "aggregate_score": 0.9863013698630136
+    },
+    "memorization": {
+      "aggregate_score": 0.988
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.019112659939765738
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.5776363636363636
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    },
+    "watermarking": {
+      "aggregate_score": null
+    }
+  }
+}
results/mistralai/Mixtral-8x7B-Instruct-v0.1.json
ADDED
@@ -0,0 +1,87 @@
+{
+  "config": {
+    "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "model_sha": "main",
+    "model_report": "https://compl-ai.org/evaluations/Mixtral-8x7B-Instruct-v0.1"
+  },
+  "results": {
+    "bbq": {
+      "aggregate_score": 0.949723812859033
+    },
+    "bold": {
+      "aggregate_score": 0.734901669263283
+    },
+    "toxicity": {
+      "aggregate_score": 0.9581340294360416
+    },
+    "toxicity_advbench": {
+      "aggregate_score": 0.9932040768299395
+    },
+    "forecasting_consistency": {
+      "aggregate_score": 0.6377777777777777
+    },
+    "self_check_consistency": {
+      "aggregate_score": 0.7877094972067039
+    },
+    "boolq_contrast_robustness": {
+      "aggregate_score": 0.35
+    },
+    "imdb_contrast_robustness": {
+      "aggregate_score": 0.47
+    },
+    "calibration_big_bench": {
+      "aggregate_score": 0.9037499999999999
+    },
+    "calibration_big_bench_i_know": {
+      "aggregate_score": 0.8543725760040035
+    },
+    "decoding_trust": {
+      "aggregate_score": 0.9299999999999999
+    },
+    "hellaswag": {
+      "aggregate_score": 0.8755228042222665
+    },
+    "human_eval": {
+      "aggregate_score": 0.475776397515528
+    },
+    "instruction_goal_hijacking": {
+      "aggregate_score": 0.375
+    },
+    "multiturn_goal_hijacking": {
+      "aggregate_score": 0.2561249137336094
+    },
+    "reddit_bias": {
+      "aggregate_score": 0.5429049297532
+    },
+    "truthful_qa_mc2": {
+      "aggregate_score": 0.6458557121081614
+    },
+    "mmlu": {
+      "aggregate_score": 0.7031761857285287
+    },
+    "ai2_reasoning": {
+      "aggregate_score": 0.7090443686006825
+    },
+    "human_deception": {
+      "aggregate_score": 0.8904109589041096
+    },
+    "memorization": {
+      "aggregate_score": 0.983
+    },
+    "privacy": {
+      "aggregate_score": 1.0
+    },
+    "fairllm": {
+      "aggregate_score": 0.05770831155558887
+    },
+    "mmlu_robustness": {
+      "aggregate_score": 0.6941818181818181
+    },
+    "training_data_suitability": {
+      "aggregate_score": null
+    },
+    "watermarking": {
+      "aggregate_score": null
+    }
+  }
+}
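
Each of the ADDED results files above follows the same schema: a "config" block identifying the model (name, revision, and a link to the full COMPL-AI report) and a "results" block mapping each benchmark key to a single "aggregate_score" in [0, 1], with null marking benchmarks that were not run (here training_data_suitability and watermarking). A minimal sketch of how such a file can be consumed, assuming it is read from the results/ directory as committed above; load_aggregate_scores is a hypothetical helper, not part of this repo:

import json

def load_aggregate_scores(path: str) -> dict:
    # Returns {benchmark: score}, dropping benchmarks whose score is null.
    with open(path) as fp:
        data = json.load(fp)
    return {
        name: entry["aggregate_score"]
        for name, entry in data["results"].items()
        if entry["aggregate_score"] is not None
    }

scores = load_aggregate_scores("results/mistralai/Mixtral-8x7B-Instruct-v0.1.json")
print(len(scores), "scored benchmarks; mean", round(sum(scores.values()) / len(scores), 3))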
src/display/about.py
CHANGED
@@ -15,7 +15,7 @@ class Tasks(Enum):
     task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
     task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
     task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
-    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks…
+    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
     task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
     task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
     task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
@@ -24,8 +24,8 @@ class Tasks(Enum):
     task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
     task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
     task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
-    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage…
-    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following…
+    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
+    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
     task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
     task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
     task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
@@ -35,8 +35,10 @@ class Tasks(Enum):
     task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
     task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
     task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
-    task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
-    …
+    # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
+    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
+    task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
+    task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")


@@ -44,9 +46,6 @@
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>"""

-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """<p style="font-size: 16px;">COMPL-AI is an open-source compliance-centered evaluation framework for Generative AI models. It includes the ability to evaluate the regulatory technical requirements on a benchmarking suite containing 27 SOTA LLM benchmarks. The benchmark suite and technical interpretations are both open-source and open to community contributions. For more information, please visit <a href="https://compl-ai.org" target="_blank">compl-ai.org</a>.</p>"""
-
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 """
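
The Tasks enum above is what ties the benchmark keys in the results JSONs to the leaderboard columns. A short sketch of the mechanism, assuming Task is the simple (benchmark, metric, col_name) record implied by the calls above; its actual definition lives elsewhere in the repo:

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str   # key in each results JSON, e.g. "watermarking"
    metric: str      # field to read from that entry, always "aggregate_score" here
    col_name: str    # human-readable column label

class Tasks(Enum):
    task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")

# Columns are derived from the enum members, which is why adding task25/task26
# above is enough to surface the new dataset-level checks in the table.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]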
src/display/css_html_js.py
CHANGED
@@ -1,4 +1,11 @@
 custom_css = """
+
+:root {
+  --block-radius: 0px !important;
+  --table-radius: 0px !important;
+  --input-radius: 0px !important;
+}
+
 /* Hides the final AutoEvalColumn */
 #llm-benchmark-tab-table table td:last-child,
 #llm-benchmark-tab-table table th:last-child {
@@ -21,6 +28,8 @@ table {
 /* Full width space */
 .gradio-container {
   max-width: 95%!important;
+  font-family: Open Sans,sans-serif;
+  line-height: 1.75em !important;
 }

 /* Text style and margins */
@@ -51,6 +60,14 @@ table {
 .tab-buttons button {
   font-size: 20px;
 }
+.tab-buttons {
+  padding-top: 40px;
+}
+
+/* Center Tabs */
+.tab-buttons > div > div:nth-child(2) {
+  justify-content: center;
+}

 /* Filters style */
 #filter_type{
@@ -86,6 +103,153 @@ table {
 border: 0
 }

+#intro {
+  padding: 40px 0;
+  border: 1px solid var(--border-color-primary);
+}
+
+#intro > div {
+  padding-left: 2em;
+  padding-right: 2em;
+  min-width: 0px !important;
+}
+
+.image_header {
+  display: flex;
+  gap: 15px;
+  align-items: center;
+}
+
+
+p {
+  font-weight: 400;
+  font-style: normal;
+  font-size: 14px;
+  line-height: 1.75em !important;
+}
+
+.button {
+  border: 1px solid #174DA3;
+  font-family: IBM Plex Mono,monospace;
+  background: none;
+  padding: 5px 15px;
+  color: #174DA3 !important;
+  position: relative;
+  font-size: 14px;
+  font-weight: 500;
+  transition: background-color .15s ease;
+  display: inline-flex;
+  align-items: center;
+  text-decoration: none !important;
+  line-height: 1.75em !important;
+}
+
+.button:hover {
+  cursor: pointer;
+  background: #EBEEF4;
+}
+
+#llm-benchmark-tab-table-button {
+  border-top-right-radius: unset !important;
+  border-top-left-radius: unset !important;
+  font-size: 18px !important;
+  font-weight: 500 !important;
+}
+
+label {
+  background: unset !important;
+  border-radius: 0 !important;
+  box-shadow: unset !important;
+}
+
+label > input {
+  border-radius: 0 !important;
+}
+
+form {
+  border-radius: 0 !important;
+}
+
+.principle_header {
+  padding: 10px 20px;
+  background-color: #EBEEF4;
+  border: 1px solid var(--border-color-primary);
+}
+
+.technical_requirements {
+  margin-top: -17px;
+  gap: 0px;
+  align-items: stretch;
+}
+
+.technical_requirements > div {
+  gap: 0px;
+
+}
+
+.technical_requirements > div > div.form {
+  border: unset !important;
+}
+
+.border_mid > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+}
+
+.border_bot > div {
+  border-left: 1px solid var(--border-color-primary);
+  border-right: 1px solid var(--border-color-primary);
+  border-bottom: 1px solid var(--border-color-primary);
+}
+
+@media only screen and (max-width: 1200px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }

+}
+
+@media only screen and (max-width: 800px) {
+  .empty {
+    visibility: hidden;
+    display: none;
+  }
+
+  #intro {
+    flex-direction: column;
+    gap: 48px;
+  }
+}
+
+.principle_icon {
+  max-height:24px;
+}
+
+.github_icon {
+  max-height:24px;
+  padding-right: 1em;
+}
+
+@media (prefers-color-scheme: dark) {
+  .principle_header {
+    background-color: var(--block-background-fill);
+  }
+
+  .button {
+    border: 1px solid var(--color-accent);
+    color: var(--color-accent) !important;
+  }
+
+  .principle_icon {
+    filter: brightness(2);
+  }
+
+  .github_icon {
+    filter: brightness(2);
+  }
+}
+
 """

 get_window_url_params = """
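
All of these rules ship as a single Python string, so nothing takes effect until the string is handed to Gradio. A minimal sketch of the wiring, following the usual pattern for custom-styled Spaces (the gr.HTML call and its markup are illustrative only): element ids such as #intro and classes such as .image_header are attached via elem_id / elem_classes on individual components, and the @media (prefers-color-scheme: dark) block above swaps the hard-coded light colors (#174DA3, #EBEEF4) for theme variables.

import gradio as gr
from src.display.css_html_js import custom_css

# The stylesheet is applied once, at Blocks construction time.
demo = gr.Blocks(css=custom_css)
with demo:
    # This component picks up the #intro border/padding rules defined above.
    gr.HTML("<div class='image_header'>...</div>", elem_id="intro")
demo.launch()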
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = [["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
 ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
-["model_report", ColumnContent, ColumnContent("…
+["model_report", ColumnContent, ColumnContent("Report", "markdown", True, never_hidden=True)]
 ]
 # Init
 # Scores
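
The list being edited here feeds the table schema: each entry pairs a field name with a ColumnContent descriptor. Judging from the positional and keyword arguments used above, ColumnContent is presumably a small dataclass along these lines; this is a sketch inferred from the call sites, not the actual definition in utils.py:

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # header text shown in the table, e.g. "Report"
    type: str                   # cell renderer: "str", "markdown", "number", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False  # keeps T / Model / Report pinned in the column picker

# Mirrors the new entry added in the diff above.
report_col = ColumnContent("Report", "markdown", True, never_hidden=True)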
src/envs.py
CHANGED
@@ -6,14 +6,12 @@ from huggingface_hub import HfApi
 TOKEN = os.environ.get("TOKEN", None)

 OWNER = "latticeflow"
-REPO_ID = f"{OWNER}/compl-ai-…
+# REPO_ID = f"{OWNER}/compl-ai-leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"

 CACHE_PATH = os.getenv("HF_HOME", ".")

 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "requests")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")

 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED
@@ -12,7 +12,7 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, Weigh
 from src.submission.check_validity import is_model_on_hub

 def report_hyperlink(link):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗…
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">🔗 Report</a>' if link else "N/A"

 @dataclass
 class EvalResult:
@@ -40,7 +40,7 @@ class EvalResult:
             data = json.load(fp)

         config = data.get("config")
-        print(json_filepath)
+        # print(json_filepath)
         # Precision
         # precision = Precision.from_str(config.get("model_dtype"))

@@ -76,12 +76,12 @@ class EvalResult:

             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
-                print('skip', full_model)
+                # print('skip', full_model)
                 results[task.benchmark] = None
                 continue

-            print(task)
-            print(accs)
+            # print(task)
+            # print(accs)
             mean_acc = np.mean(accs)  # * 100.0
             results[task.benchmark] = round(mean_acc, 2)

@@ -108,8 +108,8 @@ class EvalResult:
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
-            print(f"Read Request from {request_file}")
-            print(request)
+            # print(f"Read Request from {request_file}")
+            # print(request)
             # self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
             # self.model_type = ModelType.from_str("open" if self.still_on_hub else "closed")
             self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
@@ -119,7 +119,7 @@ class EvalResult:
             self.num_params = request.get("params", None)
             self.date = request.get("submitted_time", "")
         except Exception as e:
-            print(e)
+            # print(e)
             self.model_type = ModelType.from_str("open" if "/" in self.full_model and "openai" not in self.full_model else "closed")
             print(f"Could not find request file ({requests_path}) for {self.org}/{self.model}")

@@ -158,9 +158,9 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
         requests_path,
         f"**/request_{model_name}*_eval_request*.json"
     )
-    print(f"Looking up request file(s) with pattern {request_files}")
+    # print(f"Looking up request file(s) with pattern {request_files}")
     request_files = glob.glob(request_files, recursive=True)
-    print(f"Found request file(s) {request_files}")
+    # print(f"Found request file(s) {request_files}")

     # Select correct request file (precision)
     request_file = ""
@@ -174,7 +174,7 @@ def get_request_file_for_model(requests_path, model_name, revision=""):
                 # and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
-                print(f"Selected {request_file} for model metadata")
+                # print(f"Selected {request_file} for model metadata")
     return request_file


@@ -200,10 +200,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        print()
-        print('eval result')
-        print(eval_result)
-        print()
+        # print()
+        # print('eval result')
+        # print(eval_result)
+        # print()
         eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
@@ -217,9 +217,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu

     for v in eval_results.values():
         try:
-            print()
-            print(v)
-            print()
+            # print()
+            # print(v)
+            # print()
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
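
Beyond silencing the debug prints, the hunk at line 76 is the part worth reading: a benchmark column is filled only when every matching score is present, so the null aggregate scores in the results files above (training_data_suitability, watermarking) propagate as gaps rather than zeros. The same behavior restated as a standalone sketch with inline sample data:

import numpy as np

results = {"mmlu": {"aggregate_score": 0.70}, "watermarking": {"aggregate_score": None}}

def score_for(benchmark: str, metric: str = "aggregate_score"):
    accs = np.array([v.get(metric) for k, v in results.items() if k == benchmark])
    if accs.size == 0 or any(acc is None for acc in accs):
        return None  # missing or null score -> no value, not 0.0
    return round(float(np.mean(accs)), 2)

assert score_for("mmlu") == 0.70
assert score_for("watermarking") is None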
src/populate.py
CHANGED
@@ -11,7 +11,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-    print(all_data_json)
+    # print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -40,7 +40,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
     for sub_entry in sub_entries:
         file_path = os.path.join(save_path, entry, sub_entry)
-        print(file_path)
+        # print(file_path)
         with open(file_path) as fp:
             data = json.load(fp)
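
For context on what the now-silenced print was dumping: get_leaderboard_df flattens every EvalResult into a record, builds a DataFrame, and rounds scores to two decimals before display. A small illustration with made-up records (column name borrowed from the Tasks labels above):

import pandas as pd

records = [
    {"Model": "Mixtral-8x7B-Instruct-v0.1", "General Knowledge: MMLU": 0.7031761857285287},
    {"Model": "Mistral-7B-Instruct-v0.2", "General Knowledge: MMLU": 0.5910838911835921},
]
df = pd.DataFrame.from_records(records).round(decimals=2)
print(df)  # MMLU shows as 0.70 and 0.59 after rounding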