Commit

Merge branch 'main' into Document-Custom-Model-Files
ParagEkbote authored Jan 23, 2025
2 parents 5815119 + 0ab63d0 commit 73af85b
Showing 4 changed files with 196 additions and 1 deletion.
121 changes: 121 additions & 0 deletions community_tasks/french_evals.py
@@ -0,0 +1,121 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval.
This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
This module implements tasks for the french specific datasets
See : https://huggingface.co/fr-gouv-coordination-ia
"""

import random

import numpy as np
from aenum import extend_enum

import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
from lighteval.metrics.metrics import Metrics, SampleLevelMetric
from lighteval.metrics.utils.metric_utils import (
    MetricCategory,
    MetricUseCase,
    SampleLevelMetricGrouping,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


# IFEval-fr prompt function
def prompt_ifeval_fr(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["prompt"],
        choices=[""],
        gold_index=0,
        instruction="",
        specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
    )
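
# Note: IFEval-fr is generative, so choices=[""] and gold_index=0 above are
# placeholders; scoring is handled by the ifeval metrics, which read the
# instruction ids and kwargs stored in the Doc's `specific` dict rather than
# comparing against the choices.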


# GPQA-fr prompt function
def prompt_gpqa_fr(line, task_name: str = None):
    gold_index = random.randint(0, 3)
    choices = [line["Réponse incorrecte 1"], line["Réponse incorrecte 2"], line["Réponse incorrecte 3"]]
    choices.insert(gold_index, line["Réponse correcte"])

    instruction = "Choisissez la réponse correcte aux questions suivantes.\n\n"

    query = f"Question: {line['Question']}\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
    query += "Answer: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        choices=LETTER_INDICES[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
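
# A worked example of the rendered prompt (illustrative values, not a real
# dataset row): if the randomly drawn gold_index is 1, the query becomes
#
#   Choisissez la réponse correcte aux questions suivantes.
#
#   Question: <question text>
#   A. <Réponse incorrecte 1>
#   B. <Réponse correcte>
#   C. <Réponse incorrecte 2>
#   D. <Réponse incorrecte 3>
#   Answer:
#
# and the Doc's choices are ["A", "B", "C", "D"], with gold_index selecting "B".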


# IFEval-fr task


ifeval_fr_task = LightevalTaskConfig(
    name="ifeval-fr",
    prompt_function=prompt_ifeval_fr,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
    suite=["community"],
    hf_repo="fr-gouv-coordination-ia/IFEval-fr",
    hf_subset="default",
    metric=[ifeval_metrics],
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="random_sampling",
    generation_size=1280,
    stop_sequence=[],  # no stop sequence, will use EOT token
    version="0.1",
)

# GPQA-fr task
gpqa_fr_task = LightevalTaskConfig(
    name="gpqa-fr",
    suite=["community"],
    prompt_function=prompt_gpqa_fr,
    hf_repo="fr-gouv-coordination-ia/gpqa-fr",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=1,
    metric=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    trust_dataset=True,
    version=0,
)
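
# Note: generation_size=1 is effectively unused here; loglikelihood_acc scores
# the task by comparing the loglikelihoods of the letter choices rather than by
# generating text, whereas ifeval-fr above generates up to 1280 tokens.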

# STORE YOUR EVALS
TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task]
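
A quick way to sanity-check the GPQA-fr prompt function above without a full evaluation run; the sample row below is illustrative, not taken from the fr-gouv-coordination-ia datasets:

sample_row = {
    "Question": "Quelle est la capitale de la France ?",
    "Réponse correcte": "Paris",
    "Réponse incorrecte 1": "Lyon",
    "Réponse incorrecte 2": "Marseille",
    "Réponse incorrecte 3": "Lille",
}
doc = prompt_gpqa_fr(sample_row, task_name="community|gpqa-fr")
assert doc.choices == ["A", "B", "C", "D"]
assert sample_row["Réponse correcte"] in doc.query
print(doc.query)  # full formatted prompt; the gold letter is doc.choices[doc.gold_index]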
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -109,7 +109,7 @@ multilingual = [
"jieba", # for chinese tokenizer
"pyvi", # for vietnamese tokenizer
]
math = ["latex2sympy2_extended>=0.9.0"]
math = ["latex2sympy2_extended>=0.9.1"]

[project.urls]
Homepage = "https://github.com/huggingface/lighteval"
47 changes: 47 additions & 0 deletions src/lighteval/logging/evaluation_tracker.py
@@ -149,6 +149,28 @@ def __init__(

        self.public = public

    @property
    def results(self):
        config_general = asdict(self.general_config_logger)
        # We remove the config from logging, which contains context/accelerator objects
        config_general.pop("config")
        results = {
            "config_general": config_general,
            "results": self.metrics_logger.metric_aggregated,
            "versions": self.versions_logger.versions,
            "config_tasks": self.task_config_logger.tasks_configs,
            "summary_tasks": self.details_logger.compiled_details,
            "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
        }
        return results

    @property
    def details(self):
        return {
            task_name: [asdict(detail) for detail in task_details]
            for task_name, task_details in self.details_logger.details.items()
        }

    def save(self) -> None:
        """Saves the experiment information and results to files, and to the hub if requested."""
        logger.info("Saving experiment tracker")
@@ -281,6 +303,31 @@ def push_to_hub(

        self.recreate_metadata_card(repo_id)

    def push_results_to_hub(self, repo_id: str, path_in_repo: str, private: bool | None = None):
        repo_id = repo_id if "/" in repo_id else f"{self.hub_results_org}/{repo_id}"
        private = private if private is not None else not self.public
        self.api.create_repo(repo_id, private=private, repo_type="dataset", exist_ok=True)
        results_json = json.dumps(self.results, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)
        self.api.upload_file(
            repo_id=repo_id,
            path_or_fileobj=results_json.encode(),
            path_in_repo=path_in_repo,
            repo_type="dataset",
        )

    def push_details_to_hub(self, repo_id: str, path_in_repo: str, private: bool | None = None):
        repo_id = repo_id if "/" in repo_id else f"{self.hub_results_org}/{repo_id}"
        private = private if private is not None else not self.public
        self.api.create_repo(repo_id, private=private, repo_type="dataset", exist_ok=True)
        # Iterate over (task_name, details) pairs; bare iteration over the dict
        # would yield only the keys.
        for task_name, details in self.details.items():
            details_json = "\n".join([json.dumps(detail) for detail in details])
            self.api.upload_file(
                repo_id=repo_id,
                path_or_fileobj=details_json.encode(),
                path_in_repo=path_in_repo.format(task_name=task_name),
                repo_type="dataset",
            )

    def recreate_metadata_card(self, repo_id: str) -> None:  # noqa: C901
        """Fully updates the details repository metadata card for the currently evaluated model
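
The two new helpers push aggregated results (a single JSON file) and per-task details (one JSONL file per task, via the {task_name} placeholder in path_in_repo) to a Hub dataset repo. A minimal usage sketch, with constructor arguments that are illustrative assumptions rather than part of this diff:

from lighteval.logging.evaluation_tracker import EvaluationTracker

tracker = EvaluationTracker(output_dir="./results")  # hypothetical minimal init
# ... run an evaluation so the metric and details loggers are populated ...
tracker.push_results_to_hub("my-org/lighteval-results", "results/latest.json")
tracker.push_details_to_hub("my-org/lighteval-details", "details/{task_name}.jsonl")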
27 changes: 27 additions & 0 deletions tests/metrics/test_extractive_match.py
@@ -949,7 +949,34 @@ def test_math_extraction_edge_cases(gold, pred, expected):
r"To find the product \( ab \) where \( a = 2012_3 \) and \( b = 201_3 \), we first convert these base-three numbers to base ten. For \( a = 2012_3 \): \[ a = 2 \cdot 3^3 + 0 \cdot 3^2 + 1 \cdot 3^1 + 2 \cdot 3^0 = 2 \cdot 27 + 0 \cdot 9 + 1 \cdot 3 + 2 \cdot 1 = 54 + 0 + 3 + 2 = 59_{10} \] For \( b = 201_3 \): \[ b = 2 \cdot 3^2 + 0 \cdot 3^1 + 1 \cdot 3^0 = 2 \cdot 9 + 0 \cdot 3 + 1 \cdot 1 = 18 + 0 + 1 = 19_{10} \] Now, calculate the product in base ten: \[ ab = 59 \times 19 \] Perform the multiplication: \[ 59 \times 19 = 59 \times (20 - 1) = 59 \times 20 - 59 \times 1 = 1180 - 59 = 1121 \] Next, convert \( 1121_{10} \) to base three. We do this by dividing by 3 and recording the remainders: \[ 1121 \div 3 = 373 \quad \text{remainder } 2 \] \[ 373 \div 3 = 124 \quad \text{remainder } 1 \] \[ 124 \div 3 = 41 \quad \text{remainder } 1 \] \[ 41 \div 3 = 13 \quad \text{remainder } 2 \] \[ 13 \div 3 = 4 \quad \text{remainder } 1 \] \[ 4 \div 3 = 1 \quad \text{remainder } 1 \] \[ 1 \div 3 = 0 \quad \text{remainder } 1 \] Reading the remainders from last to first, we find: \[ 1121_{10} = 1112122_3 \] Thus, the product \( ab \) expressed in the base-three number system is \(\boxed{1112122_3}\).",
0,
),
(
r"\(\boxed{\text{C}}\).",
r"$\boxed{\text{(C)}}.$",
1,
),
(
r" So the answer is: \[ \boxed{11111111100} \]",
r"is $\boxed{11,\! 111,\! 111,\! 100}$",
1,
),
(
r" So the answer is: \[ \boxed{32349} \]",
r"is $\boxed{32,\! 349}$",
1,
),
(
r"Thus, the domain of the function \( f(x) \) is: \[ \boxed{(2, 12) \cup (12, 102)} \]",
r"Thus, the answer is $x \in \boxed{(2,12) \cup (12,102)}$",
1,
),
],
)
def test_math_extraction_additional_cases(gold, pred, expected):
assert compare_strings(gold, pred, match_types=["latex", "expr"]) == expected


# text{C} Qwen correct
# 11111111100 Qwen correct
# Interval(2, oo) Qwen incorrect
# text{west} Qwen incorrect
# 32349, 32,\!348 Qwen incorrect
