Commit

Merge branch 'main' into Document-Custom-Model-Files
ParagEkbote authored Jan 23, 2025
2 parents 5815119 + 0ab63d0 commit 73af85b
Showing 4 changed files with 196 additions and 1 deletion.
121 changes: 121 additions & 0 deletions community_tasks/french_evals.py
@@ -0,0 +1,121 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ruff: noqa: F405, F403, F401
"""
Custom evaluation tasks for lighteval.
This file generally creates just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
This module implements tasks for the french specific datasets
See : https://huggingface.co/fr-gouv-coordination-ia
"""

import random

import numpy as np
from aenum import extend_enum

import lighteval.tasks.extended.ifeval.instructions_registry as instructions_registry
from lighteval.metrics.metrics import Metrics, SampleLevelMetric
from lighteval.metrics.utils.metric_utils import (
    MetricCategory,
    MetricUseCase,
    SampleLevelMetricGrouping,
)
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.extended.ifeval.main import ifeval_metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


# IFEval-fr prompt function
def prompt_ifeval_fr(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["prompt"],
        choices=[""],
        gold_index=0,
        instruction="",
        specific={"instructions_id_list": line["instruction_id_list"], "kwargs": line["kwargs"]},
    )
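
# Note: IFEval-fr is generative, so choices=[""] and gold_index=0 above are
# placeholders; scoring is handled by the ifeval metrics, which read the
# instruction ids and kwargs stored in the Doc's `specific` dict rather than
# comparing against the choices.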


# GPQA-fr prompt function
def prompt_gpqa_fr(line, task_name: str = None):
    gold_index = random.randint(0, 3)
    choices = [line["Réponse incorrecte 1"], line["Réponse incorrecte 2"], line["Réponse incorrecte 3"]]
    choices.insert(gold_index, line["Réponse correcte"])

    instruction = "Choisissez la réponse correcte aux questions suivantes.\n\n"

    query = f"Question: {line['Question']}\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
    query += "Answer: "
    return Doc(
        task_name=task_name,
        query=f"{instruction}{query}",
        choices=LETTER_INDICES[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
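
# A worked example of the rendered prompt (illustrative values, not a real
# dataset row): if the randomly drawn gold_index is 1, the query becomes
#
#   Choisissez la réponse correcte aux questions suivantes.
#
#   Question: <question text>
#   A. <Réponse incorrecte 1>
#   B. <Réponse correcte>
#   C. <Réponse incorrecte 2>
#   D. <Réponse incorrecte 3>
#   Answer:
#
# and the Doc's choices are ["A", "B", "C", "D"], with gold_index selecting "B".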


# IFEval-fr task


ifeval_fr_task = LightevalTaskConfig(
    name="ifeval-fr",
    prompt_function=prompt_ifeval_fr,  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
    suite=["community"],
    hf_repo="fr-gouv-coordination-ia/IFEval-fr",
    hf_subset="default",
    metric=[ifeval_metrics],
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="random_sampling",
    generation_size=1280,
    stop_sequence=[],  # no stop sequence, will use EOT token
    version="0.1",
)

# GPQA-fr task
gpqa_fr_task = LightevalTaskConfig(
    name="gpqa-fr",
    suite=["community"],
    prompt_function=prompt_gpqa_fr,
    hf_repo="fr-gouv-coordination-ia/gpqa-fr",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=1,
    metric=[Metrics.loglikelihood_acc],
    stop_sequence=["\n"],
    trust_dataset=True,
    version=0,
)
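
# Note: generation_size=1 is effectively unused here; loglikelihood_acc scores
# the task by comparing the loglikelihoods of the letter choices rather than by
# generating text, whereas ifeval-fr above generates up to 1280 tokens.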

# STORE YOUR EVALS
TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task]
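
A quick way to sanity-check the GPQA-fr prompt function above without a full evaluation run; the sample row below is illustrative, not taken from the fr-gouv-coordination-ia datasets:

sample_row = {
    "Question": "Quelle est la capitale de la France ?",
    "Réponse correcte": "Paris",
    "Réponse incorrecte 1": "Lyon",
    "Réponse incorrecte 2": "Marseille",
    "Réponse incorrecte 3": "Lille",
}
doc = prompt_gpqa_fr(sample_row, task_name="community|gpqa-fr")
assert doc.choices == ["A", "B", "C", "D"]
assert sample_row["Réponse correcte"] in doc.query
print(doc.query)  # full formatted prompt; the gold letter is doc.choices[doc.gold_index]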
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -109,7 +109,7 @@ multilingual = [
"jieba", # for chinese tokenizer
"pyvi", # for vietnamese tokenizer
]
math = ["latex2sympy2_extended>=0.9.0"]
math = ["latex2sympy2_extended>=0.9.1"]

[project.urls]
Homepage = "https://github.com/huggingface/lighteval"
47 changes: 47 additions & 0 deletions src/lighteval/logging/evaluation_tracker.py
@@ -149,6 +149,28 @@ def __init__(

        self.public = public

    @property
    def results(self):
        config_general = asdict(self.general_config_logger)
        # We remove the config from logging, which contains context/accelerator objects
        config_general.pop("config")
        results = {
            "config_general": config_general,
            "results": self.metrics_logger.metric_aggregated,
            "versions": self.versions_logger.versions,
            "config_tasks": self.task_config_logger.tasks_configs,
            "summary_tasks": self.details_logger.compiled_details,
            "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
        }
        return results

    @property
    def details(self):
        return {
            task_name: [asdict(detail) for detail in task_details]
            for task_name, task_details in self.details_logger.details.items()
        }

    def save(self) -> None:
        """Saves the experiment information and results to files, and to the hub if requested."""
        logger.info("Saving experiment tracker")
@@ -281,6 +303,31 @@ def push_to_hub(

        self.recreate_metadata_card(repo_id)

    def push_results_to_hub(self, repo_id: str, path_in_repo: str, private: bool | None = None):
        repo_id = repo_id if "/" in repo_id else f"{self.hub_results_org}/{repo_id}"
        private = private if private is not None else not self.public
        self.api.create_repo(repo_id, private=private, repo_type="dataset", exist_ok=True)
        results_json = json.dumps(self.results, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)
        self.api.upload_file(
            repo_id=repo_id,
            path_or_fileobj=results_json.encode(),
            path_in_repo=path_in_repo,
            repo_type="dataset",
        )

    def push_details_to_hub(self, repo_id: str, path_in_repo: str, private: bool | None = None):
        repo_id = repo_id if "/" in repo_id else f"{self.hub_results_org}/{repo_id}"
        private = private if private is not None else not self.public
        self.api.create_repo(repo_id, private=private, repo_type="dataset", exist_ok=True)
        # Iterate over (task_name, details) pairs; bare iteration over the dict
        # would yield only the keys.
        for task_name, details in self.details.items():
            details_json = "\n".join([json.dumps(detail) for detail in details])
            self.api.upload_file(
                repo_id=repo_id,
                path_or_fileobj=details_json.encode(),
                path_in_repo=path_in_repo.format(task_name=task_name),
                repo_type="dataset",
            )

    def recreate_metadata_card(self, repo_id: str) -> None:  # noqa: C901
        """Fully updates the details repository metadata card for the currently evaluated model
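
The two new helpers push aggregated results (a single JSON file) and per-task details (one JSONL file per task, via the {task_name} placeholder in path_in_repo) to a Hub dataset repo. A minimal usage sketch, with constructor arguments that are illustrative assumptions rather than part of this diff:

from lighteval.logging.evaluation_tracker import EvaluationTracker

tracker = EvaluationTracker(output_dir="./results")  # hypothetical minimal init
# ... run an evaluation so the metric and details loggers are populated ...
tracker.push_results_to_hub("my-org/lighteval-results", "results/latest.json")
tracker.push_details_to_hub("my-org/lighteval-details", "details/{task_name}.jsonl")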
27 changes: 27 additions & 0 deletions tests/metrics/test_extractive_match.py
@@ -949,7 +949,34 @@ def test_math_extraction_edge_cases(gold, pred, expected):
r"To find the product \( ab \) where \( a = 2012_3 \) and \( b = 201_3 \), we first convert these base-three numbers to base ten. For \( a = 2012_3 \): \[ a = 2 \cdot 3^3 + 0 \cdot 3^2 + 1 \cdot 3^1 + 2 \cdot 3^0 = 2 \cdot 27 + 0 \cdot 9 + 1 \cdot 3 + 2 \cdot 1 = 54 + 0 + 3 + 2 = 59_{10} \] For \( b = 201_3 \): \[ b = 2 \cdot 3^2 + 0 \cdot 3^1 + 1 \cdot 3^0 = 2 \cdot 9 + 0 \cdot 3 + 1 \cdot 1 = 18 + 0 + 1 = 19_{10} \] Now, calculate the product in base ten: \[ ab = 59 \times 19 \] Perform the multiplication: \[ 59 \times 19 = 59 \times (20 - 1) = 59 \times 20 - 59 \times 1 = 1180 - 59 = 1121 \] Next, convert \( 1121_{10} \) to base three. We do this by dividing by 3 and recording the remainders: \[ 1121 \div 3 = 373 \quad \text{remainder } 2 \] \[ 373 \div 3 = 124 \quad \text{remainder } 1 \] \[ 124 \div 3 = 41 \quad \text{remainder } 1 \] \[ 41 \div 3 = 13 \quad \text{remainder } 2 \] \[ 13 \div 3 = 4 \quad \text{remainder } 1 \] \[ 4 \div 3 = 1 \quad \text{remainder } 1 \] \[ 1 \div 3 = 0 \quad \text{remainder } 1 \] Reading the remainders from last to first, we find: \[ 1121_{10} = 1112122_3 \] Thus, the product \( ab \) expressed in the base-three number system is \(\boxed{1112122_3}\).",
0,
),
(
r"\(\boxed{\text{C}}\).",
r"$\boxed{\text{(C)}}.$",
1,
),
(
r" So the answer is: \[ \boxed{11111111100} \]",
r"is $\boxed{11,\! 111,\! 111,\! 100}$",
1,
),
(
r" So the answer is: \[ \boxed{32349} \]",
r"is $\boxed{32,\! 349}$",
1,
),
(
r"Thus, the domain of the function \( f(x) \) is: \[ \boxed{(2, 12) \cup (12, 102)} \]",
r"Thus, the answer is $x \in \boxed{(2,12) \cup (12,102)}$",
1,
),
],
)
def test_math_extraction_additional_cases(gold, pred, expected):
assert compare_strings(gold, pred, match_types=["latex", "expr"]) == expected


# text{C} Qwen correct
# 11111111100 Qwen correct
# Interval(2, oo) Qwen incorrect
# text{west} Qwen incorrect
# 32349, 32,\!348 Qwen incorrect
