Skip to content

Commit

Permalink
Merge branch 'main' into Document-Custom-Model-Files
Browse files Browse the repository at this point in the history
  • Loading branch information
ParagEkbote authored Jan 30, 2025
2 parents 73af85b + 94fc5a2 commit ab38bd5
Show file tree
Hide file tree
Showing 18 changed files with 457 additions and 21 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ multilingual = [
"jieba", # for chinese tokenizer
"pyvi", # for vietnamese tokenizer
]
math = ["latex2sympy2_extended>=0.9.1"]
math = ["latex2sympy2_extended>=0.9.3"]

[project.urls]
Homepage = "https://github.com/huggingface/lighteval"
Expand Down
46 changes: 43 additions & 3 deletions src/lighteval/logging/evaluation_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import torch
from datasets import Dataset, load_dataset
from datasets.utils.metadata import MetadataConfigs
from fsspec import url_to_fs
from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url

from lighteval.logging.info_loggers import (
Expand All @@ -53,6 +52,11 @@
if is_nanotron_available():
from nanotron.config import GeneralArgs # type: ignore

try:
from fsspec import url_to_fs
except ImportError:
from fsspec.core import url_to_fs


class EnhancedJSONEncoder(json.JSONEncoder):
"""
Expand Down Expand Up @@ -231,9 +235,45 @@ def save_results(self, date_id: str, results_dict: dict):
with self.fs.open(output_results_file, "w") as f:
f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False))

def save_details(self, date_id: str, details_datasets: dict[str, Dataset]):
def _get_details_sub_folder(self, date_id: str):
    """Return the details sub-folder for ``date_id`` under this model's details directory.

    Args:
        date_id: Name of the sub-folder, or the special value ``"first"`` /
            ``"last"`` to select the smallest / largest existing sub-folder
            name (plain string comparison).

    Returns:
        ``Path(self.output_dir) / "details" / <model_name> / <folder name>``.

    Raises:
        FileNotFoundError: If ``"first"``/``"last"`` is requested and the
            details directory does not exist or contains no sub-directories.
    """
    output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name
    if date_id in ("first", "last"):
        if not self.fs.exists(output_dir_details):
            raise FileNotFoundError(f"Details directory {output_dir_details} does not exist")

        # Filesystem listings report each entry's full path in "name". Keep only
        # the final component so that joining it back onto output_dir_details
        # below cannot duplicate the directory prefix.
        folders = [
            Path(entry["name"]).name
            for entry in self.fs.listdir(output_dir_details)
            if entry["type"] == "directory"
        ]

        if not folders:
            raise FileNotFoundError(f"No timestamp folders found in {output_dir_details}")

        # NOTE(review): assumes folder names are timestamps that sort
        # chronologically as strings -- confirm against how date_id is generated.
        date_id = max(folders) if date_id == "last" else min(folders)
    return output_dir_details / date_id

def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str, Dataset]:
    """Load previously saved per-task details datasets from the details directory.

    Args:
        date_id: Name of the details sub-folder to read, or ``"first"`` /
            ``"last"`` (resolved to a concrete folder by
            ``_get_details_sub_folder``).
        task_names: Task names to load. A details file is kept when its task
            name, with the last ``|``-separated component removed, is in this
            list (presumably that component is the few-shot count -- confirm).

    Returns:
        Mapping from the full task name found in each file name to the dataset
        loaded from that parquet file.

    Raises:
        ValueError: If any entry of ``task_names`` has no matching details file.
    """
    output_dir_details_sub_folder = self._get_details_sub_folder(date_id)
    logger.info(f"Loading details from {output_dir_details_sub_folder}")
    # "first"/"last" have been resolved to a real folder name; use that from here on.
    date_id = output_dir_details_sub_folder.name
    details_datasets = {}
    for file in self.fs.glob(str(output_dir_details_sub_folder / f"details_*_{date_id}.parquet")):
        # Strip only the leading/trailing markers so that a task name which itself
        # contains "details_" or the date id is not mangled.
        task_name = Path(file).stem.removeprefix("details_").removesuffix(f"_{date_id}")
        if task_name.rpartition("|")[0] not in task_names:
            logger.info(f"Skipping {task_name} because it is not in the task_names list")
            continue
        dataset = load_dataset("parquet", data_files=file, split="train")
        details_datasets[task_name] = dataset

    # Compare each requested task against what was actually loaded, using the same
    # "drop the last |-component" rule as the filter above.
    loaded_base_names = {loaded_name.rpartition("|")[0] for loaded_name in details_datasets}
    for task_name in task_names:
        if task_name not in loaded_base_names:
            raise ValueError(
                f"Task {task_name} not found in details datasets. Check the tasks to be evaluated or the date_id used to load the details ({date_id})."
            )
    return details_datasets

def save_details(self, date_id: str, details_datasets: dict[str, Dataset]):
output_dir_details_sub_folder = self._get_details_sub_folder(date_id)
self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True)
logger.info(f"Saving details to {output_dir_details_sub_folder}")
for task_name, dataset in details_datasets.items():
Expand Down
4 changes: 4 additions & 0 deletions src/lighteval/main_accelerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def accelerate( # noqa C901
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
Expand Down Expand Up @@ -137,6 +140,7 @@ def accelerate( # noqa C901
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)

# TODO (nathan): better handling of model_args
Expand Down
12 changes: 12 additions & 0 deletions src/lighteval/main_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ def inference_endpoint(
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
Expand Down Expand Up @@ -247,6 +250,7 @@ def inference_endpoint(
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)
pipeline = Pipeline(
tasks=tasks,
Expand Down Expand Up @@ -292,6 +296,9 @@ def tgi(
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
Expand Down Expand Up @@ -355,6 +362,7 @@ def tgi(
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)
pipeline = Pipeline(
tasks=tasks,
Expand Down Expand Up @@ -400,6 +408,9 @@ def litellm(
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
Expand Down Expand Up @@ -464,6 +475,7 @@ def litellm(
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)
pipeline = Pipeline(
tasks=tasks,
Expand Down
4 changes: 4 additions & 0 deletions src/lighteval/main_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def vllm(
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
load_responses_from_details_date_id: Annotated[
Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
Expand Down Expand Up @@ -124,6 +127,7 @@ def vllm(
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
load_responses_from_details_date_id=load_responses_from_details_date_id,
)

if model_args.endswith(".yaml"):
Expand Down
12 changes: 10 additions & 2 deletions src/lighteval/metrics/dynamic_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def multilingual_extractive_match_metric(
pred_extraction_target: Sequence[ExtractionTarget] = (ExprExtractionConfig(),),
aggregation_function: Callable[[list[float]], float] = max,
fallback_mode: Literal["no_fallback", "first_match"] = "first_match",
extraction_mode: Literal["first_match", "any_match"] = "any_match",
precision: int = 6,
) -> SampleLevelMetric:
"""Creates a language-aware extractive match metric that extracts answers from the model's output.
Expand All @@ -215,6 +216,10 @@ def multilingual_extractive_match_metric(
How to perform extraction. Defaults to "first_match".
- "no_fallback": Only use first successfully parsed matches
- "first_match": Use the first successfully parsed match + the first match regardless of whether parsing succeeded
extraction_mode: Literal["first_match", "any_match"]
- "first_match": Only tries to extract the first regex match; if it fails, no other matches are tried
- "any_match": Tries to extract any regex match
precision: int
Number of decimal places to use when comparing numerical values. Defaults to 6.
Expand All @@ -240,9 +245,12 @@ def sample_level_fn(golds: list[str], predictions: list[str], formatted_doc: Doc
pred_extraction_regexes = get_extraction_regexes(formatted_doc, pred_extraction_target, language)

extracted_predictions = [
extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode) for pred in predictions
extract_target_from_pred(pred, pred_extraction_regexes, fallback_mode, extraction_mode)
for pred in predictions
]
extracted_golds = [
extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode, extraction_mode) for gold in golds
]
extracted_golds = [extract_target_from_pred(gold, gold_extraction_regexes, fallback_mode) for gold in golds]

# Assert on empty gold and warn on empty pred
if any(len(g) == 0 for g in extracted_golds):
Expand Down
21 changes: 21 additions & 0 deletions src/lighteval/metrics/imports/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
21 changes: 21 additions & 0 deletions src/lighteval/metrics/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
40 changes: 32 additions & 8 deletions src/lighteval/metrics/utils/extractive_match_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
# SOFTWARE.

import re
from dataclasses import dataclass
from dataclasses import dataclass, field
from functools import lru_cache
from itertools import groupby
from typing import Literal, Sequence
from typing import Any, Literal, Sequence

import sympy
from sympy import Basic, MatrixBase, Number
Expand All @@ -39,17 +39,33 @@
from lighteval.utils.timeout import timeout


@requires_latex2sympy2_extended
def latex_normalization_config_default_factory():
    """Build the default latex ``NormalizationConfig`` with every listed step switched on."""
    # Imported lazily: latex2sympy2_extended is an optional dependency.
    from latex2sympy2_extended.latex2sympy2 import NormalizationConfig

    enabled_steps = (
        "basic_latex",
        "units",
        "malformed_operators",
        "nits",
        "boxed",
        "equations",
    )
    return NormalizationConfig(**dict.fromkeys(enabled_steps, True))


@dataclass(frozen=True)
class LatexExtractionConfig:
    """Config for extracting latex from the prediction.

    Attributes:
        try_extract_without_anchor (bool): Whether to try extracting latex without requiring specific anchors like "answer:" or "final answer is"
        enforce_boxed_match (bool): Whether to also consider extracting from plain \\boxed{...} expressions
        boxed_match_priority (int): Priority of the boxed match regex: a negative value (e.g. -1) disables it, 0 tries it first,
            and the default 55 places it right after the "final answer" anchor
        normalization_config (latex2sympy2_extended.latex2sympy2.NormalizationConfig): Normalization config to use for latex extraction;
            the default is built by latex_normalization_config_default_factory
    """

    try_extract_without_anchor: bool = True
    enforce_boxed_match: bool = True
    boxed_match_priority: int = 55
    normalization_config: Any = field(default_factory=latex_normalization_config_default_factory)


@dataclass(frozen=True)
Expand Down Expand Up @@ -187,9 +203,8 @@ def lazy_latex_regex(latex_config: LatexExtractionConfig, language: Language) ->
if latex_config.try_extract_without_anchor:
regexes.append((latex_re, 300))

# This ensures that boxed is matched right after the final answer xxxx
if latex_config.enforce_boxed_match:
regexes.append((latex_boxed, 55))
if latex_config.boxed_match_priority >= 0:
regexes.append((latex_boxed, latex_config.boxed_match_priority))

return [(re.compile(pattern, re.DOTALL), priority) for pattern, priority in regexes]

Expand Down Expand Up @@ -387,6 +402,7 @@ def extract_target_from_pred(
pred: str,
target_res: list[tuple[list[tuple[re.Pattern[str], int]], ExtractionTarget]],
fallback_mode: Literal["no_fallback", "first_match"] = "no_fallback",
extraction_mode: Literal["first_match", "any_match"] = "any_match",
):
"""Extracts targets from a prediction string using regex patterns.
Returns the first successfully extracted match.
Expand All @@ -397,6 +413,9 @@ def extract_target_from_pred(
fallback_mode (Literal["no_fallback", "first_match"], optional): How to handle extraction failures. Defaults to "no_fallback".
- "no_fallback": Return only successfully parsed match
- "first_match": Additionally include the first string match, regardless of whether parsing succeeded
extraction_mode (Literal["first_match", "any_match"], optional): Which regex matches extraction is attempted on. Defaults to "any_match".
- "first_match": Only tries to extract the first match
- "any_match": Tries to extract any match
Returns:
list: List of extracted predictions, with the first fallback string appended if fallback_mode is "first_match"
Expand All @@ -410,6 +429,7 @@ def extract_target_from_pred(
for target_patterns, target_type in target_res
for pattern, priority in target_patterns
]
match_found = False

# Group patterns by priority using itertools.groupby
for _, patterns_group in groupby(sorted(all_patterns, key=lambda x: x[2]), key=lambda x: x[2]):
Expand All @@ -426,6 +446,7 @@ def extract_target_from_pred(
# Try to extract from each match, starting from rightmost
for match, _, _, target_type in matches_with_pos:
extracted_match, str_fallback = extract_match(match, target_type)
match_found = True

if str_fallback:
fallbacks.append(str_fallback)
Expand All @@ -434,8 +455,11 @@ def extract_target_from_pred(
extracted_predictions.append(extracted_match)
break

if extraction_mode == "first_match":
break

# If we found something and we're in first_match mode, stop processing other priorities
if extracted_predictions:
if extracted_predictions or (match_found and extraction_mode == "first_match"):
break

if fallback_mode == "first_match" and fallbacks:
Expand Down
Loading

0 comments on commit ab38bd5

Please sign in to comment.