diff --git a/src/helm/benchmark/metrics/evaluate_reference_metrics.py b/src/helm/benchmark/metrics/evaluate_reference_metrics.py
index 45cc447835c..42ac58327ef 100644
--- a/src/helm/benchmark/metrics/evaluate_reference_metrics.py
+++ b/src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -39,7 +39,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
     return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 
 
-def normalize_text(text: str) -> str:
+def normalize_text(text: str, should_remove_articles: bool = True) -> str:
     """Lower text and remove punctuation, articles and extra whitespace.
     Copied from the [QuAC](http://quac.ai/) evaluation script found at
     https://s3.amazonaws.com/my89public/quac/scorer.py"""
@@ -57,7 +57,10 @@ def remove_punc(text: str) -> str:
     def lower(text: str) -> str:
         return text.lower()
 
-    return white_space_fix(remove_articles(remove_punc(lower(text))))
+    normalized_text = remove_punc(lower(text))
+    if should_remove_articles:
+        normalized_text = remove_articles(normalized_text)
+    return white_space_fix(normalized_text)
 
 
 def exact_match(gold: str, pred: str) -> float:
@@ -74,6 +77,17 @@ def quasi_exact_match(gold: str, pred: str) -> float:
     return 1 if normalize_text(gold) == normalize_text(pred) else 0
 
 
+def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return (
+        1
+        if normalize_text(gold, should_remove_articles=False) == normalize_text(pred, should_remove_articles=False)
+        else 0
+    )
+
+
 def prefix_exact_match(gold: str, pred: str) -> float:
     """
     The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
@@ -423,6 +437,7 @@ def compute_metrics_helper(
     metric_fn_mapping: Dict[str, Callable] = {
         "exact_match": exact_match,
         "quasi_exact_match": quasi_exact_match,
+        "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
         "prefix_exact_match": prefix_exact_match,
         "quasi_prefix_exact_match": quasi_prefix_exact_match,
         "exact_match_indicator": exact_match_indicator,
diff --git a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
index a812bc06c76..db65d93c190 100644
--- a/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
+++ b/src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
@@ -1,26 +1,3 @@
 entries: [
-
-  {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
-  {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
-  {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=science_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}
-
-  {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
"blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} - {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]} - {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]} - {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]} - - {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]} - {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]} - - {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]} - {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]} - {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]} + {description: "real_world_qa:model=vlm", priority: 1} ] \ No newline at end of file diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 5919166e0c9..3c92024e98e 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -107,7 +107,16 @@ def _get_multiple_choice_joint_adapter_spec( def _get_open_ended_generation_metric_specs() -> List[MetricSpec]: return get_basic_metric_specs( - ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"] + [ + "exact_match", + "quasi_exact_match", + "quasi_leave_articles_exact_match", + "f1_score", + "rouge_l", + "bleu_1", + "bleu_4", + "cider", + ] ) diff --git a/src/helm/benchmark/static/schema_vhelm.yaml b/src/helm/benchmark/static/schema_vhelm.yaml index bd005ed2ccc..7e256ed4fb4 100644 --- a/src/helm/benchmark/static/schema_vhelm.yaml +++ b/src/helm/benchmark/static/schema_vhelm.yaml @@ -122,6 +122,11 @@ metrics: short_display_name: EM description: Fraction of instances that the predicted output matches a correct reference up to light processing. lower_is_better: false + - name: quasi_leave_articles_exact_match + display_name: Quasi-exact match + short_display_name: EM + description: Fraction of instances that the predicted output matches a correct reference up to light processing. + lower_is_better: false - name: prefix_exact_match display_name: Prefix exact match short_display_name: PEM @@ -902,7 +907,7 @@ run_groups: - accuracy - general_information environment: - main_name: quasi_prefix_exact_match + main_name: quasi_leave_articles_exact_match main_split: test taxonomy: task: short-answer question answering