diff --git a/src/helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py b/src/helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py
new file mode 100644
index 00000000000..4a0f849d334
--- /dev/null
+++ b/src/helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py
@@ -0,0 +1,51 @@
+"""Run spec functions for three clinical subjects of MMLU human-translated into 11 African languages.
+
+Available subjects: "clinical_knowledge", "college_medicine", "virology"
+Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts"
+(see lang_map below for the language code to language name mapping, or
+https://huggingface.co/languages for an ISO code reference)
+"""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("mmlu_clinical_afr")
+def get_mmlu_clinical_afr_spec(subject: str, lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.mmlu_clinical_afr_scenario.MMLU_Clinical_Afr_Scenario",
+        args={"subject": subject, "lang": lang},
+    )
+
+    lang_map = {
+        "af": "Afrikaans",
+        "zu": "Zulu",
+        "xh": "Xhosa",
+        "am": "Amharic",
+        "bm": "Bambara",
+        "ig": "Igbo",
+        "nso": "Sepedi",
+        "sn": "Shona",
+        "st": "Sesotho",
+        "tn": "Setswana",
+        "ts": "Tsonga",
+    }
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')} "
+        f"in {lang_map[lang]}.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"mmlu_clinical_afr:subject={subject},lang={lang},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["low_resource_languages"],
+    )
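For orientation, the run spec produced by the new `mmlu_clinical_afr` function can be built and inspected directly; this is a minimal sketch assuming an installed HELM checkout that includes the files in this diff (constructing the `RunSpec` queries no model and downloads no data):

```python
from helm.benchmark.run_specs.mmlu_clinical_afr_run_specs import get_mmlu_clinical_afr_spec

# Build the RunSpec for Amharic clinical-knowledge questions; this only assembles the spec objects.
run_spec = get_mmlu_clinical_afr_spec(subject="clinical_knowledge", lang="am")
print(run_spec.name)  # mmlu_clinical_afr:subject=clinical_knowledge,lang=am,method=multiple_choice_joint
print(run_spec.adapter_spec.instructions)  # "The following are multiple choice questions ... in Amharic."
```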
diff --git a/src/helm/benchmark/run_specs/winogrande_afr_run_specs.py b/src/helm/benchmark/run_specs/winogrande_afr_run_specs.py
new file mode 100644
index 00000000000..cccbb61a82d
--- /dev/null
+++ b/src/helm/benchmark/run_specs/winogrande_afr_run_specs.py
@@ -0,0 +1,50 @@
+"""Run spec functions for Winogrande human-translated into 11 African languages.
+
+Available langs: "af", "zu", "xh", "am", "bm", "ig", "nso", "sn", "st", "tn", "ts"
+(see lang_map below for the language code to language name mapping, or
+https://huggingface.co/languages for an ISO code reference)
+"""
+
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("winogrande_afr")
+def get_winogrande_afr_spec(lang: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.winogrande_afr_scenario.Winogrande_Afr_Scenario",
+        args={"lang": lang},
+    )
+
+    lang_map = {
+        "af": "Afrikaans",
+        "zu": "Zulu",
+        "xh": "Xhosa",
+        "am": "Amharic",
+        "bm": "Bambara",
+        "ig": "Igbo",
+        "nso": "Sepedi",
+        "sn": "Shona",
+        "st": "Sesotho",
+        "tn": "Setswana",
+        "ts": "Tsonga",
+    }
+
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="The following are binary choice fill-in-the-blank sentences (with answers), requiring "
+        f"common sense reasoning in {lang_map[lang]}.",
+        input_noun="Question",
+        output_noun="Answer",
+    )
+
+    return RunSpec(
+        name=f"winogrande_afr:lang={lang},method={method}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["low_resource_languages"],
+    )
diff --git a/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py b/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py
new file mode 100644
index 00000000000..a2601940570
--- /dev/null
+++ b/src/helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py
@@ -0,0 +1,79 @@
+import csv
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class MMLU_Clinical_Afr_Scenario(Scenario):
+    """Three clinical subjects of MMLU (clinical_knowledge, college_medicine, virology) human-translated
+    into 11 African low-resource languages.
+
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+
+    name = "mmlu_clinical_afr"
+    description = "Massive Multitask Language Understanding (MMLU) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+
+    def __init__(self, subject: str = "clinical_knowledge", lang: str = "af"):
+        super().__init__()
+        self.subject: str = subject
+        self.lang: str = lang
+
+    def download_mmlu_clinical_afr(self, path: str):
+        ensure_file_downloaded(
+            source_url=(
+                "https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/"
+                "raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip"
+            ),
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        with open(csv_path) as f:
+            reader = csv.reader(f, delimiter=",")
+            for row in reader:
+                # Each row is: question, choice A, choice B, choice C, choice D, correct choice letter
+                question, answers, correct_choice = row[0], row[1:-1], row[-1]
+                answers_dict = dict(zip(["A", "B", "C", "D"], answers))
+                correct_answer: str = answers_dict[correct_choice]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        desired_dir = "mmlu_cm_ck_vir"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_mmlu_clinical_afr(data_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            csv_path: str = os.path.join(data_path, desired_dir, f"{self.subject}_{split}_{self.lang}.csv")
+            if not os.path.exists(csv_path):
+                hlog(f"{csv_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_csv(csv_path, splits[split]))
+
+        return instances
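To make the CSV layout that `process_csv` relies on concrete, here is a hedged sketch against a synthetic row (the question and choices are invented for illustration and are not from the released data); each row is expected to hold the question, the four choices, and the correct choice letter:

```python
import csv
from io import StringIO

# Synthetic row in the expected layout: question, choice A, choice B, choice C, choice D, correct letter.
row = next(csv.reader(StringIO("Example question?,first,second,third,fourth,B\n")))
question, answers, correct_choice = row[0], row[1:-1], row[-1]
assert question == "Example question?"
assert dict(zip(["A", "B", "C", "D"], answers))[correct_choice] == "second"
```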
diff --git a/src/helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py b/src/helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py
new file mode 100644
index 00000000000..9de8d9804c9
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py
@@ -0,0 +1,21 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.mmlu_clinical_afr_scenario import MMLU_Clinical_Afr_Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_mmlu_clinical_afr_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = MMLU_Clinical_Afr_Scenario(subject="clinical_knowledge", lang="am")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 299
+        assert instances[0].input == Input(text="ለሁሉም የጡንቻ መኮማተር ዓይነቶች የሚያስፈልገው ኢኔርጅ የሚቀርበው ከሚከተሉት ነው፦")
+        assert instances[0].references == [
+            Reference(output=Output(text="ATP።"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="ADP።"), tags=[]),
+            Reference(output=Output(text="ፎስፎክሬቲን።"), tags=[]),
+            Reference(output=Output(text="ኦክስዳቲቪ ፎስፎሪሌሽን።"), tags=[]),
+        ]
+        assert instances[0].split == "train"
diff --git a/src/helm/benchmark/scenarios/test_winogrande_afr_scenario.py b/src/helm/benchmark/scenarios/test_winogrande_afr_scenario.py
new file mode 100644
index 00000000000..bbdf997013b
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_winogrande_afr_scenario.py
@@ -0,0 +1,19 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.winogrande_afr_scenario import Winogrande_Afr_Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_winogrande_afr_scenario():
+    with TemporaryDirectory() as tmpdir:
+        scenario = Winogrande_Afr_Scenario(lang="am")
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 3674
+        assert instances[0].input == Input(text="ሳራ ከማሪያ በጣም የተሻለች የቀዶ ጥገና ሐኪም ስለነበረች ሁልጊዜ _ ቀላል ህመሞችን ታክማለች.")
+        assert instances[0].references == [
+            Reference(output=Output(text="ሳራ"), tags=[]),
+            Reference(output=Output(text="ማሪያ"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
diff --git a/src/helm/benchmark/scenarios/winogrande_afr_scenario.py b/src/helm/benchmark/scenarios/winogrande_afr_scenario.py
new file mode 100644
index 00000000000..b5dbdf571b1
--- /dev/null
+++ b/src/helm/benchmark/scenarios/winogrande_afr_scenario.py
@@ -0,0 +1,84 @@
+import csv
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class Winogrande_Afr_Scenario(Scenario):
+    """Winogrande (S) human-translated into 11 African low-resource languages.
+
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+
+    name = "winogrande_afr"
+    description = "Winogrande (S) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+
+    def __init__(self, lang: str = "af"):
+        super().__init__()
+        self.lang: str = lang
+
+    def download_winogrande_afr(self, path: str):
+        ensure_file_downloaded(
+            source_url=(
+                "https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/"
+                "raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip"
+            ),
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    def process_csv(self, csv_path: str, split: str, pseudo_split: str) -> List[Instance]:
+        # Match naming in Winogrande ("val" rows are stored under the "train_s" label)
+        if pseudo_split == "val":
+            pseudo_split = "train_s"
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        with open(csv_path) as f:
+            reader = csv.reader(f, delimiter=",")
+            next(reader, None)  # skip the header
+            for row in reader:
+                if row[-1] != pseudo_split:  # only keep rows belonging to the requested split
+                    continue
+                # The last five columns are: sentence, option 1, option 2, correct option ("1" or "2"), split label
+                question, answers, correct_choice = row[-5], row[-4:-2], row[-2]
+                answers_dict = dict(zip(["1", "2"], answers))
+                correct_answer: str = answers_dict[correct_choice]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        desired_dir = "winogrande_s"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_winogrande_afr(data_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        # All splits live in a single CSV per language; process_csv() filters rows by split label
+        for split in splits:
+            csv_path: str = os.path.join(data_path, desired_dir, f"winogrande_{self.lang}.csv")
+            if not os.path.exists(csv_path):
+                hlog(f"{csv_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_csv(csv_path, splits[split], split))
+
+        return instances
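Similarly, the row handling in the Winogrande scenario's `process_csv` can be checked against synthetic data. In the sketch below, the header and the leading `qID` column are assumptions made for illustration (the parser only uses the last five columns), and the sentences are the classic English Winograd schema pair rather than the translated data:

```python
import csv
from io import StringIO

# Synthetic CSV; only the last five columns matter: sentence, option 1, option 2, answer ("1"/"2"), split label.
data = (
    "qID,sentence,option1,option2,answer,split\n"
    "ex1,The trophy would not fit in the suitcase because the _ was too big.,trophy,suitcase,1,train_s\n"
    "ex2,The trophy would not fit in the suitcase because the _ was too small.,trophy,suitcase,2,test\n"
)
reader = csv.reader(StringIO(data))
next(reader)  # skip the header, as process_csv does
kept = [row for row in reader if row[-1] == "train_s"]  # the "val" split is read from rows labeled "train_s"
assert len(kept) == 1
assert kept[0][-5].startswith("The trophy")
assert kept[0][-4:-2] == ["trophy", "suitcase"] and kept[0][-2] == "1"
```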