diff --git a/src/helm/benchmark/run_specs/capabilities_run_specs.py b/src/helm/benchmark/run_specs/capabilities_run_specs.py
new file mode 100644
index 0000000000..44074d8ebc
--- /dev/null
+++ b/src/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -0,0 +1,230 @@
+"""Run spec functions for the HELM Capabilities leaderboard.
+
+Website: https://crfm.stanford.edu/helm/capabilities/"""
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_CHAT,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+    AdapterSpec,
+)
+from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
+    get_exact_match_metric_specs,
+)
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("gpqa")
+def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
+    # Convert to bools and remove the str versions
+    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
+    use_few_shot_bool: bool = use_few_shot.lower() == "true"
+    del use_chain_of_thought
+    del use_few_shot
+
+    if not subset.startswith("gpqa_"):
+        subset = "gpqa_" + subset
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
+    )
+    max_train_instance_num = 5 if use_few_shot_bool else 0
+
+    if use_few_shot_bool:
+        if use_chain_of_thought_bool:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_tokens=2000,  # original: 1000
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                chain_of_thought_prefix="Let's think step by step: ",
+                chain_of_thought_suffix="The correct answer is ",
+                output_noun="",  # will be overwritten with output_prefix
+                output_prefix="",
+                global_suffix=(
+                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
+                    'please use the format "The correct answer is (insert answer here)":'
+                ),
+            )
+        else:
+            adapter_spec = get_multiple_choice_adapter_spec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                instructions=(
+                    "Here are some example questions from experts. "
+                    "An explanation is given before the final answer. "
+                    "Answer the final question yourself, giving your reasoning beforehand."
+                ),
+                input_noun="Question",
+                input_suffix="\nChoices: \n",
+                reference_prefix="(A) ",
+                output_noun="",  # will be overwritten with output_prefix
+                output_prefix="The correct answer is ",
+            )
+    else:
+        if use_chain_of_thought_bool:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=2000,  # original: 1000
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=(
+                    "Let’s think step by step. Based on your reasoning, what is the single, "
+                    "most likely answer choice? Format your response as follows: "
+                    '"The correct answer is (insert answer here)".'
+                ),
+            )
+        else:
+            adapter_spec = AdapterSpec(
+                method=ADAPT_MULTIPLE_CHOICE_JOINT,
+                max_train_instances=max_train_instance_num,
+                max_tokens=2000,  # original: 1000
+                input_prefix="What is the correct answer to this question: ",
+                input_suffix="\nChoices:\n",
+                output_prefix="",
+                reference_prefix="(A) ",
+                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
+            )
+
+    metric_specs = (
+        (
+            get_basic_metric_specs([])
+            + [
+                MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
+            ]
+        )
+        if use_chain_of_thought_bool
+        else get_exact_match_metric_specs()
+    )
+
+    return RunSpec(
+        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["gpqa"],
+    )
+
+
+@run_spec_function("ifeval")
+def get_ifeval_spec() -> RunSpec:
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.ifeval_metrics.IFEvalMetric")]
+
+    return RunSpec(
+        name="ifeval",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=["ifeval"],
+    )
+
+
+@run_spec_function("wildbench")
+def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
+        args={
+            "subset": subset,
+            "use_model_outputs": use_model_outputs.lower() == "true",
+        },
+    )
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]
+
+    return RunSpec(
+        name="wildbench",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["wildbench"],
+    )
+
+
+@run_spec_function("bigcodebench")
+def get_bigcodebench_spec(version: str) -> RunSpec:
+
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
+    )
+
+    # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        input_prefix="",
+        output_prefix="",
+        max_tokens=2000,  # original: 1280
+        num_outputs=1,
+        temperature=0.0,
+        global_prefix="Please provide a self-contained Python script "
+        "that solves the following problem in a markdown code block:",
+    )
+    annotator_specs = [
+        AnnotatorSpec(class_name="helm.benchmark.annotation.bigcodebench_annotator.BigCodeBenchAnnotator")
+    ]
+    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.bigcodebench_metrics.BigCodeBenchMetric")]
+
+    return RunSpec(
+        name="bigcodebench",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["bigcodebench"],
+    )
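+
+
+# Note: run entries select these run spec functions by the name registered via
+# @run_spec_function and pass keyword arguments as strings. For example, the
+# run entry "gpqa:subset=main,use_chain_of_thought=True" calls get_gpqa_spec
+# with use_chain_of_thought="True", which is why get_gpqa_spec parses its
+# boolean flags with `.lower() == "true"`.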
+
+
+@run_spec_function("omni_math")
+def get_omni_math_spec() -> RunSpec:
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")
+
+    adapter_spec = AdapterSpec(
+        method=ADAPT_GENERATION,
+        input_prefix="",
+        output_prefix="",
+        max_tokens=2000,
+        num_outputs=1,
+        temperature=0.0,
+    )
+    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.omni_math_annotator.OmniMATHAnnotator")]
+    metric_specs = get_basic_metric_specs([]) + [
+        MetricSpec(class_name="helm.benchmark.metrics.omni_math_metrics.OmniMATHMetric")
+    ]
+
+    return RunSpec(
+        name="omni_math",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        annotators=annotator_specs,
+        metric_specs=metric_specs,
+        groups=["omni_math"],
+    )
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
index b6363a2af3..b034ff48ec 100644
--- a/src/helm/benchmark/run_specs/lite_run_specs.py
+++ b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -4,7 +4,6 @@
 from helm.benchmark.adaptation.adapter_spec import (
     ADAPT_GENERATION,
-    ADAPT_CHAT,
     ADAPT_MULTIPLE_CHOICE_JOINT,
     ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
     AdapterSpec,
@@ -27,7 +26,6 @@
 from helm.benchmark.runner import get_benchmark_output_path
 from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
 from helm.benchmark.metrics.metric import MetricSpec
-from helm.benchmark.annotation.annotator import AnnotatorSpec
 
 
 @run_spec_function("narrative_qa")
@@ -365,213 +363,3 @@ def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec
         metric_specs=get_open_ended_generation_metric_specs(),
         groups=["wmt_14"],
     )
-
-
-@run_spec_function("gpqa")
-def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
-    # Convert to bools and remove the str versions
-    use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
-    use_few_shot_bool: bool = use_few_shot.lower() == "true"
-    del use_chain_of_thought
-    del use_few_shot
-
-    if not subset.startswith("gpqa_"):
-        subset = "gpqa_" + subset
-
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
-    )
-    max_train_instance_num = 5 if use_few_shot_bool else 0
-
-    if use_few_shot_bool:
-        if use_chain_of_thought_bool:
-            adapter_spec = get_multiple_choice_adapter_spec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
-                max_tokens=2000,  # original: 1000
-                max_train_instances=max_train_instance_num,
-                instructions=(
-                    "Here are some example questions from experts. "
-                    "An explanation is given before the final answer. "
-                    "Answer the final question yourself, giving your reasoning beforehand."
-                ),
-                input_noun="Question",
-                input_suffix="\nChoices: \n",
-                reference_prefix="(A) ",
-                chain_of_thought_prefix="Let's think step by step: ",
-                chain_of_thought_suffix="The correct answer is ",
-                output_noun="",  # will be overwritten with output_prefix
-                output_prefix="",
-                global_suffix=(
-                    "Give step by step reasoning before you answer, and when you’re ready to answer, "
-                    'please use the format "The correct answer is (insert answer here)":'
-                ),
-            )
-        else:
-            adapter_spec = get_multiple_choice_adapter_spec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT,
-                max_train_instances=max_train_instance_num,
-                instructions=(
-                    "Here are some example questions from experts. "
-                    "An explanation is given before the final answer. "
-                    "Answer the final question yourself, giving your reasoning beforehand."
-                ),
-                input_noun="Question",
-                input_suffix="\nChoices: \n",
-                reference_prefix="(A) ",
-                output_noun="",  # will be overwritten with output_prefix
-                output_prefix="The correct answer is ",
-            )
-    else:
-        if use_chain_of_thought_bool:
-            adapter_spec = AdapterSpec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
-                max_train_instances=max_train_instance_num,
-                max_tokens=2000,  # original: 1000
-                input_prefix="What is the correct answer to this question: ",
-                input_suffix="\nChoices:\n",
-                output_prefix="",
-                reference_prefix="(A) ",
-                global_suffix=(
-                    "Let’s think step by step. Based on your reasoning, what is the single, "
-                    "most likely answer choice? Format your response as follows: "
-                    '"The correct answer is (insert answer here)".'
-                ),
-            )
-        else:
-            adapter_spec = AdapterSpec(
-                method=ADAPT_MULTIPLE_CHOICE_JOINT,
-                max_train_instances=max_train_instance_num,
-                max_tokens=2000,  # original: 1000
-                input_prefix="What is the correct answer to this question: ",
-                input_suffix="\nChoices:\n",
-                output_prefix="",
-                reference_prefix="(A) ",
-                global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
-            )
-
-    metric_specs = (
-        (
-            get_basic_metric_specs([])
-            + [
-                MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
-            ]
-        )
-        if use_chain_of_thought_bool
-        else get_exact_match_metric_specs()
-    )
-
-    return RunSpec(
-        name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=["gpqa"],
-    )
-
-
-@run_spec_function("ifeval")
-def get_ifeval_spec() -> RunSpec:
-
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
-    )
-
-    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.ifeval_metrics.IFEvalMetric")]
-
-    return RunSpec(
-        name="ifeval",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=["ifeval"],
-    )
-
-
-@run_spec_function("wildbench")
-def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:
-
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
-        args={
-            "subset": subset,
-            "use_model_outputs": use_model_outputs.lower() == "true",
-        },
-    )
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
-    )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
-    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]
-
-    return RunSpec(
-        name="wildbench",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        annotators=annotator_specs,
-        metric_specs=metric_specs,
-        groups=["wildbench"],
-    )
-
-
-@run_spec_function("bigcodebench")
-def get_bigcodebench_spec(version: str) -> RunSpec:
-
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
-    )
-
-    # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        input_prefix="",
-        output_prefix="",
-        max_tokens=2000,  # original: 1280
-        num_outputs=1,
-        temperature=0.0,
-        global_prefix="Please provide a self-contained Python script "
-        "that solves the following problem in a markdown code block:",
-    )
-    annotator_specs = [
-        AnnotatorSpec(class_name="helm.benchmark.annotation.bigcodebench_annotator.BigCodeBenchAnnotator")
-    ]
-    metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.bigcodebench_metrics.BigCodeBenchMetric")]
-
-    return RunSpec(
-        name="bigcodebench",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        annotators=annotator_specs,
-        metric_specs=metric_specs,
-        groups=["bigcodebench"],
-    )
-
-
-@run_spec_function("omni_math")
-def get_omni_math_spec() -> RunSpec:
-
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        input_prefix="",
-        output_prefix="",
-        max_tokens=2000,
-        num_outputs=1,
-        temperature=0.0,
-    )
-    annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.omni_math_annotator.OmniMATHAnnotator")]
-    metric_specs = get_basic_metric_specs([]) + [
-        MetricSpec(class_name="helm.benchmark.metrics.omni_math_metrics.OmniMATHMetric")
-    ]
-
-    return RunSpec(
-        name="omni_math",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        annotators=annotator_specs,
-        metric_specs=metric_specs,
-        groups=["omni_math"],
-    )