Move run specs for HELM capabilities to its module #3270

Merged · 2 commits · Jan 14, 2025
230 changes: 230 additions & 0 deletions src/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -0,0 +1,230 @@
"""Run spec functions for the HELM Capabilities leaderboard.

Website: https://crfm.stanford.edu/helm/capabilities/"""

from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_CHAT,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import get_multiple_choice_adapter_spec
from helm.benchmark.metrics.common_metric_specs import (
get_basic_metric_specs,
get_exact_match_metric_specs,
)
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.annotation.annotator import AnnotatorSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec


@run_spec_function("gpqa")
def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
use_few_shot_bool: bool = use_few_shot.lower() == "true"
del use_chain_of_thought
del use_few_shot

if not subset.startswith("gpqa_"):
subset = "gpqa_" + subset

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.gpqa_scenario.GPQAScenario", args={"subset": subset}
)
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_few_shot_bool:
if use_chain_of_thought_bool:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_tokens=2000, # original: 1000
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
chain_of_thought_prefix="Let's think step by step: ",
chain_of_thought_suffix="The correct answer is ",
output_noun="", # will be overwritten with output_prefix
output_prefix="",
global_suffix=(
"Give step by step reasoning before you answer, and when you’re ready to answer, "
'please use the format "The correct answer is (insert answer here)":'
),
)
else:
adapter_spec = get_multiple_choice_adapter_spec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
instructions=(
"Here are some example questions from experts. "
"An explanation is given before the final answer. "
"Answer the final question yourself, giving your reasoning beforehand."
),
input_noun="Question",
input_suffix="\nChoices: \n",
reference_prefix="(A) ",
output_noun="", # will be overwritten with output_prefix
output_prefix="The correct answer is ",
)
else:
if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=2000, # original: 1000
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=2000, # original: 1000
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
reference_prefix="(A) ",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)

metric_specs = (
(
get_basic_metric_specs([])
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
]
)
if use_chain_of_thought_bool
else get_exact_match_metric_specs()
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["gpqa"],
)


@run_spec_function("ifeval")
def get_ifeval_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.ifeval_scenario.IFEvalScenario")

adapter_spec = AdapterSpec(
method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
)

metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.ifeval_metrics.IFEvalMetric")]

return RunSpec(
name="ifeval",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
groups=["ifeval"],
)


@run_spec_function("wildbench")
def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
args={
"subset": subset,
"use_model_outputs": use_model_outputs.lower() == "true",
},
)

adapter_spec = AdapterSpec(
method=ADAPT_CHAT, input_prefix="", output_prefix="", max_tokens=2000, num_outputs=1, temperature=0.0
)
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.wildbench_annotator.WildBenchAnnotator")]
metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.wildbench_metrics.WildBenchScoreMetric")]

return RunSpec(
name="wildbench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["wildbench"],
)


@run_spec_function("bigcodebench")
def get_bigcodebench_spec(version: str) -> RunSpec:

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
)

# Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
adapter_spec = AdapterSpec(
method=ADAPT_GENERATION,
input_prefix="",
output_prefix="",
max_tokens=2000, # original: 1280
num_outputs=1,
temperature=0.0,
global_prefix="Please provide a self-contained Python script "
"that solves the following problem in a markdown code block:",
)
annotator_specs = [
AnnotatorSpec(class_name="helm.benchmark.annotation.bigcodebench_annotator.BigCodeBenchAnnotator")
]
metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.bigcodebench_metrics.BigCodeBenchMetric")]

return RunSpec(
name="bigcodebench",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["bigcodebench"],
)


@run_spec_function("omni_math")
def get_omni_math_spec() -> RunSpec:

scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")

adapter_spec = AdapterSpec(
method=ADAPT_GENERATION,
input_prefix="",
output_prefix="",
max_tokens=2000,
num_outputs=1,
temperature=0.0,
)
annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.omni_math_annotator.OmniMATHAnnotator")]
metric_specs = get_basic_metric_specs([]) + [
MetricSpec(class_name="helm.benchmark.metrics.omni_math_metrics.OmniMATHMetric")
]

return RunSpec(
name="omni_math",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
metric_specs=metric_specs,
groups=["omni_math"],
)
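
Usage note (not part of the diff): a minimal sketch of how the newly registered run spec functions could be exercised directly, assuming only what the diff shows (the module path src/helm/benchmark/run_specs/capabilities_run_specs.py and the get_gpqa_spec signature). In normal use these functions are resolved by name from helm-run run entries such as "gpqa:subset=main,use_chain_of_thought=True" rather than imported by hand; the exact CLI flag for passing run entries depends on the HELM version.

# Illustrative sketch only; assumes the module path added in this PR.
from helm.benchmark.run_specs.capabilities_run_specs import get_gpqa_spec

# Run entry arguments arrive as strings, so booleans are passed as "True"/"False"
# and converted to bools inside the function.
spec = get_gpqa_spec(subset="main", use_chain_of_thought="True", use_few_shot="False")

# Expected per the code above (not verified output): the subset is normalized to
# "gpqa_main", and the chain-of-thought adapter and metrics are selected.
print(spec.name)                                  # gpqa:subset=gpqa_main,use_chain_of_thought=True
print(spec.adapter_spec.method)                   # string value of ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT
print([m.class_name for m in spec.metric_specs])  # includes ChainOfThoughtMetric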