Merge pull request #16 from allenai/non-tango-pipeline
Non tango pipeline
AkshitaB authored Dec 5, 2023
2 parents 676cfa4 + b67874f commit b56a989
Showing 6 changed files with 399 additions and 6 deletions.
18 changes: 14 additions & 4 deletions CHANGELOG.md
@@ -6,7 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Updated the code that records fine-grained perplexity metrics per subdomain to also include perplexity over words, characters, and bytes, as well as bits per byte
- Added an option to track the average logit per token type
- Added a script that uses the tango steps as functions and bypasses the tango caching mechanism, for simpler execution

### Fixed

- Fixed incorrect paths in readme

### Changed

- Updated default image in tango-in-beaker.yml
33 changes: 33 additions & 0 deletions README.md
@@ -112,6 +112,39 @@ export GITHUB_TOKEN="<your token>" # Needed for beaker to clone the repo.
tango --settings tango-in-beaker.yml run configs/evaluation_template.jsonnet
```


### Running the simple pipeline as a single beaker job

The `llm_eval/run_lm_eval.py` script provides a way to run an evaluation as a single beaker
job with an associated result set. Arguments can be provided either in a config file (see the
example in `configs/run_lm_eval_example.jsonnet`) or directly on the command line (see the
argument documentation in the script). E.g.,

```commandline
python -m llm_eval.run_lm_eval --config-file configs/run_lm_eval_example.jsonnet
```
or
```commandline
python -m llm_eval.run_lm_eval --model lm::pretrained=EleutherAI/pythia-160m,revision=step140000 \
  --task arc_challenge arc_easy --split validation \
  --full-output-file predictions.jsonl --metrics-file metrics.json --model-max-length 2048 \
  --max-batch-tokens 4096 --num-recorded-inputs 3 --num-shots 0 --gsheet OLMo-evals-testing
```

To launch a job in beaker, it's easiest to use [beaker-gantry](https://github.com/allenai/beaker-gantry), e.g.,
```commandline
gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale \
  --beaker-image oyvindt/OLMoEvalLatest \
  --env 'HF_DATASETS_CACHE=/net/nfs.cirrascale/aristo/oyvindt/hf_datasets_cache' -- \
  python llm_eval/run_lm_eval.py \
  --model lm::pretrained=EleutherAI/pythia-160m,revision=step140000 \
  --task arc_challenge arc_easy boolq --split validation \
  --full-output-file /results/predictions.jsonl --metrics-file /results/metrics.json \
  --model-max-length 2048 --max-batch-tokens 4096 --num-recorded-inputs 3 \
  --num-shots 0 --gsheet OLMo-evals-testing
```
Alternatively, reference a config file, stored either on `nfs.cirrascale` or in a beaker dataset
(which can be mounted in the gantry command).
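
For example, with the config file in a beaker dataset (a minimal sketch: the dataset name
`my-eval-configs` is hypothetical, and the `--dataset` mount syntax should be checked against the
beaker-gantry documentation):
```commandline
gantry run --gpus 1 --venv base --workspace ai2/lm-eval --cluster ai2/aristo-cirrascale \
  --beaker-image oyvindt/OLMoEvalLatest \
  --dataset 'my-eval-configs:/configs' -- \
  python llm_eval/run_lm_eval.py --config-file /configs/run_lm_eval_example.jsonnet
```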

### Troubleshooting

If some error causes the workspace to go into a bad state (e.g., you get errors saying a step
should not be in a completed state), you can clear the workspace with
16 changes: 16 additions & 0 deletions configs/run_lm_eval_example.jsonnet
@@ -0,0 +1,16 @@
{
  model_path: "EleutherAI/pythia-160m",
  revision: "step140000",
  gpus_needed: 1,  // Not used here, but useful for reference
  model_max_length: 2048,
  max_batch_tokens: 20480,
  task: ["arc_challenge", "arc_easy"],
  split: "validation",
  limit: 10,
  num_shots: 0,
  random_subsample_seed: 1234,
  num_recorded_inputs: 3,
  full_output_file: "predictions.jsonl",  // Set to "/results/predictions.jsonl" in beaker jobs
  metrics_file: "metrics.json",  // Set to "/results/metrics.json" in beaker jobs
  gsheet: "OLMo-evals-testing"  // Set to null if no Google Sheet is needed
}
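
For reference, a rough command-line equivalent of the config above (a sketch assuming the
dash-style flags defined in `llm_eval/run_lm_eval.py`; `revision` folds into the `--model` spec,
and `gpus_needed` has no command-line counterpart):
```commandline
python -m llm_eval.run_lm_eval \
  --model lm::pretrained=EleutherAI/pythia-160m,revision=step140000 \
  --task arc_challenge arc_easy --split validation --limit 10 \
  --random-subsample-seed 1234 --num-shots 0 --num-recorded-inputs 3 \
  --model-max-length 2048 --max-batch-tokens 20480 \
  --full-output-file predictions.jsonl --metrics-file metrics.json \
  --gsheet OLMo-evals-testing
```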
245 changes: 245 additions & 0 deletions llm_eval/run_lm_eval.py
@@ -0,0 +1,245 @@
import argparse
import json
import logging
import os

import torch
from catwalk.dependencies.lm_eval.utils import simple_parse_args_string
from catwalk.models import MODELS
from catwalk.tasks import TASK_SETS
from catwalk.utils import filter_dict_keys, sanitize
from rjsonnet import evaluate_file
from tango.common.logging import initialize_logging

from llm_eval.steps import (
    ConstructCatwalkModel,
    ConstructTaskDict,
    PredictAndCalculateMetricsStep,
    ProcessOutputs,
    WriteOutputsAsRows,
)

# Catwalk eval script which is focused on LM models referenced on the fly
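# Example invocations (see README; the config-file form mirrors
# configs/run_lm_eval_example.jsonnet):
#   python -m llm_eval.run_lm_eval --config-file configs/run_lm_eval_example.jsonnet
#   python -m llm_eval.run_lm_eval --model lm::pretrained=EleutherAI/pythia-160m \
#       --task arc_challenge arc_easy --split validation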

_parser = argparse.ArgumentParser()
_parser.add_argument("--config-file", type=str, required=False, help="Config file for evaluation")
_parser.add_argument("--model", type=str, required=False, help="Name of model")
_parser.add_argument("--task", type=str, nargs="+")
_parser.add_argument("--task-file", type=str, help="Jsonl file with task specs")
_parser.add_argument("--split", type=str, default="validation")
_parser.add_argument("--batch-size", type=int, default=32)
_parser.add_argument("--max-batch-tokens", type=int, help="Limit batch size to max tokens")
_parser.add_argument(
    "--model-max-length", type=int, help="Max input length the model should accept"
)
_parser.add_argument("--num-shots", type=int, help="Number of examples in prompt")
_parser.add_argument(
    "--fewshot-seed",
    type=int,
    help="Random seed for picking fixed prompt examples, leave out for varied examples",
)
_parser.add_argument("--limit", type=int, help="Max number of instances for a task")
_parser.add_argument(
    "--full-output-file", type=str, default=None, help="Filename for verbose output"
)
_parser.add_argument("--metrics-file", type=str, default=None, help="Filename for metrics output")
_parser.add_argument(
    "--num-recorded-inputs",
    type=int,
    default=0,
    help="Number of sample model inputs in full output, for sanity checks",
)
_parser.add_argument("--model-path", type=str, help="Explicit path to load model from")
_parser.add_argument("--model-class", type=str, help="Custom Python class for loading model")
_parser.add_argument(
    "--random-subsample-seed",
    type=int,
    help="Random seed for subsampling task instances using limit",
)
_parser.add_argument("--gsheet", type=str, help="Name of Google Sheet for writing results")


def main(args: argparse.Namespace):
    initialize_logging(log_level="INFO")
    logger = logging.getLogger()
    if args.config_file and args.model:
        raise ValueError("Cannot specify both --config-file and --model arguments")
    args_dict = vars(args)
    args_raw = {}
    if args.config_file:
        args_raw = json.loads(evaluate_file(args.config_file))
    # Values from the config file override any command-line arguments
    for arg, value in args_raw.items():
        args_dict[arg] = value
    if not args_dict["model"]:
        if "::" in args_dict["model_path"]:
            args_dict["model"] = args_dict["model_path"]
        else:
            args_dict["model"] = f"lm::pretrained={args_dict['model_path'].replace('/', '-')}"

    # Some shenanigans to map from run_lm_eval argument structure to ConstructCatwalkModel format
    # TODO: fix this mess!
    hf_name = args_dict["model"]
    model_args = {}
    if hf_name not in MODELS:
        prefix_split = hf_name.split("::", 1)
        model_name = prefix_split[-1]
        # e.g., "pretrained=EleutherAI/pythia-160m,revision=step140000"
        #   -> {"pretrained": "EleutherAI/pythia-160m", "revision": "step140000"}
        model_args = simple_parse_args_string(model_name)
        if "pretrained" not in model_args:
            raise ValueError(f"Unknown model {hf_name}")
        hf_name = model_args["pretrained"]
        del model_args["pretrained"]
        for key in ["revision", "trust_remote_code"]:
            if args_raw.get(key):
                model_args[key] = args_raw[key]
    if args_dict["model_path"]:
        hf_name = args_dict["model_path"]

    model_obj = ConstructCatwalkModel(cache_results=False).run(
        model_path=hf_name, model_class=args_dict["model_class"], **model_args
    )

    task_args = [
        "limit",
        "split",
        "batch_size",
        "model_max_length",
        "max_batch_tokens",
        "num_shots",
        "fewshot_seed",
        "num_recorded_inputs",
        "random_subsample_seed",
    ]

    default_task_args = {k: v for k, v in args_dict.items() if k in task_args and v is not None}
    if "limit" not in default_task_args:
        default_task_args["limit"] = None  # To override weird default in run_catwalk.py

    # TODO: Should be able to remove these next lines until ConstructTaskDict
    tasks = []
    task_names = set()
    if args_dict["task_file"]:
        with open(args_dict["task_file"], "r") as file:
            for line in file:
                line = line.strip()
                if line and not line.startswith("#"):
                    task_spec = json.loads(line)
                    tasks.append(task_spec)
                    task_names.add(task_spec["name"])

    if args_dict["task"]:
        for task in args_dict["task"]:
            if task in TASK_SETS:
                raise ValueError("Task sets not supported!")
            if task in task_names:
                continue
            task_names.add(task)
            tasks.append({"name": task})

    if not tasks:
        raise ValueError("No tasks specified!")

    # Normalize the tasks, check that they exist, etc.
    task_dicts = []
    construct_task_step = ConstructTaskDict(cache_results=False)
    for task in tasks:
        task_dicts.append(
            construct_task_step.run(task_name=task["name"], **task, **default_task_args)
        )

    # Initial loading of model done here for early failures and overrides if needed
    if hasattr(model_obj, "_make_model"):
        logger.info("Loading model...")
        model_cached = model_obj._make_model(
            model_obj.pretrained_model_name_or_path,
            device_map="auto" if torch.cuda.device_count() > 0 else None,
            **model_obj.model_kwargs,
        ).eval()
        if not hasattr(model_cached, "tokenizer"):
            _ = model_obj._make_tokenizer()

    # unconditioned_prompt is taken separately from task_dict, so not on this list
    valid_model_args = [
        "split",
        "limit",
        "batch_size",
        "max_batch_tokens",
        "num_shots",
        "model_max_length",
        "fewshot_seed",
        "num_recorded_inputs",
        "random_subsample_seed",
    ]
    logged_output_keys = [
        "task",
        "model",
        "task_options",
        "metrics",
        "num_instances",
        "processing_time_seconds",
    ]

    verbose_output = []
    beaker_env_variables = {k: v for k, v in os.environ.items() if "BEAKER" in k}
    predict_step = PredictAndCalculateMetricsStep(cache_results=False)
    for task_dict in task_dicts:
        task_name = task_dict["name"]
        logger.info(f"Processing task: {task_name}")
        output = predict_step.run(
            model_obj, task_dict, **filter_dict_keys(task_dict, valid_model_args)
        )
        output = ProcessOutputs().run(output)
        if beaker_env_variables:
            output["beaker_info"] = beaker_env_variables
        logger.info(
            f"Results from task {task_name}: {filter_dict_keys(output, logged_output_keys)}"
        )
        logger.info(
            f"First instance details for task {task_name}: {output['instance_predictions'][0]}"
        )
        verbose_output.append(output)
    if args_dict["full_output_file"]:
        logger.info(f"Saving full output in {args_dict['full_output_file']}...")
        with open(args_dict["full_output_file"], "w") as file:
            for d in verbose_output:
                file.write(json.dumps(sanitize(d)) + "\n")

    num_tasks = len(verbose_output)
    if args_dict["gsheet"]:
        try:
            _ = WriteOutputsAsRows(cache_results=False).run(
                [hf_name] * num_tasks,
                verbose_output,
                task_dicts,
                simple_pipeline=True,
                gsheet=args_dict["gsheet"],
            )
        except Exception as e:
            logger.warning(f"Something went wrong when writing Google Sheet: {e}")

    if args_dict["metrics_file"]:
        logger.info(f"Saving metrics in {args_dict['metrics_file']}...")
        with open(args_dict["metrics_file"], "w") as file:
            for d in verbose_output:
                del d["instance_predictions"]  # Destructive
            file.write(json.dumps(sanitize({"metrics": verbose_output})))

    metrics_printed = []
    for d in verbose_output:
        metrics_printed.append(
            f" *** {d['task']} *** (n = {d['num_instances']}) [{d['task_options']}]"
        )
        metrics = {}
        # Code is a bit confused about nestedness of metrics
        for metric_name, metric in d["metrics"].items():
            if isinstance(metric, dict):
                metrics.update(metric)
            else:
                metrics[metric_name] = metric
        for metric_name, metric in metrics.items():
            metrics_printed.append(f" {metric_name}: {metric}")
        metrics_printed.append("-----------------")
    logger.info("Overall metrics:\n " + "\n".join(metrics_printed))


if __name__ == "__main__":
    main(_parser.parse_args())
17 changes: 15 additions & 2 deletions llm_eval/steps/run_catwalk.py
@@ -1,4 +1,5 @@
import copy
import json
import logging
import math
import os
@@ -348,6 +349,7 @@ def run(
        models: List[str],
        outputs: List[Dict],
        prediction_kwargs: List[Dict],
        simple_pipeline: bool = False,
        gsheet: Optional[str] = None,
    ) -> List:
        tsv_outputs = []
@@ -373,10 +375,21 @@
row["metric"] = metrics_dict[primary_metric]
row["processing_time"] = d["processing_time"]
row["num_instances"] = d["num_instances"]
row["tango_workspace"] = self.workspace.url
row["tango_step"] = self.unique_id
if not simple_pipeline:
row["tango_workspace"] = self.workspace.url
row["tango_step"] = self.unique_id

row.update(pred_kwargs)
if simple_pipeline:
row["all_metrics"] = json.dumps(metrics_dict)
row["beaker_id"] = d.get("beaker_info", {}).get("BEAKER_EXPERIMENT_ID", "")
if "name" in row:
del row["name"] # Stored as "task"
if "task_obj" in row:
del row["task_obj"]
if "num_recorded_inputs" in row:
del row["num_recorded_inputs"]

tsv_outputs.append(row)

if gsheet: