From 2c2a93dc3fc65fc3d6a77fe97ec9df1108551075 Mon Sep 17 00:00:00 2001
From: Subhaditya Mukherjee <26865436+SubhadityaMukherjee@users.noreply.github.com>
Date: Thu, 2 Jan 2025 23:13:14 +0100
Subject: [PATCH] Tiny bug: oml_id must be an int in load_openml_task (#686)

* oml_id must be an int

* Catch the error earlier to report better error messages

* Also give an early warning if the task id is defined as a str in the file

---------

Co-authored-by: PGijsbers
---
 amlb/benchmarks/file.py                   |  5 ++++
 amlb/benchmarks/openml.py                 | 33 +++++++++++++++--------
 amlb/datasets/openml.py                   |  6 ++---
 tests/unit/amlb/benchmarks/test_openml.py |  4 +--
 4 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/amlb/benchmarks/file.py b/amlb/benchmarks/file.py
index ec1f0f86f..034af74cc 100644
--- a/amlb/benchmarks/file.py
+++ b/amlb/benchmarks/file.py
@@ -35,4 +35,9 @@ def load_file_benchmark(
     log.info("Loading benchmark definitions from %s.", benchmark_file)
     tasks = config_load(benchmark_file)
     benchmark_name, _ = os.path.splitext(os.path.basename(benchmark_file))
+    for task in tasks:
+        if task["openml_task_id"] is not None and not isinstance(
+            task["openml_task_id"], int
+        ):
+            raise TypeError(f"OpenML task id for task {task.name!r} must be an integer.")
     return benchmark_name, benchmark_file, tasks
diff --git a/amlb/benchmarks/openml.py b/amlb/benchmarks/openml.py
index a0e8ae35c..4f1201060 100644
--- a/amlb/benchmarks/openml.py
+++ b/amlb/benchmarks/openml.py
@@ -6,6 +6,7 @@
 
 import openml
 import pandas as pd
+from openml import OpenMLTask, OpenMLDataset
 
 from amlb.utils import Namespace, str_sanitize
 
@@ -20,7 +21,13 @@ def is_openml_benchmark(benchmark: str) -> bool:
 
 def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]]:
     """Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y."""
-    domain, oml_type, oml_id = benchmark.split("/")
+    domain, oml_type, oml_id_str = benchmark.split("/")
+    try:
+        oml_id = int(oml_id_str)
+    except ValueError:
+        raise ValueError(
+            f"Could not convert OpenML id {oml_id_str!r} in {benchmark!r} to integer."
+        )
 
     if domain == "test.openml":
         log.debug("Setting openml server to the test server.")
@@ -34,7 +41,7 @@ def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]
         openml.config.set_retry_policy("robot")
 
     if oml_type == "t":
-        tasks = load_openml_task(domain, oml_id)
+        tasks = load_openml_task_as_definition(domain, oml_id)
     elif oml_type == "s":
         tasks = load_openml_tasks_from_suite(domain, oml_id)
     else:
@@ -44,7 +51,7 @@ def load_oml_benchmark(benchmark: str) -> tuple[str, str | None, list[Namespace]
     return benchmark, None, tasks
 
 
-def load_openml_tasks_from_suite(domain: str, oml_id: str) -> list[Namespace]:
+def load_openml_tasks_from_suite(domain: str, oml_id: int) -> list[Namespace]:
     log.info("Loading openml suite %s.", oml_id)
     suite = openml.study.get_suite(oml_id)
     # Here we know the (task, dataset) pairs so only download dataset meta-data is sufficient
@@ -66,18 +73,22 @@ def load_openml_tasks_from_suite(domain: str, oml_id: str) -> list[Namespace]:
     return tasks
 
 
-def load_openml_task(domain: str, oml_id: str) -> list[Namespace]:
+def load_openml_task_as_definition(domain: str, oml_id: int) -> list[Namespace]:
     log.info("Loading openml task %s.", oml_id)
-    # We first have the retrieve the task because we don't know the dataset id
-    t = openml.tasks.get_task(oml_id, download_data=False, download_qualities=False)
-    data = openml.datasets.get_dataset(
-        t.dataset_id, download_data=False, download_qualities=False
-    )
+    task, data = load_openml_task_and_data(oml_id)
     return [
         Namespace(
             name=str_sanitize(data.name),
             description=data.description,
-            openml_task_id=t.id,
-            id="{}.org/t/{}".format(domain, t.id),
+            openml_task_id=task.id,
+            id="{}.org/t/{}".format(domain, task.id),
         )
     ]
+
+
+def load_openml_task_and_data(task_id: int) -> tuple[OpenMLTask, OpenMLDataset]:
+    task = openml.tasks.get_task(task_id, download_data=False, download_qualities=False)
+    data = openml.datasets.get_dataset(
+        task.dataset_id, download_data=False, download_qualities=False
+    )
+    return task, data
diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
index 9f5839224..f6d15ee20 100644
--- a/amlb/datasets/openml.py
+++ b/amlb/datasets/openml.py
@@ -21,6 +21,7 @@
 import openml as oml
 import xmltodict
 
+from ..benchmarks.openml import load_openml_task_and_data
 from ..data import AM, DF, Dataset, DatasetType, Datasplit, Feature
 from ..datautils import impute_array
 from ..resources import config as rconfig, get as rget
@@ -71,10 +72,7 @@ def load(self, task_id=None, dataset_id=None, fold=0):
                     dataset_id, task_id
                 )
             )
-            task = oml.tasks.get_task(task_id, download_qualities=False)
-            dataset = oml.datasets.get_dataset(
-                task.dataset_id, download_qualities=False
-            )
+            task, dataset = load_openml_task_and_data(task_id)
             _, nfolds, _ = task.get_split_dimensions()
             if fold >= nfolds:
                 raise ValueError(
diff --git a/tests/unit/amlb/benchmarks/test_openml.py b/tests/unit/amlb/benchmarks/test_openml.py
index f8c6bf5c9..0a1d8be3d 100644
--- a/tests/unit/amlb/benchmarks/test_openml.py
+++ b/tests/unit/amlb/benchmarks/test_openml.py
@@ -3,7 +3,7 @@
 
 from amlb.benchmarks.openml import (
     is_openml_benchmark,
-    load_openml_task,
+    load_openml_task_as_definition,
     load_oml_benchmark,
 )
 from amlb.utils import Namespace
@@ -35,7 +35,7 @@ def test_load_openml_task(mocker, oml_task, oml_dataset):
     mocker.patch(
         "openml.datasets.get_dataset", new=mocker.Mock(return_value=oml_dataset)
     )
-    [task] = load_openml_task("openml", oml_task.id)
+    [task] = load_openml_task_as_definition("openml", oml_task.id)
     assert task.name == oml_dataset.name
     assert task.description == oml_dataset.description
     assert task.openml_task_id == oml_task.id
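
A minimal sketch (not part of the patch) of how the new validation behaves,
assuming a pytest setup; the test name is illustrative, and the match pattern
mirrors the error message introduced in load_oml_benchmark above:

    import pytest

    from amlb.benchmarks.openml import load_oml_benchmark


    def test_load_oml_benchmark_rejects_non_integer_id():
        # "abc" cannot be parsed as an int, so the ValueError is raised
        # while parsing the id, before any request to the OpenML server.
        with pytest.raises(ValueError, match="Could not convert OpenML id"):
            load_oml_benchmark("openml/t/abc")

The analogous check in load_file_benchmark raises a TypeError at load time when
a benchmark definition file quotes the task id (e.g. openml_task_id: "59"), so
a misconfigured file fails fast instead of partway through a run.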