From a5d192c77daedbb96648fdcf31f0bf3239ae134d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 27 Nov 2024 18:25:43 +0200
Subject: [PATCH] Remove from_config and only keep from_configs
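
from_config(config) applied the backward-compatibility transform rules to a
single Namespace and built the Resources singleton from it; from_configs does
the same for any number of Namespaces, so calling it with a single argument is
equivalent and the one-config variant is redundant. Call sites only need to
switch the name. A minimal migration sketch (config keys are illustrative,
borrowed from the test fixtures below, not a complete configuration; later
configs take precedence when several are passed):

    from amlb.resources import from_configs
    from amlb.utils import Namespace as ns

    base = ns(input_dir="my_input", output_dir="my_output",
              user_dir="my_user_dir", root_dir="my_root_dir")
    overrides = ns(seed="auto")  # optional extra layer

    # before: resources = from_config(base)
    resources = from_configs(base, overrides)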
---
amlb/resources.py                             | 226 ++++++++++++------
runscores.py                                  |  34 ---
.../datasets/file/test_file_dataloader.py     | 178 +++++++++-----
.../datasets/openml/test_openml_dataloader.py |  79 +++---
4 files changed, 321 insertions(+), 196 deletions(-)
delete mode 100644 runscores.py
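
runscores.py was a small standalone helper that bootstrapped the resources via
from_config and then scored a predictions file; it is deleted here rather than
ported. Anyone who still relies on it should be able to reproduce the same
behaviour through from_configs — a rough, untested sketch mirroring the deleted
script (the predictions path is a placeholder):

    import os
    import amlb
    import amlb.logger
    from amlb.utils import config_load

    amlb.logger.setup(root_level="DEBUG", console_level="INFO")
    config = config_load("resources/config.yaml")
    config.run_mode = "script"
    config.root_dir = os.path.dirname(__file__)
    amlb.resources.from_configs(config)  # was: amlb.resources.from_config(config)
    scores = amlb.TaskResult.score_from_predictions_file("predictions.csv")
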
diff --git a/amlb/resources.py b/amlb/resources.py
index 617a4d609..0e37b240d 100644
--- a/amlb/resources.py
+++ b/amlb/resources.py
@@ -2,6 +2,7 @@
**resources** modules exposes a singleton ``Resources`` instance providing easy access to app configuration properties,
as well as handy methods to access other resources like *automl frameworks* and *benchmark definitions*
"""
+
from __future__ import annotations
import copy
@@ -13,7 +14,15 @@
from amlb.benchmarks.parser import benchmark_load
from amlb.frameworks import default_tag, load_framework_definitions
-from .utils import Namespace, lazy_property, memoize, normalize_path, run_cmd, str_sanitize, touch
+from .utils import (
+ Namespace,
+ lazy_property,
+ memoize,
+ normalize_path,
+ run_cmd,
+ str_sanitize,
+ touch,
+)
from .utils.config import TransformRule, config_load, transform_config
from .__version__ import __version__, _dev_version as dev
@@ -22,7 +31,6 @@
class Resources:
-
@staticmethod
def _normalize(config: Namespace, replace=None):
def nz_path(path):
@@ -34,8 +42,10 @@ def nz_path(path):
for k, v in config:
if isinstance(v, Namespace):
normalized[k] = Resources._normalize(v, replace=replace)
- elif re.search(r'_(dir|file|cmd)s?$', k):
- normalized[k] = [nz_path(p) for p in v] if isinstance(v, list) else nz_path(v)
+ elif re.search(r"_(dir|file|cmd)s?$", k):
+ normalized[k] = (
+ [nz_path(p) for p in v] if isinstance(v, list) else nz_path(v)
+ )
return normalized
def __init__(self, config: Namespace):
@@ -51,20 +61,16 @@ def __init__(self, config: Namespace):
log.debug("Using config:\n%s", self.config)
# allowing to load custom modules from user directory
- sys.path.append(common_dirs['user'])
+ sys.path.append(common_dirs["user"])
log.debug("Extended Python sys.path to user directory: %s.", sys.path)
@lazy_property
def project_info(self):
- split_url = self.config.project_repository.split('#', 1)
+ split_url = self.config.project_repository.split("#", 1)
repo = split_url[0]
tag = None if len(split_url) == 1 else split_url[1]
- branch = tag or 'master'
- return Namespace(
- repo=repo,
- tag=tag,
- branch=branch
- )
+ branch = tag or "master"
+ return Namespace(repo=repo, tag=tag, branch=branch)
@lazy_property
def git_info(self):
@@ -88,11 +94,7 @@ def git(cmd, defval=None):
repo = branch = commit = na
tags = status = []
return Namespace(
- repo=repo,
- branch=branch,
- commit=commit,
- tags=tags,
- status=status
+ repo=repo, branch=branch, commit=commit, tags=tags, status=status
)
@lazy_property
@@ -109,17 +111,19 @@ def app_version(self):
return "{v} [{details}]".format(v=v, details=", ".join(tokens))
def seed(self, fold=None):
- if isinstance(fold, int) and str(self.config.seed).lower() in ['auto']:
- return fold+self._seed
+ if isinstance(fold, int) and str(self.config.seed).lower() in ["auto"]:
+ return fold + self._seed
else:
return self._seed
@lazy_property
def _seed(self):
- if str(self.config.seed).lower() in ['none', '']:
+ if str(self.config.seed).lower() in ["none", ""]:
return None
- elif str(self.config.seed).lower() in ['auto']:
- return random.randint(1, (1 << 31) - 1) # limiting seed to signed int32 for R frameworks
+ elif str(self.config.seed).lower() in ["auto"]:
+ return random.randint(
+ 1, (1 << 31) - 1
+ ) # limiting seed to signed int32 for R frameworks
else:
return self.config.seed
@@ -132,20 +136,34 @@ def framework_definition(self, name, tag=None):
if tag is None:
tag = default_tag
if tag not in self._frameworks:
- raise ValueError("Incorrect tag `{}`: only those among {} are allowed.".format(tag, self.config.frameworks.tags))
+ raise ValueError(
+ "Incorrect tag `{}`: only those among {} are allowed.".format(
+ tag, self.config.frameworks.tags
+ )
+ )
frameworks = self._frameworks[tag]
log.debug("Available framework definitions:\n%s", frameworks)
framework = next((f for n, f in frameworks if n.lower() == lname), None)
# TODO: Clean up this workflow and error messaging as part of #518
- base_framework = next((f for n, f in self._frameworks[default_tag] if n.lower() == lname), None)
- if framework and framework['removed']:
- raise ValueError(f"Framework definition `{name}` has been removed from the benchmark: {framework['removed']}")
- if not framework and (base_framework and base_framework['removed']):
- raise ValueError(f"Framework definition `{name}` has been removed from the benchmark: {base_framework['removed']}")
+ base_framework = next(
+ (f for n, f in self._frameworks[default_tag] if n.lower() == lname), None
+ )
+ if framework and framework["removed"]:
+ raise ValueError(
+ f"Framework definition `{name}` has been removed from the benchmark: {framework['removed']}"
+ )
+ if not framework and (base_framework and base_framework["removed"]):
+ raise ValueError(
+ f"Framework definition `{name}` has been removed from the benchmark: {base_framework['removed']}"
+ )
if not framework:
- raise ValueError(f"Incorrect framework `{name}`: not listed in {self.config.frameworks.definition_file}.")
- if framework['abstract']:
- raise ValueError(f"Framework definition `{name}` is abstract and cannot be run directly.")
+ raise ValueError(
+ f"Incorrect framework `{name}`: not listed in {self.config.frameworks.definition_file}."
+ )
+ if framework["abstract"]:
+ raise ValueError(
+ f"Framework definition `{name}` is abstract and cannot be run directly."
+ )
return framework, framework.name
@lazy_property
@@ -161,7 +179,11 @@ def constraint_definition(self, name):
"""
constraint = self._constraints[name.lower()]
if not constraint:
- raise ValueError("Incorrect constraint definition `{}`: not listed in {}.".format(name, self.config.benchmarks.constraints_file))
+ raise ValueError(
+ "Incorrect constraint definition `{}`: not listed in {}.".format(
+ name, self.config.benchmarks.constraints_file
+ )
+ )
return constraint, constraint.name
@lazy_property
@@ -191,11 +213,15 @@ def benchmark_definition(self, name, defaults=None):
:param defaults: defaults used as a base config for each task in the benchmark definition
:return:
"""
- hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load(name, self.config.benchmarks.definition_dir)
+ hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load(
+ name, self.config.benchmarks.definition_dir
+ )
- defaults = Namespace.merge(defaults, hard_defaults, Namespace(name='__defaults__'))
+ defaults = Namespace.merge(
+ defaults, hard_defaults, Namespace(name="__defaults__")
+ )
for task in tasks:
- task |= defaults # add missing keys from hard defaults + defaults
+ task |= defaults # add missing keys from hard defaults + defaults
self._validate_task(task)
self._validate_task(defaults, lenient=True)
@@ -206,66 +232,98 @@ def benchmark_definition(self, name, defaults=None):
def _validate_task(self, task, lenient=False):
missing = []
- for conf in ['name']:
+ for conf in ["name"]:
if task[conf] is None:
missing.append(conf)
if not lenient and len(missing) > 0:
- raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task))
-
- for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']:
+ raise ValueError(
+ "{missing} mandatory properties as missing in task definition {taskdef}.".format(
+ missing=missing, taskdef=task
+ )
+ )
+
+ for conf in [
+ "max_runtime_seconds",
+ "cores",
+ "folds",
+ "max_mem_size_mb",
+ "min_vol_size_mb",
+ "quantile_levels",
+ ]:
if task[conf] is None:
task[conf] = self.config.benchmarks.defaults[conf]
- log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))
+ log.debug(
+ "Config `{config}` not set for task {name}, using default `{value}`.".format(
+ config=conf, name=task.name, value=task[conf]
+ )
+ )
- conf = 'id'
+ conf = "id"
if task[conf] is None:
- task[conf] = ("openml.org/t/{}".format(task.openml_task_id) if task['openml_task_id'] is not None
- else "openml.org/d/{}".format(task.openml_dataset_id) if task['openml_dataset_id'] is not None
- else ((task.dataset['id'] if isinstance(task.dataset, (dict, Namespace))
- else task.dataset if isinstance(task.dataset, str)
- else None) or task.name) if task['dataset'] is not None
- else None)
+ task[conf] = (
+ "openml.org/t/{}".format(task.openml_task_id)
+ if task["openml_task_id"] is not None
+ else "openml.org/d/{}".format(task.openml_dataset_id)
+ if task["openml_dataset_id"] is not None
+ else (
+ (
+ task.dataset["id"]
+ if isinstance(task.dataset, (dict, Namespace))
+ else task.dataset
+ if isinstance(task.dataset, str)
+ else None
+ )
+ or task.name
+ )
+ if task["dataset"] is not None
+ else None
+ )
if not lenient and task[conf] is None:
- raise ValueError("task definition must contain an ID or one property "
- "among ['openml_task_id', 'dataset'] to create an ID, "
- "but task definition is {task}".format(task=str(task)))
+ raise ValueError(
+ "task definition must contain an ID or one property "
+ "among ['openml_task_id', 'dataset'] to create an ID, "
+ "but task definition is {task}".format(task=str(task))
+ )
- conf = 'metric'
+ conf = "metric"
if task[conf] is None:
task[conf] = None
- conf = 'ec2_instance_type'
+ conf = "ec2_instance_type"
if task[conf] is None:
i_series = self.config.aws.ec2.instance_type.series
i_map = self.config.aws.ec2.instance_type.map
if str(task.cores) in i_map:
i_size = i_map[str(task.cores)]
elif task.cores > 0:
- supported_cores = list(map(int, Namespace.dict(i_map).keys() - {'default'}))
+ supported_cores = list(
+ map(int, Namespace.dict(i_map).keys() - {"default"})
+ )
supported_cores.sort()
- cores = next((c for c in supported_cores if c >= task.cores), 'default')
+ cores = next((c for c in supported_cores if c >= task.cores), "default")
i_size = i_map[str(cores)]
else:
i_size = i_map.default
- task[conf] = '.'.join([i_series, i_size])
- log.debug("Config `{config}` not set for task {name}, using default selection `{value}`.".format(config=conf, name=task.name, value=task[conf]))
-
- conf = 'ec2_volume_type'
+ task[conf] = ".".join([i_series, i_size])
+ log.debug(
+ "Config `{config}` not set for task {name}, using default selection `{value}`.".format(
+ config=conf, name=task.name, value=task[conf]
+ )
+ )
+
+ conf = "ec2_volume_type"
if task[conf] is None:
task[conf] = self.config.aws.ec2.volume_type
- log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))
+ log.debug(
+ "Config `{config}` not set for task {name}, using default `{value}`.".format(
+ config=conf, name=task.name, value=task[conf]
+ )
+ )
__INSTANCE__: Resources | None = None
-def from_config(config: Namespace):
- global __INSTANCE__
- transform_config(config, _backward_compatibility_config_rules_)
- __INSTANCE__ = Resources(config)
- return __INSTANCE__
-
-
def from_configs(*configs: Namespace):
global __INSTANCE__
for c in configs:
@@ -286,18 +344,17 @@ def config():
def output_dirs(root, session=None, subdirs=None, create=False):
- root = root if root is not None else '.'
+ root = root if root is not None else "."
if create and not os.path.exists(root):
touch(root, as_dir=True)
dirs = Namespace(
- root=root,
- session=os.path.join(root, session) if session is not None else root
+ root=root, session=os.path.join(root, session) if session is not None else root
)
- subdirs = ([] if subdirs is None
- else [subdirs] if isinstance(subdirs, str)
- else subdirs)
+ subdirs = (
+ [] if subdirs is None else [subdirs] if isinstance(subdirs, str) else subdirs
+ )
for d in subdirs:
dirs[d] = os.path.join(dirs.session, d)
@@ -307,11 +364,22 @@ def output_dirs(root, session=None, subdirs=None, create=False):
_backward_compatibility_config_rules_ = [
- TransformRule(from_key='exit_on_error', to_key='job_scheduler.exit_on_job_failure'),
- TransformRule(from_key='parallel_jobs', to_key='job_scheduler.parallel_jobs'),
- TransformRule(from_key='max_parallel_jobs', to_key='job_scheduler.max_parallel_jobs'),
- TransformRule(from_key='delay_between_jobs', to_key='job_scheduler.delay_between_jobs'),
- TransformRule(from_key='monitoring.frequency_seconds', to_key='monitoring.interval_seconds'),
- TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'),
- TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'),
+ TransformRule(from_key="exit_on_error", to_key="job_scheduler.exit_on_job_failure"),
+ TransformRule(from_key="parallel_jobs", to_key="job_scheduler.parallel_jobs"),
+ TransformRule(
+ from_key="max_parallel_jobs", to_key="job_scheduler.max_parallel_jobs"
+ ),
+ TransformRule(
+ from_key="delay_between_jobs", to_key="job_scheduler.delay_between_jobs"
+ ),
+ TransformRule(
+ from_key="monitoring.frequency_seconds", to_key="monitoring.interval_seconds"
+ ),
+ TransformRule(
+ from_key="aws.query_frequency_seconds", to_key="aws.query_interval_seconds"
+ ),
+ TransformRule(
+ from_key="aws.ec2.monitoring.cpu.query_frequency_seconds",
+ to_key="aws.ec2.monitoring.cpu.query_interval_seconds",
+ ),
]
diff --git a/runscores.py b/runscores.py
deleted file mode 100644
index 44c868c72..000000000
--- a/runscores.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import argparse
-import os
-
-# prevent asap other modules from defining the root logger using basicConfig
-import amlb.logger
-
-from ruamel import yaml
-
-import amlb
-from amlb import log
-from amlb.utils import config_load
-
-root_dir = os.path.dirname(__file__)
-
-parser = argparse.ArgumentParser()
-parser.add_argument('predictions', type=str,
- help='The predictions file to load and compute the scores for.')
-args = parser.parse_args()
-
-# script_name = os.path.splitext(os.path.basename(__file__))[0]
-# log_dir = os.path.join(args.outdir if args.outdir else '.', 'logs')
-# os.makedirs(log_dir, exist_ok=True)
-# now_str = datetime_iso(date_sep='', time_sep='')
-amlb.logger.setup(root_level='DEBUG', console_level='INFO')
-
-config = config_load("resources/config.yaml")
-config.run_mode = 'script'
-config.root_dir = root_dir
-config.script = os.path.basename(__file__)
-amlb.resources.from_config(config)
-
-scores = amlb.TaskResult.score_from_predictions_file(args.predictions)
-log.info("\n\nScores computed from %s:\n%s", args.predictions, yaml.dump(dict(scores), default_flow_style=False))
-
diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py
index b46379724..ccce03e0b 100644
--- a/tests/unit/amlb/datasets/file/test_file_dataloader.py
+++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py
@@ -6,18 +6,18 @@
import pytest
import pandas.api.types as pat
-from amlb.resources import from_config
+from amlb.resources import from_configs
from amlb.data import DatasetType
from amlb.datasets.file import FileLoader
from amlb.utils import Namespace as ns, path_from_split, split_path
here = os.path.realpath(os.path.dirname(__file__))
-res = os.path.join(here, 'resources')
+res = os.path.join(here, "resources")
@pytest.fixture(autouse=True)
def file_config():
- return from_config(
+ return from_configs(
ns(
input_dir="my_input",
output_dir="my_output",
@@ -37,7 +37,7 @@ def test_load_binary_task_csv(file_loader):
ds_def = ns(
train=os.path.join(res, "kc2_train.csv"),
test=os.path.join(res, "kc2_test.csv"),
- target="problems"
+ target="problems",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.binary
@@ -47,13 +47,12 @@ def test_load_binary_task_csv(file_loader):
_assert_kc2_features(ds, ds_def)
-
@pytest.mark.use_disk
def test_load_binary_task_arff(file_loader):
ds_def = ns(
train=os.path.join(res, "kc2_train.arff"),
test=os.path.join(res, "kc2_test.arff"),
- target="problems"
+ target="problems",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.binary
@@ -69,20 +68,36 @@ def _assert_kc2_features(dataset, definition):
target_values = ["no", "yes"]
_assert_target(dataset.target, name=definition.target, values=target_values)
- assert all([p.data_type in ['int', 'float'] for p in dataset.predictors])
+ assert all([p.data_type in ["int", "float"] for p in dataset.predictors])
assert all([p.values is None for p in dataset.predictors])
assert not any([p.is_target for p in dataset.predictors])
assert not any([p.has_missing_values for p in dataset.predictors])
- floats = [p.name for p in dataset.predictors if p.data_type == 'float']
- ints = [p.name for p in dataset.predictors if p.data_type == 'int']
- assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all()
- assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all()
+ floats = [p.name for p in dataset.predictors if p.data_type == "float"]
+ ints = [p.name for p in dataset.predictors if p.data_type == "int"]
+ assert (
+ dataset.train.X.dtypes.filter(items=floats)
+ .apply(lambda dt: pd.api.types.is_float_dtype(dt))
+ .all()
+ )
+ assert (
+ dataset.train.X.dtypes.filter(items=ints)
+ .apply(lambda dt: pd.api.types.is_integer_dtype(dt))
+ .all()
+ )
assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0])
normalize = dataset.target.normalize
- assert list(normalize(dataset.train.y.squeeze().unique())) == list(normalize(dataset.test.y.squeeze().unique())) == target_values
- assert list(np.unique(dataset.train.y_enc)) == list(np.unique(dataset.test.y_enc)) == [0, 1]
+ assert (
+ list(normalize(dataset.train.y.squeeze().unique()))
+ == list(normalize(dataset.test.y.squeeze().unique()))
+ == target_values
+ )
+ assert (
+ list(np.unique(dataset.train.y_enc))
+ == list(np.unique(dataset.test.y_enc))
+ == [0, 1]
+ )
@pytest.mark.use_disk
@@ -90,7 +105,7 @@ def test_load_multiclass_task_csv(file_loader):
ds_def = ns(
train=os.path.join(res, "iris_train.csv"),
test=os.path.join(res, "iris_test.csv"),
- target="class"
+ target="class",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.multiclass
@@ -105,10 +120,12 @@ def test_load_multiclass_task_with_num_target_no_type_csv(file_loader):
ds_def = ns(
train=os.path.join(res, "iris_num_train.csv"),
test=os.path.join(res, "iris_num_test.csv"),
- target="class"
+ target="class",
)
ds = file_loader.load(ds_def)
- assert ds.type is DatasetType.regression, "file loader should detect num target as regression by default"
+ assert (
+ ds.type is DatasetType.regression
+ ), "file loader should detect num target as regression by default"
@pytest.mark.use_disk
@@ -117,7 +134,7 @@ def test_load_multiclass_task_with_num_target_csv(file_loader):
train=os.path.join(res, "iris_num_train.csv"),
test=os.path.join(res, "iris_num_test.csv"),
target="class",
- type="multiclass"
+ type="multiclass",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.multiclass
@@ -132,7 +149,7 @@ def test_load_multiclass_task_arff(file_loader):
ds_def = ns(
train=os.path.join(res, "iris_train.arff"),
test=os.path.join(res, "iris_test.arff"),
- target="class"
+ target="class",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.multiclass
@@ -145,23 +162,43 @@ def test_load_multiclass_task_arff(file_loader):
def _assert_iris_features(dataset, definition, num_target=False):
assert len(dataset.features) == 5
assert len(dataset.predictors) == 4
- target_values = ["1", "2", "3"] if num_target else ["iris-setosa", "iris-versicolor", "iris-virginica"] # values are normalized
+ target_values = (
+ ["1", "2", "3"]
+ if num_target
+ else ["iris-setosa", "iris-versicolor", "iris-virginica"]
+ ) # values are normalized
_assert_target(dataset.target, name=definition.target, values=target_values)
- assert all([p.data_type in ['int', 'float'] for p in dataset.predictors])
+ assert all([p.data_type in ["int", "float"] for p in dataset.predictors])
assert all([p.values is None for p in dataset.predictors])
assert not any([p.is_target for p in dataset.predictors])
assert not any([p.has_missing_values for p in dataset.predictors])
- floats = [p.name for p in dataset.predictors if p.data_type == 'float']
- ints = [p.name for p in dataset.predictors if p.data_type == 'int']
- assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all()
- assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all()
+ floats = [p.name for p in dataset.predictors if p.data_type == "float"]
+ ints = [p.name for p in dataset.predictors if p.data_type == "int"]
+ assert (
+ dataset.train.X.dtypes.filter(items=floats)
+ .apply(lambda dt: pd.api.types.is_float_dtype(dt))
+ .all()
+ )
+ assert (
+ dataset.train.X.dtypes.filter(items=ints)
+ .apply(lambda dt: pd.api.types.is_integer_dtype(dt))
+ .all()
+ )
assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0])
normalize = dataset.target.normalize
- assert list(normalize(dataset.train.y.squeeze().unique())) == list(normalize(dataset.test.y.squeeze().unique())) == target_values
- assert list(np.unique(dataset.train.y_enc)) == list(np.unique(dataset.test.y_enc)) == [0, 1, 2]
+ assert (
+ list(normalize(dataset.train.y.squeeze().unique()))
+ == list(normalize(dataset.test.y.squeeze().unique()))
+ == target_values
+ )
+ assert (
+ list(np.unique(dataset.train.y_enc))
+ == list(np.unique(dataset.test.y_enc))
+ == [0, 1, 2]
+ )
@pytest.mark.use_disk
@@ -169,7 +206,7 @@ def test_load_regression_task_csv(file_loader):
ds_def = ns(
train=os.path.join(res, "cholesterol_train.csv"),
test=os.path.join(res, "cholesterol_test.csv"),
- target="chol"
+ target="chol",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.regression
@@ -177,7 +214,7 @@ def test_load_regression_task_csv(file_loader):
_assert_X_y_types(ds.train)
_assert_data_consistency(ds)
_assert_data_paths(ds, ds_def)
- _assert_cholesterol_features(ds, ds_def, 'csv')
+ _assert_cholesterol_features(ds, ds_def, "csv")
@pytest.mark.use_disk
@@ -185,7 +222,7 @@ def test_load_regression_task_arff(file_loader):
ds_def = ns(
train=os.path.join(res, "cholesterol_train.arff"),
test=os.path.join(res, "cholesterol_test.arff"),
- target="chol"
+ target="chol",
)
ds = file_loader.load(ds_def)
assert ds.type is DatasetType.regression
@@ -193,7 +230,7 @@ def test_load_regression_task_arff(file_loader):
_assert_X_y_types(ds.train)
_assert_data_consistency(ds)
_assert_data_paths(ds, ds_def)
- _assert_cholesterol_features(ds, ds_def, 'arff')
+ _assert_cholesterol_features(ds, ds_def, "arff")
def _assert_cholesterol_features(dataset, definition, fmt):
@@ -201,29 +238,45 @@ def _assert_cholesterol_features(dataset, definition, fmt):
assert len(dataset.predictors) == 13
_assert_target(dataset.target, name=definition.target)
- ints = [p.name for p in dataset.predictors if p.data_type == 'int']
- floats = [p.name for p in dataset.predictors if p.data_type == 'float']
- categoricals = [p.name for p in dataset.predictors if p.data_type == 'category']
+ ints = [p.name for p in dataset.predictors if p.data_type == "int"]
+ floats = [p.name for p in dataset.predictors if p.data_type == "float"]
+ categoricals = [p.name for p in dataset.predictors if p.data_type == "category"]
- assert len(ints) == (0 if fmt == 'arff' else 6)
- assert len(floats) == (6 if fmt == 'arff' else 7)
- assert len(categoricals) == (7 if fmt == 'arff' else 0)
+ assert len(ints) == (0 if fmt == "arff" else 6)
+ assert len(floats) == (6 if fmt == "arff" else 7)
+ assert len(categoricals) == (7 if fmt == "arff" else 0)
assert not any([p.is_target for p in dataset.predictors])
assert len([p for p in dataset.predictors if p.has_missing_values]) == 2
- assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all()
- assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all()
- assert dataset.train.X.dtypes.filter(items=categoricals).apply(lambda dt: pd.api.types.is_categorical_dtype(dt)).all()
+ assert (
+ dataset.train.X.dtypes.filter(items=ints)
+ .apply(lambda dt: pd.api.types.is_integer_dtype(dt))
+ .all()
+ )
+ assert (
+ dataset.train.X.dtypes.filter(items=floats)
+ .apply(lambda dt: pd.api.types.is_float_dtype(dt))
+ .all()
+ )
+ assert (
+ dataset.train.X.dtypes.filter(items=categoricals)
+ .apply(lambda dt: pd.api.types.is_categorical_dtype(dt))
+ .all()
+ )
assert pd.api.types.is_float_dtype(dataset.train.y.dtypes.iloc[0])
- assert np.array_equal(dataset.train.y_enc, dataset.train.y.squeeze().to_numpy()), "no encoding should have been applied on regression target"
- assert np.array_equal(dataset.test.y_enc, dataset.test.y.squeeze().to_numpy()), "no encoding should have been applied on regression target"
+ assert np.array_equal(
+ dataset.train.y_enc, dataset.train.y.squeeze().to_numpy()
+ ), "no encoding should have been applied on regression target"
+ assert np.array_equal(
+ dataset.test.y_enc, dataset.test.y.squeeze().to_numpy()
+ ), "no encoding should have been applied on regression target"
def _assert_target(target, name, values=None):
assert target.name == name
assert target.values == values
- assert target.data_type == 'category' if values else 'float'
+ assert target.data_type == "category" if values else "float"
assert target.is_target
assert not target.has_missing_values
@@ -233,7 +286,7 @@ def _assert_data_paths(dataset, definition):
assert dataset.test.path == definition.test
sp = split_path(definition.train)
fmt = sp.extension[1:]
- for f in ['arff', 'csv', 'parquet']:
+ for f in ["arff", "csv", "parquet"]:
if f == fmt:
assert dataset.train.data_path(f) == dataset.train.path
else:
@@ -259,15 +312,15 @@ def _assert_data_consistency(dataset, check_encoded=True):
assert not any([p.is_target for p in dataset.predictors])
-
assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes)
assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes)
if check_encoded:
assert dataset.train.X_enc.shape == dataset.train.X.shape
assert np.issubdtype(dataset.train.X_enc.dtype, np.floating)
- assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well…
-
+ assert np.issubdtype(
+ dataset.train.y_enc.dtype, np.floating
+ ) # not ideal given that it's also for classification targets, but well…
@pytest.mark.use_disk
@@ -297,13 +350,17 @@ def test_load_timeseries_task_csv(file_loader):
assert pat.is_float_dtype(ds._dtypes[ds.target.name])
# timeseries uses different task schema - set attributes for test to work
- ds_def['train'] = ds.train.path
- ds_def['test'] = ds.test.path
+ ds_def["train"] = ds.train.path
+ ds_def["test"] = ds.test.path
_assert_data_paths(ds, ds_def)
-@pytest.mark.parametrize("missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"])
-def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loader, missing_key):
+@pytest.mark.parametrize(
+ "missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"]
+)
+def test_when_timeseries_task_key_is_missing_then_exception_is_raised(
+ file_loader, missing_key
+):
task_kwargs = dict(
path=os.path.join(res, "m4_hourly_subset.csv"),
forecast_horizon_in_steps=24,
@@ -314,12 +371,17 @@ def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loade
)
task_kwargs.pop(missing_key)
ds_def = ns.from_dict(task_kwargs)
- with pytest.raises(AssertionError, match=f"Task definition for timeseries must include `{missing_key}`"):
+ with pytest.raises(
+ AssertionError,
+ match=f"Task definition for timeseries must include `{missing_key}`",
+ ):
file_loader.load(ds_def)
@pytest.mark.parametrize("missing_key", ["id_column", "timestamp_column"])
-def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised(file_loader, missing_key):
+def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised(
+ file_loader, missing_key
+):
task_kwargs = dict(
path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"),
forecast_horizon_in_steps=24,
@@ -336,7 +398,9 @@ def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_rai
file_loader.load(ds_def)
-def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(file_loader):
+def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(
+ file_loader,
+):
task_kwargs = dict(
path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"),
forecast_horizon_in_steps=24,
@@ -353,7 +417,9 @@ def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(fil
@pytest.mark.parametrize("forecast_horizon, fold", [(50, 2), (100, 0), (10, 9)])
-def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised(file_loader, forecast_horizon, fold):
+def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised(
+ file_loader, forecast_horizon, fold
+):
ds_def = ns(
path=os.path.join(res, "m4_hourly_subset.csv"),
forecast_horizon_in_steps=forecast_horizon,
@@ -361,5 +427,7 @@ def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_ra
freq="H",
type="timeseries",
)
- with pytest.raises(ValueError, match="All time series in the dataset must have length"):
+ with pytest.raises(
+ ValueError, match="All time series in the dataset must have length"
+ ):
file_loader.load(ds_def, fold=fold)
diff --git a/tests/unit/amlb/datasets/openml/test_openml_dataloader.py b/tests/unit/amlb/datasets/openml/test_openml_dataloader.py
index 3c496d462..c3340f9a9 100644
--- a/tests/unit/amlb/datasets/openml/test_openml_dataloader.py
+++ b/tests/unit/amlb/datasets/openml/test_openml_dataloader.py
@@ -5,7 +5,7 @@
import pandas as pd
import pytest
-from amlb.resources import from_config
+from amlb.resources import from_configs
from amlb.data import DatasetType
from amlb.datasets.openml import OpenmlLoader
from amlb.utils import Namespace as ns
@@ -13,17 +13,13 @@
@pytest.fixture
def oml_config():
- return from_config(
+ return from_configs(
ns(
input_dir="my_input",
output_dir="my_output",
user_dir="my_user_dir",
root_dir="my_root_dir",
-
- openml=ns(
- apikey="c1994bdb7ecb3c6f3c8f3b35f4b47f1f",
- infer_dtypes=False
- )
+ openml=ns(apikey="c1994bdb7ecb3c6f3c8f3b35f4b47f1f", infer_dtypes=False),
)
).config
@@ -51,13 +47,15 @@ def _assert_kc2_features(dataset):
_assert_target(dataset.target, "problems", ["no", "yes"])
- assert all([p.data_type == 'number' for p in dataset.predictors])
+ assert all([p.data_type == "number" for p in dataset.predictors])
assert all([p.values is None for p in dataset.predictors])
assert not any([p.has_missing_values for p in dataset.predictors])
- assert dataset.train.X.dtypes.apply(lambda dt: pd.api.types.is_numeric_dtype(dt)).all()
- assert len(dataset.train.X.select_dtypes(include=['float']).columns) == 18
- assert len(dataset.train.X.select_dtypes(include=['uint8']).columns) == 3
+ assert dataset.train.X.dtypes.apply(
+ lambda dt: pd.api.types.is_numeric_dtype(dt)
+ ).all()
+ assert len(dataset.train.X.select_dtypes(include=["float"]).columns) == 18
+ assert len(dataset.train.X.select_dtypes(include=["uint8"]).columns) == 3
assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0])
@@ -77,13 +75,17 @@ def _assert_iris_features(dataset):
assert len(dataset.features) == 5
assert len(dataset.predictors) == 4
- _assert_target(dataset.target, "class", ["iris-setosa", "iris-versicolor", "iris-virginica"])
+ _assert_target(
+ dataset.target, "class", ["iris-setosa", "iris-versicolor", "iris-virginica"]
+ )
- assert all([p.data_type == 'number' for p in dataset.predictors])
+ assert all([p.data_type == "number" for p in dataset.predictors])
assert all([p.values is None for p in dataset.predictors])
assert not any([p.has_missing_values for p in dataset.predictors])
- assert dataset.train.X.dtypes.apply(lambda dt: pd.api.types.is_float_dtype(dt)).all()
+ assert dataset.train.X.dtypes.apply(
+ lambda dt: pd.api.types.is_float_dtype(dt)
+ ).all()
assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0])
@@ -105,33 +107,53 @@ def _assert_cholesterol_features(dataset):
_assert_target(dataset.target, "chol")
- numericals = [p.name for p in dataset.predictors if p.data_type == 'number']
- categoricals = [p.name for p in dataset.predictors if p.data_type == 'category']
+ numericals = [p.name for p in dataset.predictors if p.data_type == "number"]
+ categoricals = [p.name for p in dataset.predictors if p.data_type == "category"]
assert len(numericals) == 6
assert len(categoricals) == 7
assert len([p for p in dataset.predictors if p.has_missing_values]) == 2
- assert dataset.train.X.dtypes.filter(items=numericals).apply(lambda dt: pd.api.types.is_numeric_dtype(dt)).all()
- assert dataset.train.X.dtypes.filter(items=categoricals).apply(lambda dt: pd.api.types.is_categorical_dtype(dt)).all()
- assert len(dataset.train.X.select_dtypes(include=['float']).columns) == 6
+ assert (
+ dataset.train.X.dtypes.filter(items=numericals)
+ .apply(lambda dt: pd.api.types.is_numeric_dtype(dt))
+ .all()
+ )
+ assert (
+ dataset.train.X.dtypes.filter(items=categoricals)
+ .apply(lambda dt: pd.api.types.is_categorical_dtype(dt))
+ .all()
+ )
+ assert len(dataset.train.X.select_dtypes(include=["float"]).columns) == 6
assert pd.api.types.is_float_dtype(dataset.train.y.dtypes.iloc[0])
def _assert_target(target, name, values=None):
assert target.name == name
assert target.values == values
- assert target.data_type == 'category' if values else 'number'
+ assert target.data_type == "category" if values else "number"
assert target.is_target
assert not target.has_missing_values
def _assert_data_paths(dataset, ds_id, fold):
- assert dataset.train.path.endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.arff"))
- assert dataset.test.path.endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.arff"))
- assert dataset.train.data_path('csv').endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.csv"))
- assert dataset.test.data_path('csv').endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.csv"))
- assert dataset.train.data_path('parquet').endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.parquet"))
- assert dataset.test.data_path('parquet').endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.parquet"))
+ assert dataset.train.path.endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.arff")
+ )
+ assert dataset.test.path.endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.arff")
+ )
+ assert dataset.train.data_path("csv").endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.csv")
+ )
+ assert dataset.test.data_path("csv").endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.csv")
+ )
+ assert dataset.train.data_path("parquet").endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.parquet")
+ )
+ assert dataset.test.data_path("parquet").endswith(
+ os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.parquet")
+ )
def _assert_X_y_types(data_split):
@@ -155,6 +177,7 @@ def _assert_data_consistency(dataset):
assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes)
assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes)
- assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) # all categorical features are directly encoded as float
+ assert np.issubdtype(
+ dataset.train.X_enc.dtype, np.floating
+ ) # all categorical features are directly encoded as float
assert np.issubdtype(dataset.train.y_enc.dtype, np.floating)
-