From a5d192c77daedbb96648fdcf31f0bf3239ae134d Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 27 Nov 2024 18:25:43 +0200 Subject: [PATCH] Remove from_config and only keep from_configs --- amlb/resources.py | 226 ++++++++++++------ runscores.py | 34 --- .../datasets/file/test_file_dataloader.py | 178 +++++++++----- .../datasets/openml/test_openml_dataloader.py | 79 +++--- 4 files changed, 321 insertions(+), 196 deletions(-) delete mode 100644 runscores.py diff --git a/amlb/resources.py b/amlb/resources.py index 617a4d609..0e37b240d 100644 --- a/amlb/resources.py +++ b/amlb/resources.py @@ -2,6 +2,7 @@ **resources** modules exposes a singleton ``Resources`` instance providing easy access to app configuration properties, as well as handy methods to access other resources like *automl frameworks* and *benchmark definitions* """ + from __future__ import annotations import copy @@ -13,7 +14,15 @@ from amlb.benchmarks.parser import benchmark_load from amlb.frameworks import default_tag, load_framework_definitions -from .utils import Namespace, lazy_property, memoize, normalize_path, run_cmd, str_sanitize, touch +from .utils import ( + Namespace, + lazy_property, + memoize, + normalize_path, + run_cmd, + str_sanitize, + touch, +) from .utils.config import TransformRule, config_load, transform_config from .__version__ import __version__, _dev_version as dev @@ -22,7 +31,6 @@ class Resources: - @staticmethod def _normalize(config: Namespace, replace=None): def nz_path(path): @@ -34,8 +42,10 @@ def nz_path(path): for k, v in config: if isinstance(v, Namespace): normalized[k] = Resources._normalize(v, replace=replace) - elif re.search(r'_(dir|file|cmd)s?$', k): - normalized[k] = [nz_path(p) for p in v] if isinstance(v, list) else nz_path(v) + elif re.search(r"_(dir|file|cmd)s?$", k): + normalized[k] = ( + [nz_path(p) for p in v] if isinstance(v, list) else nz_path(v) + ) return normalized def __init__(self, config: Namespace): @@ -51,20 +61,16 @@ def __init__(self, config: Namespace): log.debug("Using config:\n%s", self.config) # allowing to load custom modules from user directory - sys.path.append(common_dirs['user']) + sys.path.append(common_dirs["user"]) log.debug("Extended Python sys.path to user directory: %s.", sys.path) @lazy_property def project_info(self): - split_url = self.config.project_repository.split('#', 1) + split_url = self.config.project_repository.split("#", 1) repo = split_url[0] tag = None if len(split_url) == 1 else split_url[1] - branch = tag or 'master' - return Namespace( - repo=repo, - tag=tag, - branch=branch - ) + branch = tag or "master" + return Namespace(repo=repo, tag=tag, branch=branch) @lazy_property def git_info(self): @@ -88,11 +94,7 @@ def git(cmd, defval=None): repo = branch = commit = na tags = status = [] return Namespace( - repo=repo, - branch=branch, - commit=commit, - tags=tags, - status=status + repo=repo, branch=branch, commit=commit, tags=tags, status=status ) @lazy_property @@ -109,17 +111,19 @@ def app_version(self): return "{v} [{details}]".format(v=v, details=", ".join(tokens)) def seed(self, fold=None): - if isinstance(fold, int) and str(self.config.seed).lower() in ['auto']: - return fold+self._seed + if isinstance(fold, int) and str(self.config.seed).lower() in ["auto"]: + return fold + self._seed else: return self._seed @lazy_property def _seed(self): - if str(self.config.seed).lower() in ['none', '']: + if str(self.config.seed).lower() in ["none", ""]: return None - elif str(self.config.seed).lower() in ['auto']: - return random.randint(1, (1 
<< 31) - 1) # limiting seed to signed int32 for R frameworks + elif str(self.config.seed).lower() in ["auto"]: + return random.randint( + 1, (1 << 31) - 1 + ) # limiting seed to signed int32 for R frameworks else: return self.config.seed @@ -132,20 +136,34 @@ def framework_definition(self, name, tag=None): if tag is None: tag = default_tag if tag not in self._frameworks: - raise ValueError("Incorrect tag `{}`: only those among {} are allowed.".format(tag, self.config.frameworks.tags)) + raise ValueError( + "Incorrect tag `{}`: only those among {} are allowed.".format( + tag, self.config.frameworks.tags + ) + ) frameworks = self._frameworks[tag] log.debug("Available framework definitions:\n%s", frameworks) framework = next((f for n, f in frameworks if n.lower() == lname), None) # TODO: Clean up this workflow and error messaging as part of #518 - base_framework = next((f for n, f in self._frameworks[default_tag] if n.lower() == lname), None) - if framework and framework['removed']: - raise ValueError(f"Framework definition `{name}` has been removed from the benchmark: {framework['removed']}") - if not framework and (base_framework and base_framework['removed']): - raise ValueError(f"Framework definition `{name}` has been removed from the benchmark: {base_framework['removed']}") + base_framework = next( + (f for n, f in self._frameworks[default_tag] if n.lower() == lname), None + ) + if framework and framework["removed"]: + raise ValueError( + f"Framework definition `{name}` has been removed from the benchmark: {framework['removed']}" + ) + if not framework and (base_framework and base_framework["removed"]): + raise ValueError( + f"Framework definition `{name}` has been removed from the benchmark: {base_framework['removed']}" + ) if not framework: - raise ValueError(f"Incorrect framework `{name}`: not listed in {self.config.frameworks.definition_file}.") - if framework['abstract']: - raise ValueError(f"Framework definition `{name}` is abstract and cannot be run directly.") + raise ValueError( + f"Incorrect framework `{name}`: not listed in {self.config.frameworks.definition_file}." + ) + if framework["abstract"]: + raise ValueError( + f"Framework definition `{name}` is abstract and cannot be run directly." 
+ ) return framework, framework.name @lazy_property @@ -161,7 +179,11 @@ def constraint_definition(self, name): """ constraint = self._constraints[name.lower()] if not constraint: - raise ValueError("Incorrect constraint definition `{}`: not listed in {}.".format(name, self.config.benchmarks.constraints_file)) + raise ValueError( + "Incorrect constraint definition `{}`: not listed in {}.".format( + name, self.config.benchmarks.constraints_file + ) + ) return constraint, constraint.name @lazy_property @@ -191,11 +213,15 @@ def benchmark_definition(self, name, defaults=None): :param defaults: defaults used as a base config for each task in the benchmark definition :return: """ - hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load(name, self.config.benchmarks.definition_dir) + hard_defaults, tasks, benchmark_path, benchmark_name = benchmark_load( + name, self.config.benchmarks.definition_dir + ) - defaults = Namespace.merge(defaults, hard_defaults, Namespace(name='__defaults__')) + defaults = Namespace.merge( + defaults, hard_defaults, Namespace(name="__defaults__") + ) for task in tasks: - task |= defaults # add missing keys from hard defaults + defaults + task |= defaults # add missing keys from hard defaults + defaults self._validate_task(task) self._validate_task(defaults, lenient=True) @@ -206,66 +232,98 @@ def benchmark_definition(self, name, defaults=None): def _validate_task(self, task, lenient=False): missing = [] - for conf in ['name']: + for conf in ["name"]: if task[conf] is None: missing.append(conf) if not lenient and len(missing) > 0: - raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task)) - - for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']: + raise ValueError( + "{missing} mandatory properties as missing in task definition {taskdef}.".format( + missing=missing, taskdef=task + ) + ) + + for conf in [ + "max_runtime_seconds", + "cores", + "folds", + "max_mem_size_mb", + "min_vol_size_mb", + "quantile_levels", + ]: if task[conf] is None: task[conf] = self.config.benchmarks.defaults[conf] - log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf])) + log.debug( + "Config `{config}` not set for task {name}, using default `{value}`.".format( + config=conf, name=task.name, value=task[conf] + ) + ) - conf = 'id' + conf = "id" if task[conf] is None: - task[conf] = ("openml.org/t/{}".format(task.openml_task_id) if task['openml_task_id'] is not None - else "openml.org/d/{}".format(task.openml_dataset_id) if task['openml_dataset_id'] is not None - else ((task.dataset['id'] if isinstance(task.dataset, (dict, Namespace)) - else task.dataset if isinstance(task.dataset, str) - else None) or task.name) if task['dataset'] is not None - else None) + task[conf] = ( + "openml.org/t/{}".format(task.openml_task_id) + if task["openml_task_id"] is not None + else "openml.org/d/{}".format(task.openml_dataset_id) + if task["openml_dataset_id"] is not None + else ( + ( + task.dataset["id"] + if isinstance(task.dataset, (dict, Namespace)) + else task.dataset + if isinstance(task.dataset, str) + else None + ) + or task.name + ) + if task["dataset"] is not None + else None + ) if not lenient and task[conf] is None: - raise ValueError("task definition must contain an ID or one property " - "among ['openml_task_id', 'dataset'] to create an ID, " - "but task definition is 
{task}".format(task=str(task))) + raise ValueError( + "task definition must contain an ID or one property " + "among ['openml_task_id', 'dataset'] to create an ID, " + "but task definition is {task}".format(task=str(task)) + ) - conf = 'metric' + conf = "metric" if task[conf] is None: task[conf] = None - conf = 'ec2_instance_type' + conf = "ec2_instance_type" if task[conf] is None: i_series = self.config.aws.ec2.instance_type.series i_map = self.config.aws.ec2.instance_type.map if str(task.cores) in i_map: i_size = i_map[str(task.cores)] elif task.cores > 0: - supported_cores = list(map(int, Namespace.dict(i_map).keys() - {'default'})) + supported_cores = list( + map(int, Namespace.dict(i_map).keys() - {"default"}) + ) supported_cores.sort() - cores = next((c for c in supported_cores if c >= task.cores), 'default') + cores = next((c for c in supported_cores if c >= task.cores), "default") i_size = i_map[str(cores)] else: i_size = i_map.default - task[conf] = '.'.join([i_series, i_size]) - log.debug("Config `{config}` not set for task {name}, using default selection `{value}`.".format(config=conf, name=task.name, value=task[conf])) - - conf = 'ec2_volume_type' + task[conf] = ".".join([i_series, i_size]) + log.debug( + "Config `{config}` not set for task {name}, using default selection `{value}`.".format( + config=conf, name=task.name, value=task[conf] + ) + ) + + conf = "ec2_volume_type" if task[conf] is None: task[conf] = self.config.aws.ec2.volume_type - log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf])) + log.debug( + "Config `{config}` not set for task {name}, using default `{value}`.".format( + config=conf, name=task.name, value=task[conf] + ) + ) __INSTANCE__: Resources | None = None -def from_config(config: Namespace): - global __INSTANCE__ - transform_config(config, _backward_compatibility_config_rules_) - __INSTANCE__ = Resources(config) - return __INSTANCE__ - - def from_configs(*configs: Namespace): global __INSTANCE__ for c in configs: @@ -286,18 +344,17 @@ def config(): def output_dirs(root, session=None, subdirs=None, create=False): - root = root if root is not None else '.' + root = root if root is not None else "." 
if create and not os.path.exists(root): touch(root, as_dir=True) dirs = Namespace( - root=root, - session=os.path.join(root, session) if session is not None else root + root=root, session=os.path.join(root, session) if session is not None else root ) - subdirs = ([] if subdirs is None - else [subdirs] if isinstance(subdirs, str) - else subdirs) + subdirs = ( + [] if subdirs is None else [subdirs] if isinstance(subdirs, str) else subdirs + ) for d in subdirs: dirs[d] = os.path.join(dirs.session, d) @@ -307,11 +364,22 @@ def output_dirs(root, session=None, subdirs=None, create=False): _backward_compatibility_config_rules_ = [ - TransformRule(from_key='exit_on_error', to_key='job_scheduler.exit_on_job_failure'), - TransformRule(from_key='parallel_jobs', to_key='job_scheduler.parallel_jobs'), - TransformRule(from_key='max_parallel_jobs', to_key='job_scheduler.max_parallel_jobs'), - TransformRule(from_key='delay_between_jobs', to_key='job_scheduler.delay_between_jobs'), - TransformRule(from_key='monitoring.frequency_seconds', to_key='monitoring.interval_seconds'), - TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'), - TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'), + TransformRule(from_key="exit_on_error", to_key="job_scheduler.exit_on_job_failure"), + TransformRule(from_key="parallel_jobs", to_key="job_scheduler.parallel_jobs"), + TransformRule( + from_key="max_parallel_jobs", to_key="job_scheduler.max_parallel_jobs" + ), + TransformRule( + from_key="delay_between_jobs", to_key="job_scheduler.delay_between_jobs" + ), + TransformRule( + from_key="monitoring.frequency_seconds", to_key="monitoring.interval_seconds" + ), + TransformRule( + from_key="aws.query_frequency_seconds", to_key="aws.query_interval_seconds" + ), + TransformRule( + from_key="aws.ec2.monitoring.cpu.query_frequency_seconds", + to_key="aws.ec2.monitoring.cpu.query_interval_seconds", + ), ] diff --git a/runscores.py b/runscores.py deleted file mode 100644 index 44c868c72..000000000 --- a/runscores.py +++ /dev/null @@ -1,34 +0,0 @@ -import argparse -import os - -# prevent asap other modules from defining the root logger using basicConfig -import amlb.logger - -from ruamel import yaml - -import amlb -from amlb import log -from amlb.utils import config_load - -root_dir = os.path.dirname(__file__) - -parser = argparse.ArgumentParser() -parser.add_argument('predictions', type=str, - help='The predictions file to load and compute the scores for.') -args = parser.parse_args() - -# script_name = os.path.splitext(os.path.basename(__file__))[0] -# log_dir = os.path.join(args.outdir if args.outdir else '.', 'logs') -# os.makedirs(log_dir, exist_ok=True) -# now_str = datetime_iso(date_sep='', time_sep='') -amlb.logger.setup(root_level='DEBUG', console_level='INFO') - -config = config_load("resources/config.yaml") -config.run_mode = 'script' -config.root_dir = root_dir -config.script = os.path.basename(__file__) -amlb.resources.from_config(config) - -scores = amlb.TaskResult.score_from_predictions_file(args.predictions) -log.info("\n\nScores computed from %s:\n%s", args.predictions, yaml.dump(dict(scores), default_flow_style=False)) - diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py index b46379724..ccce03e0b 100644 --- a/tests/unit/amlb/datasets/file/test_file_dataloader.py +++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py @@ -6,18 +6,18 
@@ import pytest import pandas.api.types as pat -from amlb.resources import from_config +from amlb.resources import from_configs from amlb.data import DatasetType from amlb.datasets.file import FileLoader from amlb.utils import Namespace as ns, path_from_split, split_path here = os.path.realpath(os.path.dirname(__file__)) -res = os.path.join(here, 'resources') +res = os.path.join(here, "resources") @pytest.fixture(autouse=True) def file_config(): - return from_config( + return from_configs( ns( input_dir="my_input", output_dir="my_output", @@ -37,7 +37,7 @@ def test_load_binary_task_csv(file_loader): ds_def = ns( train=os.path.join(res, "kc2_train.csv"), test=os.path.join(res, "kc2_test.csv"), - target="problems" + target="problems", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.binary @@ -47,13 +47,12 @@ def test_load_binary_task_csv(file_loader): _assert_kc2_features(ds, ds_def) - @pytest.mark.use_disk def test_load_binary_task_arff(file_loader): ds_def = ns( train=os.path.join(res, "kc2_train.arff"), test=os.path.join(res, "kc2_test.arff"), - target="problems" + target="problems", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.binary @@ -69,20 +68,36 @@ def _assert_kc2_features(dataset, definition): target_values = ["no", "yes"] _assert_target(dataset.target, name=definition.target, values=target_values) - assert all([p.data_type in ['int', 'float'] for p in dataset.predictors]) + assert all([p.data_type in ["int", "float"] for p in dataset.predictors]) assert all([p.values is None for p in dataset.predictors]) assert not any([p.is_target for p in dataset.predictors]) assert not any([p.has_missing_values for p in dataset.predictors]) - floats = [p.name for p in dataset.predictors if p.data_type == 'float'] - ints = [p.name for p in dataset.predictors if p.data_type == 'int'] - assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all() - assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all() + floats = [p.name for p in dataset.predictors if p.data_type == "float"] + ints = [p.name for p in dataset.predictors if p.data_type == "int"] + assert ( + dataset.train.X.dtypes.filter(items=floats) + .apply(lambda dt: pd.api.types.is_float_dtype(dt)) + .all() + ) + assert ( + dataset.train.X.dtypes.filter(items=ints) + .apply(lambda dt: pd.api.types.is_integer_dtype(dt)) + .all() + ) assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0]) normalize = dataset.target.normalize - assert list(normalize(dataset.train.y.squeeze().unique())) == list(normalize(dataset.test.y.squeeze().unique())) == target_values - assert list(np.unique(dataset.train.y_enc)) == list(np.unique(dataset.test.y_enc)) == [0, 1] + assert ( + list(normalize(dataset.train.y.squeeze().unique())) + == list(normalize(dataset.test.y.squeeze().unique())) + == target_values + ) + assert ( + list(np.unique(dataset.train.y_enc)) + == list(np.unique(dataset.test.y_enc)) + == [0, 1] + ) @pytest.mark.use_disk @@ -90,7 +105,7 @@ def test_load_multiclass_task_csv(file_loader): ds_def = ns( train=os.path.join(res, "iris_train.csv"), test=os.path.join(res, "iris_test.csv"), - target="class" + target="class", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.multiclass @@ -105,10 +120,12 @@ def test_load_multiclass_task_with_num_target_no_type_csv(file_loader): ds_def = ns( train=os.path.join(res, "iris_num_train.csv"), test=os.path.join(res, "iris_num_test.csv"), - target="class" + 
target="class", ) ds = file_loader.load(ds_def) - assert ds.type is DatasetType.regression, "file loader should detect num target as regression by default" + assert ( + ds.type is DatasetType.regression + ), "file loader should detect num target as regression by default" @pytest.mark.use_disk @@ -117,7 +134,7 @@ def test_load_multiclass_task_with_num_target_csv(file_loader): train=os.path.join(res, "iris_num_train.csv"), test=os.path.join(res, "iris_num_test.csv"), target="class", - type="multiclass" + type="multiclass", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.multiclass @@ -132,7 +149,7 @@ def test_load_multiclass_task_arff(file_loader): ds_def = ns( train=os.path.join(res, "iris_train.arff"), test=os.path.join(res, "iris_test.arff"), - target="class" + target="class", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.multiclass @@ -145,23 +162,43 @@ def test_load_multiclass_task_arff(file_loader): def _assert_iris_features(dataset, definition, num_target=False): assert len(dataset.features) == 5 assert len(dataset.predictors) == 4 - target_values = ["1", "2", "3"] if num_target else ["iris-setosa", "iris-versicolor", "iris-virginica"] # values are normalized + target_values = ( + ["1", "2", "3"] + if num_target + else ["iris-setosa", "iris-versicolor", "iris-virginica"] + ) # values are normalized _assert_target(dataset.target, name=definition.target, values=target_values) - assert all([p.data_type in ['int', 'float'] for p in dataset.predictors]) + assert all([p.data_type in ["int", "float"] for p in dataset.predictors]) assert all([p.values is None for p in dataset.predictors]) assert not any([p.is_target for p in dataset.predictors]) assert not any([p.has_missing_values for p in dataset.predictors]) - floats = [p.name for p in dataset.predictors if p.data_type == 'float'] - ints = [p.name for p in dataset.predictors if p.data_type == 'int'] - assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all() - assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all() + floats = [p.name for p in dataset.predictors if p.data_type == "float"] + ints = [p.name for p in dataset.predictors if p.data_type == "int"] + assert ( + dataset.train.X.dtypes.filter(items=floats) + .apply(lambda dt: pd.api.types.is_float_dtype(dt)) + .all() + ) + assert ( + dataset.train.X.dtypes.filter(items=ints) + .apply(lambda dt: pd.api.types.is_integer_dtype(dt)) + .all() + ) assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0]) normalize = dataset.target.normalize - assert list(normalize(dataset.train.y.squeeze().unique())) == list(normalize(dataset.test.y.squeeze().unique())) == target_values - assert list(np.unique(dataset.train.y_enc)) == list(np.unique(dataset.test.y_enc)) == [0, 1, 2] + assert ( + list(normalize(dataset.train.y.squeeze().unique())) + == list(normalize(dataset.test.y.squeeze().unique())) + == target_values + ) + assert ( + list(np.unique(dataset.train.y_enc)) + == list(np.unique(dataset.test.y_enc)) + == [0, 1, 2] + ) @pytest.mark.use_disk @@ -169,7 +206,7 @@ def test_load_regression_task_csv(file_loader): ds_def = ns( train=os.path.join(res, "cholesterol_train.csv"), test=os.path.join(res, "cholesterol_test.csv"), - target="chol" + target="chol", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.regression @@ -177,7 +214,7 @@ def test_load_regression_task_csv(file_loader): _assert_X_y_types(ds.train) _assert_data_consistency(ds) 
_assert_data_paths(ds, ds_def) - _assert_cholesterol_features(ds, ds_def, 'csv') + _assert_cholesterol_features(ds, ds_def, "csv") @pytest.mark.use_disk @@ -185,7 +222,7 @@ def test_load_regression_task_arff(file_loader): ds_def = ns( train=os.path.join(res, "cholesterol_train.arff"), test=os.path.join(res, "cholesterol_test.arff"), - target="chol" + target="chol", ) ds = file_loader.load(ds_def) assert ds.type is DatasetType.regression @@ -193,7 +230,7 @@ def test_load_regression_task_arff(file_loader): _assert_X_y_types(ds.train) _assert_data_consistency(ds) _assert_data_paths(ds, ds_def) - _assert_cholesterol_features(ds, ds_def, 'arff') + _assert_cholesterol_features(ds, ds_def, "arff") def _assert_cholesterol_features(dataset, definition, fmt): @@ -201,29 +238,45 @@ def _assert_cholesterol_features(dataset, definition, fmt): assert len(dataset.predictors) == 13 _assert_target(dataset.target, name=definition.target) - ints = [p.name for p in dataset.predictors if p.data_type == 'int'] - floats = [p.name for p in dataset.predictors if p.data_type == 'float'] - categoricals = [p.name for p in dataset.predictors if p.data_type == 'category'] + ints = [p.name for p in dataset.predictors if p.data_type == "int"] + floats = [p.name for p in dataset.predictors if p.data_type == "float"] + categoricals = [p.name for p in dataset.predictors if p.data_type == "category"] - assert len(ints) == (0 if fmt == 'arff' else 6) - assert len(floats) == (6 if fmt == 'arff' else 7) - assert len(categoricals) == (7 if fmt == 'arff' else 0) + assert len(ints) == (0 if fmt == "arff" else 6) + assert len(floats) == (6 if fmt == "arff" else 7) + assert len(categoricals) == (7 if fmt == "arff" else 0) assert not any([p.is_target for p in dataset.predictors]) assert len([p for p in dataset.predictors if p.has_missing_values]) == 2 - assert dataset.train.X.dtypes.filter(items=ints).apply(lambda dt: pd.api.types.is_integer_dtype(dt)).all() - assert dataset.train.X.dtypes.filter(items=floats).apply(lambda dt: pd.api.types.is_float_dtype(dt)).all() - assert dataset.train.X.dtypes.filter(items=categoricals).apply(lambda dt: pd.api.types.is_categorical_dtype(dt)).all() + assert ( + dataset.train.X.dtypes.filter(items=ints) + .apply(lambda dt: pd.api.types.is_integer_dtype(dt)) + .all() + ) + assert ( + dataset.train.X.dtypes.filter(items=floats) + .apply(lambda dt: pd.api.types.is_float_dtype(dt)) + .all() + ) + assert ( + dataset.train.X.dtypes.filter(items=categoricals) + .apply(lambda dt: pd.api.types.is_categorical_dtype(dt)) + .all() + ) assert pd.api.types.is_float_dtype(dataset.train.y.dtypes.iloc[0]) - assert np.array_equal(dataset.train.y_enc, dataset.train.y.squeeze().to_numpy()), "no encoding should have been applied on regression target" - assert np.array_equal(dataset.test.y_enc, dataset.test.y.squeeze().to_numpy()), "no encoding should have been applied on regression target" + assert np.array_equal( + dataset.train.y_enc, dataset.train.y.squeeze().to_numpy() + ), "no encoding should have been applied on regression target" + assert np.array_equal( + dataset.test.y_enc, dataset.test.y.squeeze().to_numpy() + ), "no encoding should have been applied on regression target" def _assert_target(target, name, values=None): assert target.name == name assert target.values == values - assert target.data_type == 'category' if values else 'float' + assert target.data_type == "category" if values else "float" assert target.is_target assert not target.has_missing_values @@ -233,7 +286,7 @@ def _assert_data_paths(dataset, 
definition): assert dataset.test.path == definition.test sp = split_path(definition.train) fmt = sp.extension[1:] - for f in ['arff', 'csv', 'parquet']: + for f in ["arff", "csv", "parquet"]: if f == fmt: assert dataset.train.data_path(f) == dataset.train.path else: @@ -259,15 +312,15 @@ def _assert_data_consistency(dataset, check_encoded=True): assert not any([p.is_target for p in dataset.predictors]) - assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes) assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes) if check_encoded: assert dataset.train.X_enc.shape == dataset.train.X.shape assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) - assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well… - + assert np.issubdtype( + dataset.train.y_enc.dtype, np.floating + ) # not ideal given that it's also for classification targets, but well… @pytest.mark.use_disk @@ -297,13 +350,17 @@ def test_load_timeseries_task_csv(file_loader): assert pat.is_float_dtype(ds._dtypes[ds.target.name]) # timeseries uses different task schema - set attributes for test to work - ds_def['train'] = ds.train.path - ds_def['test'] = ds.test.path + ds_def["train"] = ds.train.path + ds_def["test"] = ds.test.path _assert_data_paths(ds, ds_def) -@pytest.mark.parametrize("missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"]) -def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loader, missing_key): +@pytest.mark.parametrize( + "missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"] +) +def test_when_timeseries_task_key_is_missing_then_exception_is_raised( + file_loader, missing_key +): task_kwargs = dict( path=os.path.join(res, "m4_hourly_subset.csv"), forecast_horizon_in_steps=24, @@ -314,12 +371,17 @@ def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loade ) task_kwargs.pop(missing_key) ds_def = ns.from_dict(task_kwargs) - with pytest.raises(AssertionError, match=f"Task definition for timeseries must include `{missing_key}`"): + with pytest.raises( + AssertionError, + match=f"Task definition for timeseries must include `{missing_key}`", + ): file_loader.load(ds_def) @pytest.mark.parametrize("missing_key", ["id_column", "timestamp_column"]) -def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised(file_loader, missing_key): +def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised( + file_loader, missing_key +): task_kwargs = dict( path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), forecast_horizon_in_steps=24, @@ -336,7 +398,9 @@ def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_rai file_loader.load(ds_def) -def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(file_loader): +def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded( + file_loader, +): task_kwargs = dict( path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), forecast_horizon_in_steps=24, @@ -353,7 +417,9 @@ def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(fil @pytest.mark.parametrize("forecast_horizon, fold", [(50, 2), (100, 0), (10, 9)]) -def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised(file_loader, forecast_horizon, fold): +def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised( + file_loader, forecast_horizon, fold +): ds_def = ns( 
path=os.path.join(res, "m4_hourly_subset.csv"), forecast_horizon_in_steps=forecast_horizon, @@ -361,5 +427,7 @@ def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_ra freq="H", type="timeseries", ) - with pytest.raises(ValueError, match="All time series in the dataset must have length"): + with pytest.raises( + ValueError, match="All time series in the dataset must have length" + ): file_loader.load(ds_def, fold=fold) diff --git a/tests/unit/amlb/datasets/openml/test_openml_dataloader.py b/tests/unit/amlb/datasets/openml/test_openml_dataloader.py index 3c496d462..c3340f9a9 100644 --- a/tests/unit/amlb/datasets/openml/test_openml_dataloader.py +++ b/tests/unit/amlb/datasets/openml/test_openml_dataloader.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from amlb.resources import from_config +from amlb.resources import from_configs from amlb.data import DatasetType from amlb.datasets.openml import OpenmlLoader from amlb.utils import Namespace as ns @@ -13,17 +13,13 @@ @pytest.fixture def oml_config(): - return from_config( + return from_configs( ns( input_dir="my_input", output_dir="my_output", user_dir="my_user_dir", root_dir="my_root_dir", - - openml=ns( - apikey="c1994bdb7ecb3c6f3c8f3b35f4b47f1f", - infer_dtypes=False - ) + openml=ns(apikey="c1994bdb7ecb3c6f3c8f3b35f4b47f1f", infer_dtypes=False), ) ).config @@ -51,13 +47,15 @@ def _assert_kc2_features(dataset): _assert_target(dataset.target, "problems", ["no", "yes"]) - assert all([p.data_type == 'number' for p in dataset.predictors]) + assert all([p.data_type == "number" for p in dataset.predictors]) assert all([p.values is None for p in dataset.predictors]) assert not any([p.has_missing_values for p in dataset.predictors]) - assert dataset.train.X.dtypes.apply(lambda dt: pd.api.types.is_numeric_dtype(dt)).all() - assert len(dataset.train.X.select_dtypes(include=['float']).columns) == 18 - assert len(dataset.train.X.select_dtypes(include=['uint8']).columns) == 3 + assert dataset.train.X.dtypes.apply( + lambda dt: pd.api.types.is_numeric_dtype(dt) + ).all() + assert len(dataset.train.X.select_dtypes(include=["float"]).columns) == 18 + assert len(dataset.train.X.select_dtypes(include=["uint8"]).columns) == 3 assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0]) @@ -77,13 +75,17 @@ def _assert_iris_features(dataset): assert len(dataset.features) == 5 assert len(dataset.predictors) == 4 - _assert_target(dataset.target, "class", ["iris-setosa", "iris-versicolor", "iris-virginica"]) + _assert_target( + dataset.target, "class", ["iris-setosa", "iris-versicolor", "iris-virginica"] + ) - assert all([p.data_type == 'number' for p in dataset.predictors]) + assert all([p.data_type == "number" for p in dataset.predictors]) assert all([p.values is None for p in dataset.predictors]) assert not any([p.has_missing_values for p in dataset.predictors]) - assert dataset.train.X.dtypes.apply(lambda dt: pd.api.types.is_float_dtype(dt)).all() + assert dataset.train.X.dtypes.apply( + lambda dt: pd.api.types.is_float_dtype(dt) + ).all() assert pd.api.types.is_categorical_dtype(dataset.train.y.dtypes.iloc[0]) @@ -105,33 +107,53 @@ def _assert_cholesterol_features(dataset): _assert_target(dataset.target, "chol") - numericals = [p.name for p in dataset.predictors if p.data_type == 'number'] - categoricals = [p.name for p in dataset.predictors if p.data_type == 'category'] + numericals = [p.name for p in dataset.predictors if p.data_type == "number"] + categoricals = [p.name for p in dataset.predictors if p.data_type 
== "category"] assert len(numericals) == 6 assert len(categoricals) == 7 assert len([p for p in dataset.predictors if p.has_missing_values]) == 2 - assert dataset.train.X.dtypes.filter(items=numericals).apply(lambda dt: pd.api.types.is_numeric_dtype(dt)).all() - assert dataset.train.X.dtypes.filter(items=categoricals).apply(lambda dt: pd.api.types.is_categorical_dtype(dt)).all() - assert len(dataset.train.X.select_dtypes(include=['float']).columns) == 6 + assert ( + dataset.train.X.dtypes.filter(items=numericals) + .apply(lambda dt: pd.api.types.is_numeric_dtype(dt)) + .all() + ) + assert ( + dataset.train.X.dtypes.filter(items=categoricals) + .apply(lambda dt: pd.api.types.is_categorical_dtype(dt)) + .all() + ) + assert len(dataset.train.X.select_dtypes(include=["float"]).columns) == 6 assert pd.api.types.is_float_dtype(dataset.train.y.dtypes.iloc[0]) def _assert_target(target, name, values=None): assert target.name == name assert target.values == values - assert target.data_type == 'category' if values else 'number' + assert target.data_type == "category" if values else "number" assert target.is_target assert not target.has_missing_values def _assert_data_paths(dataset, ds_id, fold): - assert dataset.train.path.endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.arff")) - assert dataset.test.path.endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.arff")) - assert dataset.train.data_path('csv').endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.csv")) - assert dataset.test.data_path('csv').endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.csv")) - assert dataset.train.data_path('parquet').endswith(os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.parquet")) - assert dataset.test.data_path('parquet').endswith(os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.parquet")) + assert dataset.train.path.endswith( + os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.arff") + ) + assert dataset.test.path.endswith( + os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.arff") + ) + assert dataset.train.data_path("csv").endswith( + os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.csv") + ) + assert dataset.test.data_path("csv").endswith( + os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.csv") + ) + assert dataset.train.data_path("parquet").endswith( + os.path.join("datasets", str(ds_id), f"dataset_train_{fold}.parquet") + ) + assert dataset.test.data_path("parquet").endswith( + os.path.join("datasets", str(ds_id), f"dataset_test_{fold}.parquet") + ) def _assert_X_y_types(data_split): @@ -155,6 +177,7 @@ def _assert_data_consistency(dataset): assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes) assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes) - assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) # all categorical features are directly encoded as float + assert np.issubdtype( + dataset.train.X_enc.dtype, np.floating + ) # all categorical features are directly encoded as float assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) -