From 44edd923ce07bec791126ff84bd0f7105544d3cf Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Tue, 19 Nov 2024 09:31:15 +0100 Subject: [PATCH] Update sklearn compatibility for dataframes --- .gitignore | 2 ++ opnmf/model.py | 71 +++++++++++++++++++++++++++++++++----------------- opnmf/opnmf.py | 37 ++++++++++++++------------ 3 files changed, 70 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 6023f51..9460e12 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,6 @@ cython_debug/ # OS Stuff .DS_store +opnmf/_version.py + scratch/ \ No newline at end of file diff --git a/opnmf/model.py b/opnmf/model.py index f57c6bd..f20e133 100644 --- a/opnmf/model.py +++ b/opnmf/model.py @@ -1,14 +1,19 @@ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +import pandas as pd +from sklearn.base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, +) from sklearn.utils.validation import check_is_fitted -from . opnmf import opnmf -from . selection import rank_permute -from . logging import logger +from .logging import logger +from .opnmf import opnmf +from .selection import rank_permute -class OPNMF(TransformerMixin, BaseEstimator): - """ orthogonal projective non-negative matrix factorization +class OPNMF(TransformerMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin): + """orthogonal projective non-negative matrix factorization Parameters ---------- @@ -21,7 +26,7 @@ class OPNMF(TransformerMixin, BaseEstimator): Maximum number of iterations before timing out. Defaults to 200. tol: float, default=1e-4 Tolerance of the stopping condition. - init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'} Method used to initialize the procedure. Valid options: @@ -37,17 +42,18 @@ class OPNMF(TransformerMixin, BaseEstimator): for when sparsity is not desired) * 'custom': use custom matrix W. + Default is None + """ - def __init__(self, n_components=10, max_iter=50000, tol=1e-5, - init='nndsvd'): + def __init__(self, n_components=10, max_iter=50000, tol=1e-5, init="nndsvd"): self.n_components = n_components self.max_iter = max_iter self.tol = tol self.init = init - def fit(self, X, init_W=None): - """ Learn a OPNMF model for the data X. + def fit(self, X, y=None, init_W=None): + """Learn a OPNMF model for the data X. Parameters ---------- @@ -63,8 +69,8 @@ def fit(self, X, init_W=None): self.fit_transform(X, init_W=init_W) return self - def fit_transform(self, X, init_W=None): - """ Learn a OPNMF model for the data X and returns the transformed + def fit_transform(self, X, y=None, init_W=None): + """Learn a OPNMF model for the data X and returns the transformed data. 
Parameters @@ -80,10 +86,10 @@ def fit_transform(self, X, init_W=None): Transformed data """ - if self.n_components == 'auto' or isinstance(self.n_components, range): - logger.info('Doing rank selection') - if self.n_components == 'auto': - logger.info('Determining number of components automatically') + if self.n_components == "auto" or isinstance(self.n_components, range): + logger.info("Doing rank selection") + if self.n_components == "auto": + logger.info("Determining number of components automatically") min_components = 1 max_components = X.shape[0] step = 1 @@ -92,9 +98,15 @@ def fit_transform(self, X, init_W=None): max_components = range.stop step = range.step out = rank_permute( - X, min_components, max_components, step=step, - max_iter=self.max_iter, tolerance=self.tol, init=self.init, - init_W=init_W) + X, + min_components, + max_components, + step=step, + max_iter=self.max_iter, + tolerance=self.tol, + init=self.init, + init_W=init_W, + ) good_ranks, ranks, errors, random_errors, estimators = out chosen = estimators[good_ranks[0] - 1] W = chosen.coef_ @@ -105,13 +117,19 @@ def fit_transform(self, X, init_W=None): self.random_errors_ = random_errors self.good_ranks_ = good_ranks elif not np.issubdtype(type(self.n_components), int): - raise ValueError('Do not know how to factorize to ' - f'{self.n_components} components') + raise ValueError( + "Do not know how to factorize to " f"{self.n_components} components" + ) else: # Run factorization W, H, mse = opnmf( - X, n_components=self.n_components, max_iter=self.max_iter, - tol=self.tol, init=self.init, init_W=init_W) + X.values if isinstance(X, pd.DataFrame) else X, + n_components=self.n_components, + max_iter=self.max_iter, + tol=self.tol, + init=self.init, + init_W=init_W, + ) # Set model variables self.coef_ = W @@ -139,3 +157,8 @@ def transform(self, X): def mse(self): check_is_fitted(self) return self.mse_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ diff --git a/opnmf/opnmf.py b/opnmf/opnmf.py index 8743952..c4f2ff0 100644 --- a/opnmf/opnmf.py +++ b/opnmf/opnmf.py @@ -1,13 +1,13 @@ import warnings + import numpy as np from sklearn.decomposition._nmf import _initialize_nmf -from . logging import logger, warn +from .logging import logger, warn -def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', - init_W=None): +def opnmf(X, n_components, max_iter=50000, tol=1e-5, init="nndsvd", init_W=None): """ Orthogonal projective non-negative matrix factorization. @@ -49,11 +49,13 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', mse : float Reconstruction error """ - if init != 'custom': + if init != "custom": if init_W is not None: - warn('Initialisation was not set to "custom" but an initial W ' - 'matrix was specified. This matrix will be ignored.') - logger.info(f'Initializing using {init}') + warn( + 'Initialisation was not set to "custom" but an initial W ' + "matrix was specified. This matrix will be ignored." 
+ ) + logger.info(f"Initializing using {init}") W, _ = _initialize_nmf(X, n_components, init=init) init_W = None else: @@ -73,25 +75,28 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', W[W < 1e-16] = 1e-16 W = W / np.linalg.norm(W, ord=2) - delta_W = (np.linalg.norm(old_W - W, ord='fro') / - np.linalg.norm(old_W, ord='fro')) + delta_W = np.linalg.norm(old_W - W, ord="fro") / np.linalg.norm( + old_W, ord="fro" + ) if (iter % 100) == 0: - obj = np.linalg.norm(X - (W @ (W.T @ X)), ord='fro') - logger.info(f'iter={iter} diff={delta_W}, obj={obj}') + obj = np.linalg.norm(X - (W @ (W.T @ X)), ord="fro") + logger.info(f"iter={iter} diff={delta_W}, obj={obj}") if delta_W < tol: - logger.info(f'Converged in {iter} iterations') + logger.info(f"Converged in {iter} iterations") break if delta_W > tol: - warn('OPNMF did not converge with ' - f'tolerance = {tol} under {max_iter} iterations') + warn( + "OPNMF did not converge with " + f"tolerance = {tol} under {max_iter} iterations" + ) H = W.T @ X hlen = np.linalg.norm(H, ord=2, axis=1) n_zero = np.sum(hlen == 0) if n_zero > 0: - warnings.warn(f'low rank: {n_zero} factors have norm 0') + warnings.warn(f"low rank: {n_zero} factors have norm 0") hlen[hlen == 0] = 1 Wh = W * hlen @@ -100,6 +105,6 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', W = W[:, idx] H = W.T @ X - mse = np.linalg.norm(X - (W @ H), ord='fro') + mse = np.linalg.norm(X - (W @ H), ord="fro") return W, H, mse
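
Below is a short usage sketch, not part of the patch itself, of what the opnmf/model.py changes above are meant to enable: passing a pandas DataFrame directly to fit(), and getting named component features through ClassNamePrefixFeaturesOutMixin. The data, variable names, and the set_output() call are illustrative assumptions; set_output() requires scikit-learn >= 1.2 and assumes the transformed output has one column per component, as the new _n_features_out property implies.

import numpy as np
import pandas as pd

from opnmf.model import OPNMF

# Illustrative non-negative data; shapes and column names are arbitrary.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.random((40, 10)),
    columns=[f"feature_{i}" for i in range(10)],
)

model = OPNMF(n_components=3, init="nndsvd")

# A DataFrame is accepted directly: fit_transform() now unwraps X.values
# before calling opnmf(), and the added y=None argument lets the estimator
# be used inside scikit-learn Pipelines.
model.fit(X)

# ClassNamePrefixFeaturesOutMixin plus the _n_features_out property provide
# get_feature_names_out(), yielding names such as "opnmf0", "opnmf1", "opnmf2".
print(model.get_feature_names_out())

# With scikit-learn >= 1.2, transformed output can be requested as a pandas
# DataFrame with those names as columns (assuming one column per component).
# transform() itself is untouched by this patch, so the raw array is passed.
model.set_output(transform="pandas")
factors = model.transform(X.values)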
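
For the functional interface in opnmf/opnmf.py, whose changes above are purely formatting, a minimal sketch of a direct call follows; the data and the custom initialization matrix are illustrative assumptions, while the signature and return values are as documented in the patched file.

import numpy as np

from opnmf.opnmf import opnmf

# Illustrative non-negative matrix; any of the documented init options
# ("random", "nndsvd", "nndsvda", "nndsvdar", "custom") may be passed.
rng = np.random.default_rng(0)
X = rng.random((40, 10))

# W is the learned factor (here 40 x 3), H = W.T @ X, and mse is the
# Frobenius-norm reconstruction error computed at the end of opnmf().
W, H, mse = opnmf(X, n_components=3, max_iter=50000, tol=1e-5, init="nndsvd")

# Passing init_W without init="custom" triggers the warning reformatted above
# and the supplied matrix is ignored; a custom start is requested like this:
W0 = np.abs(rng.standard_normal((40, 3)))
W_c, H_c, mse_c = opnmf(X, n_components=3, init="custom", init_W=W0)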