Skip to content

Commit

Permalink
Update sklearn compatibility for dataframes
Browse files Browse the repository at this point in the history
  • Loading branch information
fraimondo committed Nov 19, 2024
1 parent 418b8a5 commit 44edd92
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 40 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,6 @@ cython_debug/
# OS Stuff
.DS_store

opnmf/_version.py

scratch/
71 changes: 47 additions & 24 deletions opnmf/model.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.base import (
BaseEstimator,
ClassNamePrefixFeaturesOutMixin,
TransformerMixin,
)
from sklearn.utils.validation import check_is_fitted

from . opnmf import opnmf
from . selection import rank_permute
from . logging import logger
from .logging import logger
from .opnmf import opnmf
from .selection import rank_permute


class OPNMF(TransformerMixin, BaseEstimator):
""" orthogonal projective non-negative matrix factorization
class OPNMF(TransformerMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin):
"""orthogonal projective non-negative matrix factorization
Parameters
----------
Expand All @@ -21,7 +26,7 @@ class OPNMF(TransformerMixin, BaseEstimator):
Maximum number of iterations before timing out. Defaults to 200.
tol: float, default=1e-4
Tolerance of the stopping condition.
init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None
init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}
Method used to initialize the procedure.
Valid options:
Expand All @@ -37,17 +42,18 @@ class OPNMF(TransformerMixin, BaseEstimator):
for when sparsity is not desired)
* 'custom': use custom matrix W.
Default is 'nndsvd'.
"""

def __init__(self, n_components=10, max_iter=50000, tol=1e-5,
init='nndsvd'):
def __init__(self, n_components=10, max_iter=50000, tol=1e-5, init="nndsvd"):
    """Store the estimator hyperparameters.

    Per scikit-learn convention, ``__init__`` only records the
    parameters verbatim; all validation and computation happen in
    ``fit``/``fit_transform``.
    """
    self.n_components = n_components
    self.max_iter = max_iter
    self.tol = tol
    self.init = init

def fit(self, X, init_W=None):
""" Learn a OPNMF model for the data X.
def fit(self, X, y=None, init_W=None):
"""Learn a OPNMF model for the data X.
Parameters
----------
Expand All @@ -63,8 +69,8 @@ def fit(self, X, init_W=None):
self.fit_transform(X, init_W=init_W)
return self

def fit_transform(self, X, init_W=None):
""" Learn a OPNMF model for the data X and returns the transformed
def fit_transform(self, X, y=None, init_W=None):
"""Learn a OPNMF model for the data X and returns the transformed
data.
Parameters
Expand All @@ -80,10 +86,10 @@ def fit_transform(self, X, init_W=None):
Transformed data
"""

if self.n_components == 'auto' or isinstance(self.n_components, range):
logger.info('Doing rank selection')
if self.n_components == 'auto':
logger.info('Determining number of components automatically')
if self.n_components == "auto" or isinstance(self.n_components, range):
logger.info("Doing rank selection")
if self.n_components == "auto":
logger.info("Determining number of components automatically")
min_components = 1
max_components = X.shape[0]
step = 1
Expand All @@ -92,9 +98,15 @@ def fit_transform(self, X, init_W=None):
max_components = range.stop
step = range.step
out = rank_permute(
X, min_components, max_components, step=step,
max_iter=self.max_iter, tolerance=self.tol, init=self.init,
init_W=init_W)
X,
min_components,
max_components,
step=step,
max_iter=self.max_iter,
tolerance=self.tol,
init=self.init,
init_W=init_W,
)
good_ranks, ranks, errors, random_errors, estimators = out
chosen = estimators[good_ranks[0] - 1]
W = chosen.coef_
Expand All @@ -105,13 +117,19 @@ def fit_transform(self, X, init_W=None):
self.random_errors_ = random_errors
self.good_ranks_ = good_ranks
elif not np.issubdtype(type(self.n_components), int):
raise ValueError('Do not know how to factorize to '
f'{self.n_components} components')
raise ValueError(
"Do not know how to factorize to " f"{self.n_components} components"
)
else:
# Run factorization
W, H, mse = opnmf(
X, n_components=self.n_components, max_iter=self.max_iter,
tol=self.tol, init=self.init, init_W=init_W)
X.values if isinstance(X, pd.DataFrame) else X,
n_components=self.n_components,
max_iter=self.max_iter,
tol=self.tol,
init=self.init,
init_W=init_W,
)

# Set model variables
self.coef_ = W
Expand Down Expand Up @@ -139,3 +157,8 @@ def transform(self, X):
def mse(self):
    """Reconstruction error of the fitted model.

    Computed by :func:`opnmf.opnmf.opnmf` as the Frobenius norm of
    ``X - W @ H`` during fitting.

    Raises
    ------
    NotFittedError
        If the estimator has not been fitted yet.
    """
    check_is_fitted(self)
    return self.mse_

@property
def _n_features_out(self):
    """Number of transformed output features.

    Consumed by ``ClassNamePrefixFeaturesOutMixin`` to implement
    ``get_feature_names_out``. Returns the fitted component count
    ``n_components_`` (set during ``fit_transform``), which may differ
    from the ``n_components`` constructor argument when rank selection
    was requested.
    """
    return self.n_components_
37 changes: 21 additions & 16 deletions opnmf/opnmf.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import warnings

import numpy as np

from sklearn.decomposition._nmf import _initialize_nmf

from . logging import logger, warn
from .logging import logger, warn


def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd',
init_W=None):
def opnmf(X, n_components, max_iter=50000, tol=1e-5, init="nndsvd", init_W=None):
"""
Orthogonal projective non-negative matrix factorization.
Expand Down Expand Up @@ -49,11 +49,13 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd',
mse : float
Reconstruction error
"""
if init != 'custom':
if init != "custom":
if init_W is not None:
warn('Initialisation was not set to "custom" but an initial W '
'matrix was specified. This matrix will be ignored.')
logger.info(f'Initializing using {init}')
warn(
'Initialisation was not set to "custom" but an initial W '
"matrix was specified. This matrix will be ignored."
)
logger.info(f"Initializing using {init}")
W, _ = _initialize_nmf(X, n_components, init=init)
init_W = None
else:
Expand All @@ -73,25 +75,28 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd',
W[W < 1e-16] = 1e-16
W = W / np.linalg.norm(W, ord=2)

delta_W = (np.linalg.norm(old_W - W, ord='fro') /
np.linalg.norm(old_W, ord='fro'))
delta_W = np.linalg.norm(old_W - W, ord="fro") / np.linalg.norm(
old_W, ord="fro"
)
if (iter % 100) == 0:
obj = np.linalg.norm(X - (W @ (W.T @ X)), ord='fro')
logger.info(f'iter={iter} diff={delta_W}, obj={obj}')
obj = np.linalg.norm(X - (W @ (W.T @ X)), ord="fro")
logger.info(f"iter={iter} diff={delta_W}, obj={obj}")
if delta_W < tol:
logger.info(f'Converged in {iter} iterations')
logger.info(f"Converged in {iter} iterations")
break

if delta_W > tol:
warn('OPNMF did not converge with '
f'tolerance = {tol} under {max_iter} iterations')
warn(
"OPNMF did not converge with "
f"tolerance = {tol} under {max_iter} iterations"
)

H = W.T @ X

hlen = np.linalg.norm(H, ord=2, axis=1)
n_zero = np.sum(hlen == 0)
if n_zero > 0:
warnings.warn(f'low rank: {n_zero} factors have norm 0')
warnings.warn(f"low rank: {n_zero} factors have norm 0")
hlen[hlen == 0] = 1

Wh = W * hlen
Expand All @@ -100,6 +105,6 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd',
W = W[:, idx]
H = W.T @ X

mse = np.linalg.norm(X - (W @ H), ord='fro')
mse = np.linalg.norm(X - (W @ H), ord="fro")

return W, H, mse

0 comments on commit 44edd92

Please sign in to comment.