From 44edd923ce07bec791126ff84bd0f7105544d3cf Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Tue, 19 Nov 2024 09:31:15 +0100 Subject: [PATCH] Update sklearn compatibility for dataframes --- .gitignore | 2 ++ opnmf/model.py | 71 +++++++++++++++++++++++++++++++++----------------- opnmf/opnmf.py | 37 ++++++++++++++------------ 3 files changed, 70 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 6023f51..9460e12 100644 --- a/.gitignore +++ b/.gitignore @@ -132,4 +132,6 @@ cython_debug/ # OS Stuff .DS_store +opnmf/_version.py + scratch/ \ No newline at end of file diff --git a/opnmf/model.py b/opnmf/model.py index f57c6bd..f20e133 100644 --- a/opnmf/model.py +++ b/opnmf/model.py @@ -1,14 +1,19 @@ import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +import pandas as pd +from sklearn.base import ( + BaseEstimator, + ClassNamePrefixFeaturesOutMixin, + TransformerMixin, +) from sklearn.utils.validation import check_is_fitted -from . opnmf import opnmf -from . selection import rank_permute -from . logging import logger +from .logging import logger +from .opnmf import opnmf +from .selection import rank_permute -class OPNMF(TransformerMixin, BaseEstimator): - """ orthogonal projective non-negative matrix factorization +class OPNMF(TransformerMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin): + """orthogonal projective non-negative matrix factorization Parameters ---------- @@ -21,7 +26,7 @@ class OPNMF(TransformerMixin, BaseEstimator): Maximum number of iterations before timing out. Defaults to 200. tol: float, default=1e-4 Tolerance of the stopping condition. - init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'}, default=None + init : {'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom'} Method used to initialize the procedure. Valid options: @@ -37,17 +42,18 @@ class OPNMF(TransformerMixin, BaseEstimator): for when sparsity is not desired) * 'custom': use custom matrix W. + Default is None + """ - def __init__(self, n_components=10, max_iter=50000, tol=1e-5, - init='nndsvd'): + def __init__(self, n_components=10, max_iter=50000, tol=1e-5, init="nndsvd"): self.n_components = n_components self.max_iter = max_iter self.tol = tol self.init = init - def fit(self, X, init_W=None): - """ Learn a OPNMF model for the data X. + def fit(self, X, y=None, init_W=None): + """Learn a OPNMF model for the data X. Parameters ---------- @@ -63,8 +69,8 @@ def fit(self, X, init_W=None): self.fit_transform(X, init_W=init_W) return self - def fit_transform(self, X, init_W=None): - """ Learn a OPNMF model for the data X and returns the transformed + def fit_transform(self, X, y=None, init_W=None): + """Learn a OPNMF model for the data X and returns the transformed data. 
Parameters @@ -80,10 +86,10 @@ def fit_transform(self, X, init_W=None): Transformed data """ - if self.n_components == 'auto' or isinstance(self.n_components, range): - logger.info('Doing rank selection') - if self.n_components == 'auto': - logger.info('Determining number of components automatically') + if self.n_components == "auto" or isinstance(self.n_components, range): + logger.info("Doing rank selection") + if self.n_components == "auto": + logger.info("Determining number of components automatically") min_components = 1 max_components = X.shape[0] step = 1 @@ -92,9 +98,15 @@ def fit_transform(self, X, init_W=None): max_components = range.stop step = range.step out = rank_permute( - X, min_components, max_components, step=step, - max_iter=self.max_iter, tolerance=self.tol, init=self.init, - init_W=init_W) + X, + min_components, + max_components, + step=step, + max_iter=self.max_iter, + tolerance=self.tol, + init=self.init, + init_W=init_W, + ) good_ranks, ranks, errors, random_errors, estimators = out chosen = estimators[good_ranks[0] - 1] W = chosen.coef_ @@ -105,13 +117,19 @@ def fit_transform(self, X, init_W=None): self.random_errors_ = random_errors self.good_ranks_ = good_ranks elif not np.issubdtype(type(self.n_components), int): - raise ValueError('Do not know how to factorize to ' - f'{self.n_components} components') + raise ValueError( + "Do not know how to factorize to " f"{self.n_components} components" + ) else: # Run factorization W, H, mse = opnmf( - X, n_components=self.n_components, max_iter=self.max_iter, - tol=self.tol, init=self.init, init_W=init_W) + X.values if isinstance(X, pd.DataFrame) else X, + n_components=self.n_components, + max_iter=self.max_iter, + tol=self.tol, + init=self.init, + init_W=init_W, + ) # Set model variables self.coef_ = W @@ -139,3 +157,8 @@ def transform(self, X): def mse(self): check_is_fitted(self) return self.mse_ + + @property + def _n_features_out(self): + """Number of transformed output features.""" + return self.n_components_ diff --git a/opnmf/opnmf.py b/opnmf/opnmf.py index 8743952..c4f2ff0 100644 --- a/opnmf/opnmf.py +++ b/opnmf/opnmf.py @@ -1,13 +1,13 @@ import warnings + import numpy as np from sklearn.decomposition._nmf import _initialize_nmf -from . logging import logger, warn +from .logging import logger, warn -def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', - init_W=None): +def opnmf(X, n_components, max_iter=50000, tol=1e-5, init="nndsvd", init_W=None): """ Orthogonal projective non-negative matrix factorization. @@ -49,11 +49,13 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', mse : float Reconstruction error """ - if init != 'custom': + if init != "custom": if init_W is not None: - warn('Initialisation was not set to "custom" but an initial W ' - 'matrix was specified. This matrix will be ignored.') - logger.info(f'Initializing using {init}') + warn( + 'Initialisation was not set to "custom" but an initial W ' + "matrix was specified. This matrix will be ignored." 
+ ) + logger.info(f"Initializing using {init}") W, _ = _initialize_nmf(X, n_components, init=init) init_W = None else: @@ -73,25 +75,28 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', W[W < 1e-16] = 1e-16 W = W / np.linalg.norm(W, ord=2) - delta_W = (np.linalg.norm(old_W - W, ord='fro') / - np.linalg.norm(old_W, ord='fro')) + delta_W = np.linalg.norm(old_W - W, ord="fro") / np.linalg.norm( + old_W, ord="fro" + ) if (iter % 100) == 0: - obj = np.linalg.norm(X - (W @ (W.T @ X)), ord='fro') - logger.info(f'iter={iter} diff={delta_W}, obj={obj}') + obj = np.linalg.norm(X - (W @ (W.T @ X)), ord="fro") + logger.info(f"iter={iter} diff={delta_W}, obj={obj}") if delta_W < tol: - logger.info(f'Converged in {iter} iterations') + logger.info(f"Converged in {iter} iterations") break if delta_W > tol: - warn('OPNMF did not converge with ' - f'tolerance = {tol} under {max_iter} iterations') + warn( + "OPNMF did not converge with " + f"tolerance = {tol} under {max_iter} iterations" + ) H = W.T @ X hlen = np.linalg.norm(H, ord=2, axis=1) n_zero = np.sum(hlen == 0) if n_zero > 0: - warnings.warn(f'low rank: {n_zero} factors have norm 0') + warnings.warn(f"low rank: {n_zero} factors have norm 0") hlen[hlen == 0] = 1 Wh = W * hlen @@ -100,6 +105,6 @@ def opnmf(X, n_components, max_iter=50000, tol=1e-5, init='nndsvd', W = W[:, idx] H = W.T @ X - mse = np.linalg.norm(X - (W @ H), ord='fro') + mse = np.linalg.norm(X - (W @ H), ord="fro") return W, H, mse
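
Below is a short usage sketch, not part of the patch itself, of what the opnmf/model.py changes above are meant to enable: passing a pandas DataFrame directly to fit(), and getting named component features through ClassNamePrefixFeaturesOutMixin. The data, variable names, and the set_output() call are illustrative assumptions; set_output() requires scikit-learn >= 1.2 and assumes the transformed output has one column per component, as the new _n_features_out property implies.

import numpy as np
import pandas as pd

from opnmf.model import OPNMF

# Illustrative non-negative data; shapes and column names are arbitrary.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.random((40, 10)),
    columns=[f"feature_{i}" for i in range(10)],
)

model = OPNMF(n_components=3, init="nndsvd")

# A DataFrame is accepted directly: fit_transform() now unwraps X.values
# before calling opnmf(), and the added y=None argument lets the estimator
# be used inside scikit-learn Pipelines.
model.fit(X)

# ClassNamePrefixFeaturesOutMixin plus the _n_features_out property provide
# get_feature_names_out(), yielding names such as "opnmf0", "opnmf1", "opnmf2".
print(model.get_feature_names_out())

# With scikit-learn >= 1.2, transformed output can be requested as a pandas
# DataFrame with those names as columns (assuming one column per component).
# transform() itself is untouched by this patch, so the raw array is passed.
model.set_output(transform="pandas")
factors = model.transform(X.values)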
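
For the functional interface in opnmf/opnmf.py, whose changes above are purely formatting, a minimal sketch of a direct call follows; the data and the custom initialization matrix are illustrative assumptions, while the signature and return values are as documented in the patched file.

import numpy as np

from opnmf.opnmf import opnmf

# Illustrative non-negative matrix; any of the documented init options
# ("random", "nndsvd", "nndsvda", "nndsvdar", "custom") may be passed.
rng = np.random.default_rng(0)
X = rng.random((40, 10))

# W is the learned factor (here 40 x 3), H = W.T @ X, and mse is the
# Frobenius-norm reconstruction error computed at the end of opnmf().
W, H, mse = opnmf(X, n_components=3, max_iter=50000, tol=1e-5, init="nndsvd")

# Passing init_W without init="custom" triggers the warning reformatted above
# and the supplied matrix is ignored; a custom start is requested like this:
W0 = np.abs(rng.standard_normal((40, 3)))
W_c, H_c, mse_c = opnmf(X, n_components=3, init="custom", init_W=W0)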