test and method hook up for skbio pass through metrics

qiime2 · May 1, 2024 · f6acf9f · f6acf9f
1 parent 9eeb6ff
commit f6acf9f
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 45 deletions.
diff --git a/q2_diversity_lib/alpha.py b/q2_diversity_lib/alpha.py
@@ -8,14 +8,15 @@
 
 import pandas as pd
 import numpy as np
+import functools
 
 import skbio.diversity
 from skbio.diversity._util import _validate_counts_vector
+import skbio.diversity.alpha
 
 from scipy.special import gammaln
 
 import biom
-import numpy as np
 
 from q2_types.feature_table import BIOMV210Format
 from q2_types.sample_data import AlphaDiversityFormat
@@ -69,6 +70,7 @@ def _skbio_alpha_diversity_from_1d(v, metric):
                                              counts=v,
                                              ids=['placeholder', ],
                                              validate=False)
+    print(result.iloc[0])
     return result.iloc[0]
 
 
@@ -97,17 +99,12 @@ def transform_(v, i, m):
 
     results = []
     for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+        # using in-house metrics temporarily
+        # results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+        v = np.reshape(v, (1, len(v)))
+        results.extend([_p_evenness(c)for c in v])
     results = pd.Series(results, index=table.ids(), name='pielou_evenness')
     return results
-# legacy code
-    for partition in _partition(table):
-        counts = partition.matrix_data.T.toarray()
-        sample_ids = partition.ids(axis='sample')
-        results = [_p_evenness(c, sample_ids)for c in counts]
-    result = pd.Series(results, index=sample_ids)
-    result.name = 'pielou_evenness'
-    return result
 
 
 @_validate_tables
@@ -118,29 +115,35 @@ def shannon_entropy(table: biom.Table,
 
     results = []
     for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+        # using in-house metrics temporarily
+        # results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+        v = np.reshape(v, (1, len(v)))
+        results.extend([_shannon(c)for c in v])
     results = pd.Series(results, index=table.ids(), name='shannon_entropy')
     return results
-#LEGACY
-    for partition in _partition(table):
-        counts = partition.matrix_data.T.toarray()
-        sample_ids = partition.ids(axis='sample')
-        # TODO replace with internal shannons method
-        results = [_shannon(c, sample_ids)for c in counts]
-    result = pd.Series(results, index=sample_ids)
-    result.name = 'shannon_entropy'
-    return result
 
 
 @_validate_tables
 def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
     results = []
-
-    for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v.astype(int), metric))
-        # TODO write if statements for hard coding the following metrics:
-        # berger-parker,brillion, simpsons D, etsy, goods, margalef's, mcIntosh,
-        # strongs
+    method_map = {"berger_parker_d": _berger_parker,
+                  "brillouin_d": _brillouin_d,
+                  "simpson": _simpsons_dominance,
+                  "esty_ci": _esty_ci,
+                  "goods_coverage": _goods_coverage,
+                  "margalef": _margalef,
+                  "mcintosh_d": _mcintosh_d,
+                  "strong": _strong}
+
+    if metric in method_map:
+        metric = functools.partial(method_map[metric])
+        for v in table.iter_data(dense=True):
+            v = np.reshape(v, (1, len(v)))
+            results.extend([metric(c)for c in v])
+    else:
+        for v in table.iter_data(dense=True):
+            results.append(_skbio_alpha_diversity_from_1d(v.astype(int),
+                                                          metric))
     results = pd.Series(results, index=table.ids(), name=metric)
     return results
 
@@ -160,15 +163,14 @@ def _brillouin_d(counts):
 
 def _simpsons_dominance(counts):
     counts = _validate_counts_vector(counts)
-    freqs = counts / counts.sum()
-    return (freqs * freqs).sum()
+    return 1 - skbio.diversity.alpha.dominance(counts)
 
 
-def _etsy_ci(counts):
+def _esty_ci(counts):
     counts = _validate_counts_vector(counts)
 
-    f1 = _singles(counts)
-    f2 = _doubles(counts)
+    f1 = skbio.diversity.alpha.singles(counts)
+    f2 = skbio.diversity.alpha.doubles(counts)
     n = counts.sum()
     z = 1.959963985
     W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)
@@ -178,7 +180,7 @@ def _etsy_ci(counts):
 
 def _goods_coverage(counts):
     counts = _validate_counts_vector(counts)
-    f1 = _singles(counts)
+    f1 = skbio.diversity.alpha.singles(counts)
     N = counts.sum()
     return 1 - (f1 / N)
 
@@ -206,17 +208,7 @@ def _strong(counts):
     return (sorted_sum / n - (i / s)).max()
 
 
-def _singles(counts):
-    counts = _validate_counts_vector(counts)
-    return (counts == 1).sum()
-
-
-def _doubles(counts):
-    counts = _validate_counts_vector(counts)
-    return (counts == 2).sum()
-
-
-def _p_evenness(counts, sample_ids):
+def _p_evenness(counts):
     counts = _validate_counts_vector(counts)
     return _shannon(counts, base=np.e) / np.log(
         skbio.diversity.alpha.sobs(counts=counts))

diff --git a/q2_diversity_lib/tests/test_alpha.py b/q2_diversity_lib/tests/test_alpha.py
@@ -9,6 +9,7 @@
 from subprocess import CalledProcessError
 
 import numpy as np
+import numpy.testing as npt
 import pandas as pd
 import pandas.testing as pdt
 import biom
@@ -17,7 +18,12 @@
 from qiime2 import Artifact
 
 from ..alpha import (pielou_evenness, observed_features,
-                     shannon_entropy, METRICS)
+                     shannon_entropy, METRICS,
+                     _berger_parker, _brillouin_d,
+                     _simpsons_dominance, _esty_ci,
+                     _goods_coverage, _margalef,
+                     _mcintosh_d, _strong
+                     )
 
 
 class SmokeTests(TestPluginBase):
@@ -154,7 +160,9 @@ def test_drop_undefined_samples(self):
                                          [0, 0, 0, 1, 0, 1]]),
                                ['A', 'B', 'C'],
                                ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
-        expected = pd.Series({'S5': 1, 'S6': 1}, name='pielou_evenness')
+        # pandas supports floating point correction for float dtype only,
+        # these 1 ints were changed to 1.0 floats to get that support
+        expected = pd.Series({'S5': 1.0, 'S6': 1.0}, name='pielou_evenness')
         actual = pielou_evenness(table=NaN_table, drop_undefined_samples=True)
         pdt.assert_series_equal(actual, expected, check_dtype=False)
 
@@ -250,3 +258,68 @@ def test_passed_implemented_metric(self):
         for metric in METRICS['NONPHYLO']['IMPL']:
             with self.assertRaisesRegex(TypeError, f"{metric}.*incompatible"):
                 self.method(table=self.crawford_tbl, metric=metric)
+
+# tests for passthrough metrics were sourced from skbio
+    def test_berger_parker_d(self):
+        self.assertEqual(_berger_parker(np.array([5, 5])), 0.5)
+        self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25)
+
+    def test_brillouin_d(self):
+        self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])),
+                               0.86289353018248782)
+
+    def test_esty_ci(self):
+        def _diversity(indices, f):
+            """Calculate diversity index for each window of size 1.
+
+            indices: vector of indices of taxa
+            f: f(counts) -> diversity measure
+
+            """
+            result = []
+            max_size = max(indices) + 1
+            freqs = np.zeros(max_size, dtype=int)
+            for i in range(len(indices)):
+                freqs += np.bincount(indices[i:i + 1], minlength=max_size)
+                try:
+                    curr = f(freqs)
+                except (ZeroDivisionError, FloatingPointError):
+                    curr = 0
+                result.append(curr)
+            return np.array(result)
+
+        data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]
+
+        observed_lower, observed_upper = zip(*_diversity(data, _esty_ci))
+
+        expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
+                                   -0.15060902, -0.04386191, -0.33042054,
+                                   -0.29041008, -0.43554755, -0.33385652])
+        expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
+                                   0.55060902, 0.71052858, 0.61613483,
+                                   0.54041008, 0.43554755, 0.53385652])
+
+        npt.assert_array_almost_equal(observed_lower, expected_lower)
+        npt.assert_array_almost_equal(observed_upper, expected_upper)
+
+    def test_simpson(self):
+        self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])),
+                               0.66)
+        self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0)
+
+    def test_goods_coverage(self):
+        counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4]
+        obs = _goods_coverage(counts)
+        self.assertAlmostEqual(obs, 0.23469387755)
+
+    def test_margalef(self):
+
+        self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])),
+                         8 / np.log(22))
+
+    def test_mcintosh_d(self):
+        self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])),
+                               0.636061424871458)
+
+    def test_strong(self):
+        self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)