MAINT: moves skbio methods to separate subdir with copyright info (#66)

qiime2 · May 15, 2024 · 62b5ec3 · 62b5ec3
1 parent 983125e
commit 62b5ec3
Show file tree

Hide file tree

Showing 5 changed files with 196 additions and 76 deletions.
diff --git a/q2_diversity_lib/alpha.py b/q2_diversity_lib/alpha.py
@@ -11,11 +11,8 @@
 import functools
 
 import skbio.diversity
-from skbio.diversity._util import _validate_counts_vector
 import skbio.diversity.alpha
 
-from scipy.special import gammaln
-
 import biom
 
 from q2_types.feature_table import BIOMV210Format
@@ -26,6 +23,11 @@
                     _validate_requested_cpus,
                     _omp_cmd_wrapper)
 
+from q2_diversity_lib.skbio._methods import (_berger_parker, _brillouin_d,
+                                             _simpsons_dominance, _esty_ci,
+                                             _goods_coverage, _margalef,
+                                             _mcintosh_d, _strong, _shannon,
+                                             _p_evenness)
 
 METRICS = {
     'PHYLO': {
@@ -145,76 +147,3 @@ def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
                                                           metric))
     results = pd.Series(results, index=table.ids(), name=metric)
     return results
-
-
-# c&p methods from skbio
-def _berger_parker(counts):
-    counts = _validate_counts_vector(counts)
-    return counts.max() / counts.sum()
-
-
-def _brillouin_d(counts):
-    counts = _validate_counts_vector(counts)
-    nz = counts[counts.nonzero()]
-    n = nz.sum()
-    return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n
-
-
-def _simpsons_dominance(counts):
-    counts = _validate_counts_vector(counts)
-    return 1 - skbio.diversity.alpha.dominance(counts)
-
-
-def _esty_ci(counts):
-    counts = _validate_counts_vector(counts)
-
-    f1 = skbio.diversity.alpha.singles(counts)
-    f2 = skbio.diversity.alpha.doubles(counts)
-    n = counts.sum()
-    z = 1.959963985
-    W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)
-
-    return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W)
-
-
-def _goods_coverage(counts):
-    counts = _validate_counts_vector(counts)
-    f1 = skbio.diversity.alpha.singles(counts)
-    N = counts.sum()
-    return 1 - (f1 / N)
-
-
-def _margalef(counts):
-    counts = _validate_counts_vector(counts)
-    # replaced observed_otu call to sobs
-    return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum())
-
-
-def _mcintosh_d(counts):
-    counts = _validate_counts_vector(counts)
-    u = np.sqrt((counts * counts).sum())
-    n = counts.sum()
-    return (n - u) / (n - np.sqrt(n))
-
-
-def _strong(counts):
-    counts = _validate_counts_vector(counts)
-    n = counts.sum()
-    # replaced observed_otu call to sobs
-    s = skbio.diversity.alpha.sobs(counts)
-    i = np.arange(1, len(counts) + 1)
-    sorted_sum = np.sort(counts)[::-1].cumsum()
-    return (sorted_sum / n - (i / s)).max()
-
-
-def _p_evenness(counts):
-    counts = _validate_counts_vector(counts)
-    return _shannon(counts, base=np.e) / np.log(
-        skbio.diversity.alpha.sobs(counts=counts))
-
-
-def _shannon(counts, base=2):
-    counts = _validate_counts_vector(counts)
-    freqs = counts / counts.sum()
-    nonzero_freqs = freqs[freqs.nonzero()]
-    return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base)
diff --git a/q2_diversity_lib/skbio/LICENSE b/q2_diversity_lib/skbio/LICENSE
@@ -0,0 +1,29 @@
+# sourced from https://github.com/scikit-bio/scikit-bio/blob/main/LICENSE.txt
+
+Copyright (c) 2013--, scikit-bio development team.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+* Neither the names scikit-bio, skbio, or biocore nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/q2_diversity_lib/skbio/__init__.py b/q2_diversity_lib/skbio/__init__.py
@@ -0,0 +1,7 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
diff --git a/q2_diversity_lib/skbio/_methods.py b/q2_diversity_lib/skbio/_methods.py
@@ -0,0 +1,79 @@
+import numpy as np
+
+from skbio.diversity._util import _validate_counts_vector
+import skbio.diversity.alpha
+
+from scipy.special import gammaln
+
+
+# c&p methods from skbio
+def _berger_parker(counts):
+    counts = _validate_counts_vector(counts)
+    return counts.max() / counts.sum()
+
+
+def _brillouin_d(counts):
+    counts = _validate_counts_vector(counts)
+    nz = counts[counts.nonzero()]
+    n = nz.sum()
+    return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n
+
+
+def _simpsons_dominance(counts):
+    counts = _validate_counts_vector(counts)
+    return 1 - skbio.diversity.alpha.dominance(counts)
+
+
+def _esty_ci(counts):
+    counts = _validate_counts_vector(counts)
+
+    f1 = skbio.diversity.alpha.singles(counts)
+    f2 = skbio.diversity.alpha.doubles(counts)
+    n = counts.sum()
+    z = 1.959963985
+    W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)
+
+    return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W)
+
+
+def _goods_coverage(counts):
+    counts = _validate_counts_vector(counts)
+    f1 = skbio.diversity.alpha.singles(counts)
+    N = counts.sum()
+    return 1 - (f1 / N)
+
+
+def _margalef(counts):
+    counts = _validate_counts_vector(counts)
+    # replaced observed_otu call to sobs
+    return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum())
+
+
+def _mcintosh_d(counts):
+    counts = _validate_counts_vector(counts)
+    u = np.sqrt((counts * counts).sum())
+    n = counts.sum()
+    return (n - u) / (n - np.sqrt(n))
+
+
+def _strong(counts):
+    counts = _validate_counts_vector(counts)
+    n = counts.sum()
+    # replaced observed_otu call to sobs
+    s = skbio.diversity.alpha.sobs(counts)
+    i = np.arange(1, len(counts) + 1)
+    sorted_sum = np.sort(counts)[::-1].cumsum()
+    return (sorted_sum / n - (i / s)).max()
+
+
+def _p_evenness(counts):
+    counts = _validate_counts_vector(counts)
+    return _shannon(counts, base=np.e) / np.log(
+        skbio.diversity.alpha.sobs(counts=counts))
+
+
+def _shannon(counts, base=2):
+    counts = _validate_counts_vector(counts)
+    freqs = counts / counts.sum()
+    nonzero_freqs = freqs[freqs.nonzero()]
+    return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base)
diff --git a/q2_diversity_lib/skbio/test_methods.py b/q2_diversity_lib/skbio/test_methods.py
@@ -0,0 +1,76 @@
+import numpy as np
+import numpy.testing as npt
+
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_diversity_lib.skbio._methods import (_berger_parker, _brillouin_d,
+                                             _simpsons_dominance, _esty_ci,
+                                             _goods_coverage, _margalef,
+                                             _mcintosh_d, _strong)
+
+
+class SkbioTests(TestPluginBase):
+    package = 'q2_diversity_lib.skbio'
+
+# tests for passthrough metrics were sourced from skbio
+    def test_berger_parker_d(self):
+        self.assertEqual(_berger_parker(np.array([5, 5])), 0.5)
+        self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25)
+
+    def test_brillouin_d(self):
+        self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])),
+                               0.86289353018248782)
+
+    def test_esty_ci(self):
+        def _diversity(indices, f):
+            """Calculate diversity index for each window of size 1.
+            indices: vector of indices of taxa
+            f: f(counts) -> diversity measure
+            """
+            result = []
+            max_size = max(indices) + 1
+            freqs = np.zeros(max_size, dtype=int)
+            for i in range(len(indices)):
+                freqs += np.bincount(indices[i:i + 1], minlength=max_size)
+                try:
+                    curr = f(freqs)
+                except (ZeroDivisionError, FloatingPointError):
+                    curr = 0
+                result.append(curr)
+            return np.array(result)
+
+        data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]
+
+        observed_lower, observed_upper = zip(*_diversity(data, _esty_ci))
+
+        expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
+                                   -0.15060902, -0.04386191, -0.33042054,
+                                   -0.29041008, -0.43554755, -0.33385652])
+        expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
+                                   0.55060902, 0.71052858, 0.61613483,
+                                   0.54041008, 0.43554755, 0.53385652])
+
+        npt.assert_array_almost_equal(observed_lower, expected_lower)
+        npt.assert_array_almost_equal(observed_upper, expected_upper)
+
+    def test_simpson(self):
+        self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])),
+                               0.66)
+        self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0)
+
+    def test_goods_coverage(self):
+        counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4]
+        obs = _goods_coverage(counts)
+        self.assertAlmostEqual(obs, 0.23469387755)
+
+    def test_margalef(self):
+
+        self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])),
+                         8 / np.log(22))
+
+    def test_mcintosh_d(self):
+        self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])),
+                               0.636061424871458)
+
+    def test_strong(self):
+        self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)