diff --git a/q2_diversity_lib/alpha.py b/q2_diversity_lib/alpha.py index e5f8bd8..a67644a 100644 --- a/q2_diversity_lib/alpha.py +++ b/q2_diversity_lib/alpha.py @@ -11,11 +11,8 @@ import functools import skbio.diversity -from skbio.diversity._util import _validate_counts_vector import skbio.diversity.alpha -from scipy.special import gammaln - import biom from q2_types.feature_table import BIOMV210Format @@ -26,6 +23,11 @@ _validate_requested_cpus, _omp_cmd_wrapper) +from q2_diversity_lib.skbio._methods import (_berger_parker, _brillouin_d, + _simpsons_dominance, _esty_ci, + _goods_coverage, _margalef, + _mcintosh_d, _strong, _shannon, + _p_evenness) METRICS = { 'PHYLO': { @@ -145,76 +147,3 @@ def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series: metric)) results = pd.Series(results, index=table.ids(), name=metric) return results - - -# c&p methods from skbio -def _berger_parker(counts): - counts = _validate_counts_vector(counts) - return counts.max() / counts.sum() - - -def _brillouin_d(counts): - counts = _validate_counts_vector(counts) - nz = counts[counts.nonzero()] - n = nz.sum() - return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n - - -def _simpsons_dominance(counts): - counts = _validate_counts_vector(counts) - return 1 - skbio.diversity.alpha.dominance(counts) - - -def _esty_ci(counts): - counts = _validate_counts_vector(counts) - - f1 = skbio.diversity.alpha.singles(counts) - f2 = skbio.diversity.alpha.doubles(counts) - n = counts.sum() - z = 1.959963985 - W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3) - - return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W) - - -def _goods_coverage(counts): - counts = _validate_counts_vector(counts) - f1 = skbio.diversity.alpha.singles(counts) - N = counts.sum() - return 1 - (f1 / N) - - -def _margalef(counts): - counts = _validate_counts_vector(counts) - # replaced observed_otu call to sobs - return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum()) - - -def _mcintosh_d(counts): - counts = _validate_counts_vector(counts) - u = np.sqrt((counts * counts).sum()) - n = counts.sum() - return (n - u) / (n - np.sqrt(n)) - - -def _strong(counts): - counts = _validate_counts_vector(counts) - n = counts.sum() - # replaced observed_otu call to sobs - s = skbio.diversity.alpha.sobs(counts) - i = np.arange(1, len(counts) + 1) - sorted_sum = np.sort(counts)[::-1].cumsum() - return (sorted_sum / n - (i / s)).max() - - -def _p_evenness(counts): - counts = _validate_counts_vector(counts) - return _shannon(counts, base=np.e) / np.log( - skbio.diversity.alpha.sobs(counts=counts)) - - -def _shannon(counts, base=2): - counts = _validate_counts_vector(counts) - freqs = counts / counts.sum() - nonzero_freqs = freqs[freqs.nonzero()] - return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base) diff --git a/q2_diversity_lib/skbio/LICENSE b/q2_diversity_lib/skbio/LICENSE new file mode 100644 index 0000000..cdeb56f --- /dev/null +++ b/q2_diversity_lib/skbio/LICENSE @@ -0,0 +1,29 @@ +# sourced from https://github.com/scikit-bio/scikit-bio/blob/main/LICENSE.txt + +Copyright (c) 2013--, scikit-bio development team. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +* Neither the names scikit-bio, skbio, or biocore nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/q2_diversity_lib/skbio/__init__.py b/q2_diversity_lib/skbio/__init__.py new file mode 100644 index 0000000..7f5ff2e --- /dev/null +++ b/q2_diversity_lib/skbio/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2013--, scikit-bio development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_diversity_lib/skbio/_methods.py b/q2_diversity_lib/skbio/_methods.py new file mode 100644 index 0000000..339843d --- /dev/null +++ b/q2_diversity_lib/skbio/_methods.py @@ -0,0 +1,79 @@ +import numpy as np + +from skbio.diversity._util import _validate_counts_vector +import skbio.diversity.alpha + +from scipy.special import gammaln + + +# c&p methods from skbio +def _berger_parker(counts): + counts = _validate_counts_vector(counts) + return counts.max() / counts.sum() + + +def _brillouin_d(counts): + counts = _validate_counts_vector(counts) + nz = counts[counts.nonzero()] + n = nz.sum() + return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n + + +def _simpsons_dominance(counts): + counts = _validate_counts_vector(counts) + return 1 - skbio.diversity.alpha.dominance(counts) + + +def _esty_ci(counts): + counts = _validate_counts_vector(counts) + + f1 = skbio.diversity.alpha.singles(counts) + f2 = skbio.diversity.alpha.doubles(counts) + n = counts.sum() + z = 1.959963985 + W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3) + + return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W) + + +def _goods_coverage(counts): + counts = _validate_counts_vector(counts) + f1 = skbio.diversity.alpha.singles(counts) + N = counts.sum() + return 1 - (f1 / N) + + +def _margalef(counts): + counts = _validate_counts_vector(counts) + # replaced observed_otu call to sobs + return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum()) + + +def _mcintosh_d(counts): + counts = _validate_counts_vector(counts) + u = np.sqrt((counts * counts).sum()) + n = counts.sum() + return (n - u) / (n - np.sqrt(n)) + + +def _strong(counts): + counts = _validate_counts_vector(counts) + n = counts.sum() + # replaced observed_otu call to sobs + s = skbio.diversity.alpha.sobs(counts) + i = np.arange(1, len(counts) + 1) + sorted_sum = np.sort(counts)[::-1].cumsum() + return (sorted_sum / n - (i / s)).max() + + +def _p_evenness(counts): + counts = _validate_counts_vector(counts) + return _shannon(counts, base=np.e) / np.log( + skbio.diversity.alpha.sobs(counts=counts)) + + +def _shannon(counts, base=2): + counts = _validate_counts_vector(counts) + freqs = counts / counts.sum() + nonzero_freqs = freqs[freqs.nonzero()] + return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base) diff --git a/q2_diversity_lib/skbio/test_methods.py b/q2_diversity_lib/skbio/test_methods.py new file mode 100644 index 0000000..8215cab --- /dev/null +++ b/q2_diversity_lib/skbio/test_methods.py @@ -0,0 +1,76 @@ +import numpy as np +import numpy.testing as npt + +from qiime2.plugin.testing import TestPluginBase + +from q2_diversity_lib.skbio._methods import (_berger_parker, _brillouin_d, + _simpsons_dominance, _esty_ci, + _goods_coverage, _margalef, + _mcintosh_d, _strong) + + +class SkbioTests(TestPluginBase): + package = 'q2_diversity_lib.skbio' + +# tests for passthrough metrics were sourced from skbio + def test_berger_parker_d(self): + self.assertEqual(_berger_parker(np.array([5, 5])), 0.5) + self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25) + + def test_brillouin_d(self): + self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])), + 0.86289353018248782) + + def test_esty_ci(self): + def _diversity(indices, f): + """Calculate diversity index for each window of size 1. + indices: vector of indices of taxa + f: f(counts) -> diversity measure + """ + result = [] + max_size = max(indices) + 1 + freqs = np.zeros(max_size, dtype=int) + for i in range(len(indices)): + freqs += np.bincount(indices[i:i + 1], minlength=max_size) + try: + curr = f(freqs) + except (ZeroDivisionError, FloatingPointError): + curr = 0 + result.append(curr) + return np.array(result) + + data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4] + + observed_lower, observed_upper = zip(*_diversity(data, _esty_ci)) + + expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465, + -0.15060902, -0.04386191, -0.33042054, + -0.29041008, -0.43554755, -0.33385652]) + expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465, + 0.55060902, 0.71052858, 0.61613483, + 0.54041008, 0.43554755, 0.53385652]) + + npt.assert_array_almost_equal(observed_lower, expected_lower) + npt.assert_array_almost_equal(observed_upper, expected_upper) + + def test_simpson(self): + self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])), + 0.66) + self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0) + + def test_goods_coverage(self): + counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4] + obs = _goods_coverage(counts) + self.assertAlmostEqual(obs, 0.23469387755) + + def test_margalef(self): + + self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])), + 8 / np.log(22)) + + def test_mcintosh_d(self): + self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])), + 0.636061424871458) + + def test_strong(self): + self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)