diff --git a/q2_diversity_lib/alpha.py b/q2_diversity_lib/alpha.py
index 7dc5b6c..e5f8bd8 100644
--- a/q2_diversity_lib/alpha.py
+++ b/q2_diversity_lib/alpha.py
@@ -7,9 +7,16 @@
 # ----------------------------------------------------------------------------
 
 import pandas as pd
+import numpy as np
+import functools
+
 import skbio.diversity
+from skbio.diversity._util import _validate_counts_vector
+import skbio.diversity.alpha
+
+from scipy.special import gammaln
+
 import biom
-import numpy as np
 
 from q2_types.feature_table import BIOMV210Format
 from q2_types.sample_data import AlphaDiversityFormat
@@ -91,7 +98,10 @@ def transform_(v, i, m):
 
     results = []
     for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+        # using in-house metrics temporarily
+        # results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
+        v = np.reshape(v, (1, len(v)))
+        results.extend([_p_evenness(c) for c in v])
     results = pd.Series(results, index=table.ids(), name='pielou_evenness')
     return results
 
@@ -104,7 +114,10 @@ def shannon_entropy(table: biom.Table,
 
     results = []
     for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+        # using in-house metrics temporarily
+        # results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
+        v = np.reshape(v, (1, len(v)))
+        results.extend([_shannon(c) for c in v])
     results = pd.Series(results, index=table.ids(), name='shannon_entropy')
     return results
 
@@ -112,8 +125,96 @@
 @_validate_tables
 def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
     results = []
-
-    for v in table.iter_data(dense=True):
-        results.append(_skbio_alpha_diversity_from_1d(v.astype(int), metric))
+    method_map = {"berger_parker_d": _berger_parker,
+                  "brillouin_d": _brillouin_d,
+                  "simpson": _simpsons_dominance,
+                  "esty_ci": _esty_ci,
+                  "goods_coverage": _goods_coverage,
+                  "margalef": _margalef,
+                  "mcintosh_d": _mcintosh_d,
+                  "strong": _strong}
+
+    if metric in method_map:
+        # bind the callable to a separate name so `metric` stays a string
+        # for the Series name below
+        metric_f = functools.partial(method_map[metric])
+        for v in table.iter_data(dense=True):
+            v = np.reshape(v, (1, len(v)))
+            results.extend([metric_f(c) for c in v])
+    else:
+        for v in table.iter_data(dense=True):
+            results.append(_skbio_alpha_diversity_from_1d(v.astype(int),
+                                                          metric))
     results = pd.Series(results, index=table.ids(), name=metric)
     return results
+
+
+# methods copied from skbio
+def _berger_parker(counts):
+    counts = _validate_counts_vector(counts)
+    return counts.max() / counts.sum()
+
+
+def _brillouin_d(counts):
+    counts = _validate_counts_vector(counts)
+    nz = counts[counts.nonzero()]
+    n = nz.sum()
+    return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n
+
+
+def _simpsons_dominance(counts):
+    counts = _validate_counts_vector(counts)
+    return 1 - skbio.diversity.alpha.dominance(counts)
+
+
+def _esty_ci(counts):
+    counts = _validate_counts_vector(counts)
+
+    f1 = skbio.diversity.alpha.singles(counts)
+    f2 = skbio.diversity.alpha.doubles(counts)
+    n = counts.sum()
+    z = 1.959963985
+    W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)
+
+    return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W)
+
+
+def _goods_coverage(counts):
+    counts = _validate_counts_vector(counts)
+    f1 = skbio.diversity.alpha.singles(counts)
+    N = counts.sum()
+    return 1 - (f1 / N)
+
+
+def _margalef(counts):
+    counts = _validate_counts_vector(counts)
+    # replaced observed_otus call with sobs
+    return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum())
+
+
+def _mcintosh_d(counts):
+    counts = _validate_counts_vector(counts)
+    u = np.sqrt((counts * counts).sum())
+    n = counts.sum()
+    return (n - u) / (n - np.sqrt(n))
+
+
+def _strong(counts):
+    counts = _validate_counts_vector(counts)
+    n = counts.sum()
+    # replaced observed_otus call with sobs
+    s = skbio.diversity.alpha.sobs(counts)
+    i = np.arange(1, len(counts) + 1)
+    sorted_sum = np.sort(counts)[::-1].cumsum()
+    return (sorted_sum / n - (i / s)).max()
+
+
+def _p_evenness(counts):
+    counts = _validate_counts_vector(counts)
+    return _shannon(counts, base=np.e) / np.log(
+        skbio.diversity.alpha.sobs(counts=counts))
+
+
+def _shannon(counts, base=2):
+    counts = _validate_counts_vector(counts)
+    freqs = counts / counts.sum()
+    nonzero_freqs = freqs[freqs.nonzero()]
+    return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base)
diff --git a/q2_diversity_lib/beta.py b/q2_diversity_lib/beta.py
index 07a0304..7c01cea 100644
--- a/q2_diversity_lib/beta.py
+++ b/q2_diversity_lib/beta.py
@@ -38,7 +38,7 @@
     'IMPL': {'braycurtis', 'jaccard'},
     'UNIMPL': {'cityblock', 'euclidean', 'seuclidean', 'sqeuclidean',
                'cosine', 'correlation', 'hamming', 'chebyshev', 'canberra',
-               'yule', 'matching', 'dice', 'kulsinski',
+               'yule', 'matching', 'dice',
                'rogerstanimoto', 'russellrao', 'sokalmichener', 'sokalsneath',
                'minkowski', 'aitchison', 'canberra_adkins', 'jensenshannon'}
diff --git a/q2_diversity_lib/examples.py b/q2_diversity_lib/examples.py
index b7ecb75..ea1048c 100644
--- a/q2_diversity_lib/examples.py
+++ b/q2_diversity_lib/examples.py
@@ -322,8 +322,8 @@ def beta_passthrough_n_jobs_example(use):
     result, = use.action(
         use.UsageAction(plugin_id='diversity_lib',
                         action_id='beta_passthrough'),
-        use.UsageInputs(table=ft, metric='kulsinski', n_jobs=1),
-        use.UsageOutputNames(distance_matrix='kulsinski_dm')
+        use.UsageInputs(table=ft, metric='euclidean', n_jobs=1),
+        use.UsageOutputNames(distance_matrix='euclidean_dm')
     )
     result.assert_output_type('DistanceMatrix')
diff --git a/q2_diversity_lib/tests/test_alpha.py b/q2_diversity_lib/tests/test_alpha.py
index 51e288b..3267535 100644
--- a/q2_diversity_lib/tests/test_alpha.py
+++ b/q2_diversity_lib/tests/test_alpha.py
@@ -9,6 +9,7 @@
 from subprocess import CalledProcessError
 
 import numpy as np
+import numpy.testing as npt
 import pandas as pd
 import pandas.testing as pdt
 import biom
@@ -17,7 +18,12 @@
 from qiime2 import Artifact
 
 from ..alpha import (pielou_evenness, observed_features,
-                     shannon_entropy, METRICS)
+                     shannon_entropy, METRICS,
+                     _berger_parker, _brillouin_d,
+                     _simpsons_dominance, _esty_ci,
+                     _goods_coverage, _margalef,
+                     _mcintosh_d, _strong)
 
 
 class SmokeTests(TestPluginBase):
@@ -154,7 +160,9 @@ def test_drop_undefined_samples(self):
                                 [0, 0, 0, 1, 0, 1]]),
                                ['A', 'B', 'C'],
                                ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
-        expected = pd.Series({'S5': 1, 'S6': 1}, name='pielou_evenness')
+        # pandas supports floating-point tolerance for float dtypes only,
+        # so these 1 ints were changed to 1.0 floats to get that support
+        expected = pd.Series({'S5': 1.0, 'S6': 1.0}, name='pielou_evenness')
         actual = pielou_evenness(table=NaN_table, drop_undefined_samples=True)
         pdt.assert_series_equal(actual, expected, check_dtype=False)
 
@@ -250,3 +258,68 @@ def test_passed_implemented_metric(self):
         for metric in METRICS['NONPHYLO']['IMPL']:
             with self.assertRaisesRegex(TypeError, f"{metric}.*incompatible"):
                 self.method(table=self.crawford_tbl, metric=metric)
+
+    # tests for passthrough metrics were sourced from skbio
+    def test_berger_parker_d(self):
+        self.assertEqual(_berger_parker(np.array([5, 5])), 0.5)
+        self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25)
+
+    def test_brillouin_d(self):
+        self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])),
+                               0.86289353018248782)
+
+    def test_esty_ci(self):
+        def _diversity(indices, f):
+            """Calculate diversity index for each window of size 1.
+
+            indices: vector of indices of taxa
+            f: f(counts) -> diversity measure
+            """
+            result = []
+            max_size = max(indices) + 1
+            freqs = np.zeros(max_size, dtype=int)
+            for i in range(len(indices)):
+                freqs += np.bincount(indices[i:i + 1], minlength=max_size)
+                try:
+                    curr = f(freqs)
+                except (ZeroDivisionError, FloatingPointError):
+                    curr = 0
+                result.append(curr)
+            return np.array(result)
+
+        data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]
+
+        observed_lower, observed_upper = zip(*_diversity(data, _esty_ci))
+
+        expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
+                                   -0.15060902, -0.04386191, -0.33042054,
+                                   -0.29041008, -0.43554755, -0.33385652])
+        expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
+                                   0.55060902, 0.71052858, 0.61613483,
+                                   0.54041008, 0.43554755, 0.53385652])
+
+        npt.assert_array_almost_equal(observed_lower, expected_lower)
+        npt.assert_array_almost_equal(observed_upper, expected_upper)
+
+    def test_simpson(self):
+        self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])),
+                               0.66)
+        self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0)
+
+    def test_goods_coverage(self):
+        counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4]
+        obs = _goods_coverage(counts)
+        self.assertAlmostEqual(obs, 0.23469387755)
+
+    def test_margalef(self):
+        self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])),
+                         8 / np.log(22))
+
+    def test_mcintosh_d(self):
+        self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])),
+                               0.636061424871458)
+
+    def test_strong(self):
+        self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)
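A quick way to sanity-check the vendored helpers above is to compare them
against skbio's public implementations. The following is a minimal sketch,
not part of the diff; it assumes q2_diversity_lib is installed alongside a
skbio version that exposes these metrics, and `counts` is an arbitrary
example vector.

    import numpy as np
    import numpy.testing as npt
    import skbio.diversity.alpha as sk_alpha
    from q2_diversity_lib.alpha import (_berger_parker, _brillouin_d,
                                        _margalef, _mcintosh_d, _strong)

    counts = np.array([1, 2, 0, 0, 3, 1])
    # each vendored helper should agree with its skbio counterpart
    # up to floating-point error
    for ours, theirs in [(_berger_parker, sk_alpha.berger_parker_d),
                         (_brillouin_d, sk_alpha.brillouin_d),
                         (_margalef, sk_alpha.margalef),
                         (_mcintosh_d, sk_alpha.mcintosh_d),
                         (_strong, sk_alpha.strong)]:
        npt.assert_allclose(ours(counts), theirs(counts))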
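The rewritten per-sample loops can also be exercised end to end. A minimal
sketch, assuming a 2-observation x 2-sample biom.Table constructed the same
way as in the tests above; `alpha_passthrough` with 'margalef' takes the new
method_map branch rather than dispatching to skbio.

    import numpy as np
    import biom
    from q2_diversity_lib.alpha import (alpha_passthrough, pielou_evenness,
                                        shannon_entropy)

    table = biom.Table(np.array([[1, 3], [2, 1]]), ['A', 'B'], ['S1', 'S2'])
    print(shannon_entropy(table=table))   # base-2 entropy, one value per sample
    print(pielou_evenness(table=table))   # base-e entropy / ln(observed features)
    print(alpha_passthrough(table=table, metric='margalef'))  # in-house path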