From 7ba35baf0d40b8b9d9d5fa22ffe48b6fb11bc7d3 Mon Sep 17 00:00:00 2001 From: Dougie Squire <42455466+dougiesquire@users.noreply.github.com> Date: Fri, 25 Sep 2020 04:52:25 +1000 Subject: [PATCH] Make `bins` definition consistent with `np.histogram` (#20) * force last bin to be right edge inclusive * added explanation to docstrings --- doc/contributing.rst | 6 ++++++ xhistogram/core.py | 12 ++++++++++++ xhistogram/test/test_core.py | 21 +++++++++++++++++++++ xhistogram/xarray.py | 3 +++ 4 files changed, 42 insertions(+) diff --git a/doc/contributing.rst b/doc/contributing.rst index cc351d3..80841b5 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -8,6 +8,12 @@ GitHub repo: `https://github.com/xgcm/xhistogram `_. + v0.1.1 ~~~~~~ diff --git a/xhistogram/core.py b/xhistogram/core.py index 9f992c2..559d1a2 100644 --- a/xhistogram/core.py +++ b/xhistogram/core.py @@ -107,6 +107,15 @@ def _histogram_2d_vectorized(*args, bins=None, weights=None, density=False, # https://github.com/numpy/numpy/blob/9c98662ee2f7daca3f9fae9d5144a9a8d3cabe8c/numpy/lib/histograms.py#L864-L882 # for now we stick with `digitize` because it's easy to understand how it works + # Add small increment to the last bin edge to make the final bin right-edge inclusive + # Note, this is the approach taken by sklearn, e.g. + # https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/calibration.py#L592 + # but a better approach would be to use something like _search_sorted_inclusive() in + # numpy histogram. This is an additional motivation for moving to searchsorted + bins = [np.concatenate(( + b[:-1], + b[-1:] + 1e-8)) for b in bins] + # the maximum possible value of of digitize is nbins # for right=False: # - 0 corresponds to a < b[0] @@ -154,6 +163,9 @@ def histogram(*args, bins=None, axis=None, weights=None, density=False, * A combination [int, array] or [array, int], where int is the number of bins and array is the bin edges. + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + A ``TypeError`` will be raised if ``args`` contains dask arrays and ``bins`` are not specified explicitly as a list of arrays. axis : None or int or tuple of ints, optional diff --git a/xhistogram/test/test_core.py b/xhistogram/test/test_core.py index a5b0a05..e4802fb 100644 --- a/xhistogram/test/test_core.py +++ b/xhistogram/test/test_core.py @@ -79,6 +79,27 @@ def test_histogram_results_1d_weighted_broadcasting(block_size): np.testing.assert_array_equal(2*h, h_w) +@pytest.mark.parametrize('block_size', [None, 1, 2]) +def test_histogram_right_edge(block_size): + """Test that last bin is both left- and right-edge inclusive as it + is for numpy.histogram + """ + nrows, ncols = 5, 20 + data = np.ones((nrows, ncols)) + bins = np.array([0, 0.5, 1]) # All data at rightmost edge + + h = histogram(data, bins=bins, axis=1, block_size=block_size) + assert h.shape == (nrows, len(bins)-1) + + # make sure we get the same thing as histogram (all data in the last bin) + hist, _ = np.histogram(data, bins=bins) + np.testing.assert_array_equal(hist, h.sum(axis=0)) + + # now try with no axis + h_na = histogram(data, bins=bins, block_size=block_size) + np.testing.assert_array_equal(hist, h_na) + + def test_histogram_results_2d(): nrows, ncols = 5, 20 data_a = np.random.randn(nrows, ncols) diff --git a/xhistogram/xarray.py b/xhistogram/xarray.py index cd9b65f..2dc8ba9 100644 --- a/xhistogram/xarray.py +++ b/xhistogram/xarray.py @@ -31,6 +31,9 @@ def histogram(*args, bins=None, dim=None, weights=None, density=False, * A combination [int, array] or [array, int], where int is the number of bins and array is the bin edges. + When bin edges are specified, all but the last (righthand-most) bin include + the left edge and exclude the right edge. The last bin includes both edges. + A ``TypeError`` will be raised if ``args`` contains dask arrays and ``bins`` are not specified explicitly as a list of arrays. dim : tuple of strings, optional