diff --git a/estimage/statops/compare.py b/estimage/statops/compare.py new file mode 100644 index 0000000..31520eb --- /dev/null +++ b/estimage/statops/compare.py @@ -0,0 +1,71 @@ +import numpy as np +import scipy as sp + + +# \int_{-\inf}^{\inf}l(x)\int_{x}^{\inf}g(t)\, \textup{d}t\, \textup{d}x +# where l is the function that we think should be lower, and g the greater +def _integrate(dom, one, two): + res = 0 + for i, x in enumerate(dom): + toadd = two[i] * 0.5 + toadd += sum(two[i + 1:]) + toadd *= one[i] + res += toadd + return res + + +def _integrate_estimate_both_degenerate(one, two): + if one.expected == two.expected: + return 0.5 + if one.expected < two.expected: + return 1 + return 0 + + +def _integrate_estimate_second_degenerate(one, two): + threshold = two.expected + rv = one._get_rv() + return rv.cdf(threshold) + + +def _integrate_estimate_first_degenerate(one, two): + threshold = one.expected + rv = two._get_rv() + return 1 - rv.cdf(threshold) + + +INTEGRATION_PRECISION = 0.001 + + +# \int_{-\inf}^{\inf}l(x)\int_{x}^{\inf}g(t)\, \textup{d}t\, \textup{d}x is in fact +# \int_{-\inf}^{\inf}l(x)\, \textup{d}x - \int_{-\inf}^{\inf}l(x)G(x)\, \textup{d}x = +# = 1 - \int_{-\inf}^{\inf}l(x)G(x)\, \textup{d}x where +# G(x) is distribution function of the g randvar +def _integrate_estimate(one, two): + # unlike upper bound case, PDF and CDF are zero below their respective lower bound + effective_lower_bound = max(one.source.optimistic, two.source.optimistic) + safe_upper_bound = max(one.source.pessimistic, two.source.pessimistic) + lower_rv = one._get_rv() + upper_rv = two._get_rv() + def integrand(x): return lower_rv.pdf(x) * upper_rv.cdf(x) + result = sp.integrate.quad(integrand, effective_lower_bound, safe_upper_bound, epsabs=INTEGRATION_PRECISION) + return 1 - result[0] + + +def is_lower(dom, one, two): + dom = np.array(dom, dtype=float) + one = np.array(one, dtype=float) + one /= one.sum() + two = np.array(two, dtype=float) + two /= two.sum() + return _integrate(dom, one, two) + + +def estimate_is_lower(one, two): + if one.sigma == 0 and two.sigma == 0: + return _integrate_estimate_both_degenerate(one, two) + if two.sigma == 0: + return _integrate_estimate_second_degenerate(one, two) + if one.sigma == 0: + return _integrate_estimate_first_degenerate(one, two) + return _integrate_estimate(one, two) diff --git a/tests/test_statops_compare.py b/tests/test_statops_compare.py new file mode 100644 index 0000000..70bf8fe --- /dev/null +++ b/tests/test_statops_compare.py @@ -0,0 +1,76 @@ +import estimage.statops.compare as tm +import estimage.data as data + +import pytest + + +def montecarlo_is_lower_test(one, two, samples=1000): + rvs_one, rvs_two = [est._get_rv().rvs(samples) for est in (one, two)] + return sum(rvs_one < rvs_two) / samples + + +def integ_approx(x): + return pytest.approx(x, abs=tm.INTEGRATION_PRECISION) + + +def test_compare_trivial(): + assert tm.is_lower([0], [1], [1]) == 0.5 + degenerate_estimate = data.Estimate(1, 0) + assert tm.estimate_is_lower(degenerate_estimate, degenerate_estimate) == 0.5 + + +def test_compare_equal(): + assert tm.is_lower([0, 1], [1, 1], [1, 1]) == 0.5 + assert tm.is_lower([0, 1, 2], [1, 1, 1], [1, 1, 1]) == 0.5 + assert tm.is_lower([0, 1, 2, 3], [1, 1, 1, 1], [1, 1, 1, 1]) == 0.5 + assert tm.is_lower([0, 1, 2], [0, 1, 0], [1, 1, 1]) == 0.5 + simple_estimate = data.Estimate.from_triple(1, 0, 2) + assert tm.estimate_is_lower(simple_estimate, simple_estimate) == 0.5 + concrete_estimate = data.Estimate.from_triple(2, 1, 3) + assert tm.estimate_is_lower(concrete_estimate, concrete_estimate) == 0.5 + assert montecarlo_is_lower_test(concrete_estimate, concrete_estimate) == pytest.approx(0.5, abs=0.05) + + +def test_compare_clear(): + assert tm.is_lower([0, 1], [0, 1], [1, 0]) == 0 + assert tm.is_lower([0, 1], [1, 0], [0, 1]) == 1 + assert tm.estimate_is_lower(data.Estimate(1, 0), data.Estimate(0, 0)) == 0 + assert tm.estimate_is_lower(data.Estimate(0, 0), data.Estimate(1, 0)) == 1 + assert tm.is_lower([0, 1, 2, 3], [1, 1, 0, 0], [0, 0, 1, 1]) == 1 + assert tm.is_lower([0, 1, 2, 3], [0, 0, 1, 1], [1, 1, 0, 0]) == 0 + low_estimate = data.Estimate.from_triple(1, 0, 3) + high_estimate = data.Estimate.from_triple(4, 3, 6) + assert tm.estimate_is_lower(low_estimate, high_estimate) == integ_approx(1) + assert tm.estimate_is_lower(high_estimate, low_estimate) == integ_approx(0) + + +def test_compare_overlapping(): + low_estimate = data.Estimate.from_triple(1, 0, 3) + test_estimate = data.Estimate(1, 0) + assert 0 < tm.estimate_is_lower(low_estimate, test_estimate) < 0.5 + assert 0.5 < tm.estimate_is_lower(test_estimate, low_estimate) < 1 + almost_low_estimate = data.Estimate.from_triple(1.1, 0.1, 3.1) + assert 0.5 < tm.estimate_is_lower(low_estimate, almost_low_estimate) < 0.6 + low_end_estimate = data.Estimate.from_triple(0.5, 0, 1) + assert 0.1 < tm.estimate_is_lower(low_estimate, low_end_estimate) < 0.2 + rate = tm.estimate_is_lower(low_end_estimate, low_estimate) + assert 0.8 < rate < 0.9 + assert montecarlo_is_lower_test(low_end_estimate, low_estimate) == pytest.approx(rate, abs=0.05) + + +def test_compare_general(): + one = data.Estimate.from_triple(2, 1, 3) + two = data.Estimate.from_triple(1.5, -2, 4) + rate = tm.estimate_is_lower(one, two) + assert montecarlo_is_lower_test(one, two, 4000) == pytest.approx(rate, abs=0.01) + one = data.Estimate.from_triple(2.5, 1, 3) + two = data.Estimate.from_triple(1.5, 1, 4) + rate = tm.estimate_is_lower(one, two) + assert montecarlo_is_lower_test(one, two) == pytest.approx(rate, abs=0.05) + + +def test_compare_weighted(): + assert 0.99 < tm.is_lower([0, 1, 2, 3], [1, 1, 0, 0], [0, 0.01, 1, 1]) < 1 + +# TODO: Comparison of estimates +# TODO: %-complete