
Commit 8341ca8

Revert "switched to correct computation of variance"
This reverts commit 1b9e4f1.
1 parent 764fa87 commit 8341ca8

6 files changed: 81 additions, 63 deletions

src/MLearning.cpp

31 additions, 17 deletions
@@ -242,13 +242,11 @@ namespace prlearn {
 avg_t mean, old_mean;
 std::vector<qvar_t> sample_qvar;
 std::vector<qvar_t> old_var;
-avg_t svar, ovar;
-
 double fut = 0;
 for (auto& s : _samples) {
 auto best = minimize ? std::numeric_limits<double>::infinity() :
 -std::numeric_limits<double>::infinity();
-double squared = 0;
+double var = 0;
 if (s._size == 0 || s._cloud == 0 || discount == 0) {
 best = 0;
 } else {
@@ -257,10 +255,10 @@ namespace prlearn {
 auto c = clouds[s._cloud]._nodes[s._nodes[i]]._q.avg();
 fut = std::min(fut, c);
 if (c == best)
-squared = std::min(squared, clouds[s._cloud]._nodes[s._nodes[i]]._q.squared());
+var = std::min(var, clouds[s._cloud]._nodes[s._nodes[i]]._q._variance);
 else if ((c < best && minimize) || (c > best && !minimize)) {
 best = c;
-squared = clouds[s._cloud]._nodes[s._nodes[i]]._q.squared();
+var = clouds[s._cloud]._nodes[s._nodes[i]]._q._variance;
 }
 }
 }
@@ -271,14 +269,14 @@ namespace prlearn {
 best *= discount;
 // dont look too far into the future for the variance.
 // if we do, it will grow in horrible ways and be useless.
-squared *= std::min(0.5, discount);
+var *= std::min(0.5, discount);
 for (size_t d = 0; d < dimen; ++d) {
 if (s._variance) {
 auto v = s._variance[d];
 v.first.avg() += best;
 v.second.avg() += best;
-v.first.squared() = std::max(v.first.squared(), squared);
-v.second.squared() = std::max(v.second.squared(), squared);
+v.first._variance = std::max(v.first._variance, var);
+v.second._variance = std::max(v.second._variance, var);
 tmpq[d].first.addPoints(v.first.cnt(), v.first.avg());
 tmpq[d].second.addPoints(v.second.cnt(), v.second.avg());
 mean.addPoints(v.first.cnt(), v.first.avg());
@@ -290,8 +288,8 @@ namespace prlearn {
 auto v = s._old[d];
 v.first.avg() += best;
 v.second.avg() += best;
-v.first.squared() = std::max(v.first.squared(), squared);
-v.second.squared() = std::max(v.second.squared(), squared);
+v.first._variance = std::max(v.first._variance, var);
+v.second._variance = std::max(v.second._variance, var);
 old_mean.addPoints(v.first.cnt(), v.first.avg());
 old_mean.addPoints(v.second.cnt(), v.second.avg());
 old_var.push_back(v.first);
@@ -300,28 +298,44 @@ namespace prlearn {
 }
 }

-
+avg_t svar, ovar;
 auto vars = std::make_unique < avg_t[]>(dimen * 2);
 bool first = true;
 size_t dimcnt = 0;
 for (auto& s : sample_qvar) {
+{
+const auto dif = std::abs(s.avg() - mean._avg);
+const auto std = std::sqrt(s._variance);
+auto var = (std::pow(dif + std, 2.0) + std::pow(dif - std, 2.0)) / 2.0;
+svar.addPoints(s.cnt(), var);
+}
 auto id = dimcnt;
+auto dmin = tmpq[id].first.avg();
 if (!first) {
+dmin = tmpq[dimcnt].second.avg();
 id = dimen + dimcnt;
 }
-vars[id].addPoints(s.cnt(), s.squared());
+{
+const auto dif = std::abs(s.avg() - dmin);
+const auto std = std::sqrt(s._variance);
+auto var = (std::pow(dif + std, 2.0) + std::pow(dif - std, 2.0)) / 2.0;
+vars[id].addPoints(s.cnt(), var);
+}
 if (!first)
 dimcnt = (dimcnt + 1) % dimen;
 first = !first;
-svar.addPoints(s.cnt(), s.squared());
 }

-for (auto& s : old_var)
-ovar.addPoints(s.cnt(), s.squared());
+for (auto& s : old_var) {
+const auto dif = std::abs(s.avg() - old_mean._avg);
+const auto std = std::sqrt(s._variance);
+auto var = (std::pow(dif + std, 2.0) + std::pow(dif - std, 2.0)) / 2.0;
+ovar.addPoints(s.cnt(), var);
+}

 for (size_t i = 0; i < dimen; ++i) {
-tmpq[i].first.squared() = vars[i]._avg;
-tmpq[i].second.squared() = vars[i + dimen]._avg;
+tmpq[i].first._variance = vars[i]._avg;
+tmpq[i].second._variance = vars[i + dimen]._avg;
 }

 qvar_t nq(mean._avg, mean._cnt / (dimen * 2), svar._avg);
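Note: the combination (std::pow(dif + std, 2.0) + std::pow(dif - std, 2.0)) / 2.0 used in the added lines above, and again in SimpleMLearning.cpp and in qvar_t::approximate below, simplifies algebraically to dif*dif + variance: the second moment of a sample with standard deviation std, taken about a reference point that lies dif away from the sample's own mean. A minimal standalone check of that identity, not part of the commit (the constants are arbitrary):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
    const double variance = 2.25;             // assumed sample variance
    const double stddev = std::sqrt(variance);
    const double dif = 1.5;                   // distance from the reference mean
    // The combination used in MLearning.cpp / SimpleMLearning.cpp / qvar_t::approximate:
    const double shifted = (std::pow(dif + stddev, 2.0) + std::pow(dif - stddev, 2.0)) / 2.0;
    // ...equals the squared offset plus the variance.
    assert(std::abs(shifted - (dif * dif + variance)) < 1e-12);
    std::printf("%f == %f\n", shifted, dif * dif + variance);
    return 0;
}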

src/RefinementTree.cpp

3 additions, 3 deletions
@@ -69,7 +69,7 @@ namespace prlearn {
 return qvar_t(std::numeric_limits<double>::quiet_NaN(), 0, 0);
 auto n = _nodes[res->_nid].get_leaf(point, res->_nid, _nodes);
 auto& node = _nodes[n];
-return qvar_t(node._predictor._q.avg(), node._predictor._cnt, node._predictor._q.squared());
+return qvar_t(node._predictor._q.avg(), node._predictor._cnt, node._predictor._q._variance);
 }

 double RefinementTree::getBestQ(const double* point, bool minimization, size_t* next_labels, size_t n_labels) const {
@@ -231,12 +231,12 @@ namespace prlearn {
 if (nodes[slow]._predictor._q.cnt() == 0) {
 nodes[slow]._predictor._q.cnt() = 1;
 nodes[slow]._predictor._q.avg() = oq.avg();
-nodes[slow]._predictor._q.squared() = std::pow(oq.avg(), 2.0);
+nodes[slow]._predictor._q._variance = 0;
 }
 if (nodes[shigh]._predictor._q.cnt() == 0) {
 nodes[shigh]._predictor._q.cnt() = 1;
 nodes[shigh]._predictor._q.avg() = oq.avg();
-nodes[shigh]._predictor._q.squared() = std::pow(oq.avg(), 2.0);
+nodes[shigh]._predictor._q._variance = 0;
 }
 }
 nodes[shigh]._predictor._cnt = nodes[shigh]._predictor._q.cnt();

src/SimpleMLearning.cpp

2 additions, 2 deletions
@@ -110,14 +110,14 @@ namespace prlearn {
 for(auto& s : n._succssors)
 {
 const auto dif = std::abs(s._cost.avg() - nq._avg);
-const auto std = std::sqrt(s._cost.variance());
+const auto std = std::sqrt(s._cost._variance);
 auto var = (std::pow(dif + std, 2.0) + std::pow(dif - std, 2.0)) / 2.0;
 nv.addPoints(s._cost.cnt(), var);
 }
 n._q = qvar_t(nq._avg, nq._cnt, nv._avg);
 if ((minimization && n._q.avg() <= rq.avg()) ||
 (!minimization && n._q.avg() >= rq.avg())) {
-if(n._q.avg() != rq.avg() || n._q.variance() < rq.variance() || n._q.cnt() > rq.cnt())
+if(n._q.avg() != rq.avg() || n._q._variance < rq._variance || n._q.cnt() > rq.cnt())
 rq = n._q;
 }
 }

src/SimpleRegressor.h

1 addition, 1 deletion
@@ -47,7 +47,7 @@ namespace prlearn {
 auto res = std::lower_bound(std::begin(_labels), std::end(_labels), lf);

 if (res != std::end(_labels) && res->_label == label)
-return qvar_t{res->_value.avg(), (double)res->_cnt, res->_value.squared()};
+return qvar_t{res->_value.avg(), (double)res->_cnt, res->_value._variance};
 else
 return qvar_t{std::numeric_limits<double>::quiet_NaN(), 0, 0};
 }

src/structs.cpp

36 additions, 13 deletions
@@ -1,21 +1,21 @@
 /*
 * Copyright Peter G. Jensen
-*
+*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
-*
+*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
-*
+*
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-/*
+/*
 * File: structs.cpp
 * Author: Peter G. Jensen
 *
@@ -44,7 +44,7 @@ namespace prlearn {
 void qvar_t::print(std::ostream& stream) const {
 stream << "[";
 stream << (*(avg_t*)this);
-stream << ", " << variance() << "]";
+stream << ", " << _variance << "]";
 }

 std::ostream& operator<<(std::ostream& o, const qvar_t& v) {
@@ -59,25 +59,48 @@ namespace prlearn {
 return a;
 qvar_t res = a;
 res.addPoints(b._cnt, b._avg);
-res._sq = (a._sq * (a._cnt / res._cnt)) + (b._sq * (b._cnt / res._cnt));
+const auto adif = std::abs(res._avg - a._avg);
+const auto bdif = std::abs(res._avg - b._avg);
+const auto astd = std::sqrt(a._variance);
+const auto bstd = std::sqrt(b._variance);
+auto ca = std::pow(adif + astd, 2.0) + std::pow(adif - astd, 2.0);
+auto cb = std::pow(bdif + bstd, 2.0) + std::pow(bdif - bstd, 2.0);
+avg_t tmp;
+tmp.addPoints(a._cnt, ca / 2.0);
+tmp.addPoints(b._cnt, cb / 2.0);
+res._variance = tmp._avg;
 return res;
 }

 qvar_t& qvar_t::operator+=(double d) {
 assert(!std::isinf(d));
 avg_t::operator+=(d);
-auto diff = std::pow(d, 2.0) - _sq;
-_sq += diff / _cnt;
+auto nvar = std::pow(d - _avg, 2.0);
+assert(!std::isinf(nvar));
+if (_cnt == 1) _variance = nvar;
+else {
+nvar -= _variance;
+_variance += nvar / _cnt;
+}
 return *this;
 }

 void qvar_t::addPoints(double weight, double d) {
 assert(weight >= 0);
 assert(_cnt >= 0);
 if (weight == 0) return;
+auto oa = _avg;
 avg_t::addPoints(weight, d);
-auto diff = std::pow(d, 2.0) - _sq;
-_sq += diff * (weight / _cnt);
+auto nvar = std::abs((d - oa)*(d - _avg));
+assert(!std::isinf(nvar));
+if (_cnt == weight) _variance = nvar;
+else {
+nvar -= _variance;
+_variance += (nvar * weight) / _cnt;
+}
+assert(_variance >= 0);
+assert(!std::isnan(_variance));
+assert(!std::isinf(_variance));
 }

 double triangular_cdf(double mid, double width, double point) {
@@ -94,10 +117,10 @@ namespace prlearn {
 constexpr double minvar = 0.0001;
 if (std::min(a.cnt(), b.cnt()) <= 1)
 return;
-if (a.variance() == b.variance() && a.avg() == b.avg())
+if (a._variance == b._variance && a.avg() == b.avg())
 return;
-auto vara = std::max(minvar, a.variance());
-auto varb = std::max(minvar, b.variance());
+auto vara = std::max(minvar, a._variance);
+auto varb = std::max(minvar, b._variance);

 double tval = std::abs(a.avg() - b.avg()) / std::sqrt(((vara * a.cnt()) + (varb * b.cnt())) / (a.cnt() * b.cnt()));
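Note: the restored qvar_t::operator+= and qvar_t::addPoints keep _variance as a running, weight-blended estimate built from deviations around the running mean, instead of tracking the raw second moment _sq that they replace. A condensed standalone restatement of that update, folding the avg_t mean step from structs.h into one hypothetical running_var struct (illustration only, not the library's type):

#include <cmath>
#include <cstdio>
#include <initializer_list>

struct running_var {
    double cnt = 0, avg = 0, var = 0;
    void add(double weight, double d) {
        if (weight == 0) return;
        const double old_avg = avg;
        cnt += weight;
        avg += (d - avg) * weight / cnt;                    // weighted running mean (avg_t::addPoints)
        double nvar = std::abs((d - old_avg) * (d - avg));  // deviation term used by qvar_t::addPoints
        if (cnt == weight) var = nvar;                      // first weight: adopt the term directly
        else var += (nvar - var) * weight / cnt;            // otherwise blend in a weight-share of it
    }
};

int main() {
    running_var rv;
    for (double x : {1.0, 2.0, 4.0, 8.0})
        rv.add(1.0, x);
    std::printf("avg = %f, var = %f\n", rv.avg, rv.var);
    return 0;
}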
src/structs.h

8 additions, 27 deletions
@@ -1,21 +1,21 @@
 /*
 * Copyright Peter G. Jensen
-*
+*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
-*
+*
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
-*
+*
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-/*
+/*
 * File: structs.h
 * Author: Peter G. Jensen
 *
@@ -33,8 +33,6 @@
 #include <cassert>
 #include <vector>
 #include <ostream>
-#include <iostream>
-
 namespace prlearn {

 struct avg_t {
@@ -56,7 +54,7 @@ namespace prlearn {
 } else {
 _cnt += weight;
 double diff = d - _avg;
-_avg += diff * (weight / _cnt); // add only "share" of difference
+_avg += ((diff * weight) / (double) _cnt); // add only "share" of difference
 }
 assert(!std::isnan(_avg));
 }
@@ -98,14 +96,15 @@ namespace prlearn {

 qvar_t() = default;

-qvar_t(double d, double w, double squared) {
+qvar_t(double d, double w, double v) {
 _avg = d;
 _cnt = w;
-_sq = squared;
+_variance = v;
 };
 // this is a dirty hijack!
 qvar_t& operator+=(double d);
 void addPoints(double weight, double d);
+double _variance = 0;

 auto& avg() {
 return _avg;
@@ -128,24 +127,6 @@ namespace prlearn {
 }
 void print(std::ostream& stream) const;
 static qvar_t approximate(const qvar_t& a, const qvar_t& b);
-double variance() const {
-auto pow = std::pow(_avg, 2.0);
-if(pow >= _sq)
-return 0;
-auto var = std::sqrt(_sq - pow);
-return var;
-}
-
-double& squared() {
-return _sq;
-}
-
-const double& squared() const {
-return _sq;
-}
-
-private:
-double _sq = 0;
 };

 struct splitfilter_t {
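Note: in the header, the revert replaces the private running second moment _sq and the derived variance()/squared() accessors with a public _variance member that the call sites above read and write directly. The two representations agree at the seeding point used in RefinementTree.cpp: for a single sample at value avg, the old code stored _sq = avg^2, so variance() returned 0, which is exactly what the reverted code now writes as _variance = 0. A minimal side-by-side sketch (hypothetical old_rep/new_rep structs, not part of the commit):

#include <cassert>
#include <cmath>

// Pre-revert shape: running second moment plus a derived spread accessor
// (mirrors the removed qvar_t::variance()).
struct old_rep {
    double avg = 0, sq = 0;
    double variance() const {
        const double p = std::pow(avg, 2.0);
        return p >= sq ? 0.0 : std::sqrt(sq - p);
    }
};

// Post-revert shape: the variance is stored (and seeded) directly.
struct new_rep {
    double avg = 0;
    double variance = 0;
};

int main() {
    old_rep o; o.avg = 3.0; o.sq = o.avg * o.avg;  // seeded like the pre-revert RefinementTree.cpp
    new_rep n; n.avg = 3.0; n.variance = 0.0;      // seeded like the reverted code
    assert(o.variance() == 0.0 && n.variance == 0.0);
    return 0;
}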
