[sliq] presorted vectors

zouzias · Oct 20, 2019 · 604b31b · 604b31b
1 parent d5d5a91
commit 604b31b
Show file tree

Hide file tree

Showing 21 changed files with 671 additions and 333 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -68,7 +68,5 @@ include_directories( ${EIGEN3_INCLUDE_DIRS} )
 add_subdirectory(pybind11)
 
 pybind11_add_module(microgbtpy src/metrics/metric.h src/trees/tree.h src/GBT.h src/dataset.h src/metrics/logloss.h
-        src/trees/treenode.h src/trees/split_info.h
-        src/utils.h
-        src/python_api.cpp)
+        src/trees/treenode.h src/python_api.cpp)
 #########################
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '3'
+
+services:
+  microgbt:
+    image: microgbt:0.0.1
+    build:
+      dockerfile: docker/Dockerfile
+      context: ./
+    volumes:
+      - .:/home/ubuntu
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -0,0 +1,3 @@
+FROM zouzias/boost:1.71.0
+
+RUN apt-get -yq update && apt-get -yq install vim cmake python3-pip libeigen3-dev
diff --git a/examples/test-boston.py b/examples/test-boston.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import microgbtpy
 from math import sqrt
 import logging.config

diff --git a/examples/test-flight.py b/examples/test-flight.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import microgbtpy
 import logging.config
 import numpy as np

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
-add_library(microgbt "" metrics/metric.h trees/tree.h trees/split_info.h
+add_library(microgbt "" metrics/metric.h trees/tree.h
         GBT.h dataset.h metrics/logloss.h trees/treenode.h metrics/rmse.h
-        utils.h trees/numerical_splliter.h trees/splitter.h)
+        utils.h trees/class_list.h trees/tree_builder_state.h)
 
 
 target_sources(

diff --git a/src/GBT.h b/src/GBT.h
@@ -2,6 +2,7 @@
 #include <vector>
 #include<iostream>
 #include <memory>
+#include <numeric>
 #include "dataset.h"
 #include "trees/tree.h"
 #include "metrics/metric.h"
@@ -28,17 +29,16 @@ namespace microgbt {
         /**
          * Return a single decision tree given training data, gradient, Hessian and shrinkage rate
          *
-         * @param trainSet
+         * @param dataset
          * @param gradient
          * @param hessian
          * @param shrinkageRate
          */
-        Tree buildTree(const Dataset &trainSet, const Vector& previousPreds, const Vector &gradient,
-                const Vector &hessian, double shrinkageRate) const {
-
+        Tree buildTree(const Dataset &dataset, const Vector &gradient, const Vector &hessian,
+                double shrinkageRate) const {
 
             Tree tree = Tree(_lambda, _minSplitGain, _minTreeSize, _maxDepth);
-            tree.build(trainSet, previousPreds, gradient, hessian, shrinkageRate);
+            tree.build(dataset, gradient, hessian, shrinkageRate);
             return tree;
         }
 
@@ -128,16 +128,21 @@ namespace microgbt {
                 auto startTimestamp = std::chrono::high_resolution_clock::now();
 
                 // Current predictions
-                Vector scores = predictDataset(trainSet);
+                Vector predictions = predictDataset(trainSet);
 
                 // Compute gradient and Hessian with respect to prior predictions
                 std::cout << "[Computing gradients/Hessians vectors]" << std::endl;
-                Vector gradient = _metric->gradients(scores, trainSet.y());
-                Vector hessian = _metric->hessian(scores);
+                Vector gradient = _metric->gradients(predictions, trainSet.y());
+                Vector hessian = _metric->hessian(predictions);
+
+                std::cout<< "Predictions" << std::endl;
+                for (size_t i = 0; i < gradient.size(); i++){
+                    std::cout << predictions[i] << " - " << trainSet.y()[i] << " / ";
+                }
 
                 // Grow a new tree learner
                 std::cout << "[Building next tree...]" << std::endl;
-                Tree tree = buildTree(trainSet, scores, gradient, hessian, learningRate);
+                Tree tree = buildTree(trainSet, gradient, hessian, learningRate);
                 std::cout << "[Tree is built successfully]" << std::endl;
 
                 // Update the learning rate
@@ -158,7 +163,6 @@ namespace microgbt {
                 std::cout << "[Duration: " << duration << " millis] | [Train Loss]: " << trainLoss
                     << " | [Valid Loss]: " << bestValidationLoss <<std::endl;
 
-
                 // Update best iteration / best validation error
                 if (currentValidationLoss < bestValidationLoss) {
                     bestValidationLoss = currentValidationLoss;

diff --git a/src/dataset.h b/src/dataset.h
@@ -4,14 +4,14 @@
 #include <Eigen/Dense>
 #include <numeric>
 #include <memory>
-#include "trees/split_info.h"
+#include "utils.h"
 
 namespace microgbt {
 
     using Vector = std::vector<double>;
     using VectorT = std::vector<size_t>;
     using MatrixType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
-    using SortedMatrixType = Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
+    using SortedMatrixType = std::vector<Permutation>;
 
     /**
     * Represents a machine learning "design matrix" and target vector, (X, y)
@@ -33,25 +33,23 @@ namespace microgbt {
 
         SortedMatrixType _sortedMatrixIdx;
 
-        VectorT _rowIndices;
-
         /**
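          * Return the row indices that sort the given column in ascending order
          * @param colIndex Index of the column to sort
          * @return Row indices ordered by increasing column value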
          */
-         Eigen::VectorXi sortIndices(long colIndex) const{
+         VectorT sortIndices(long colIndex) const{
 
             // initialize original index locations
             Eigen::VectorXd v = col(colIndex);
             unsigned int n = v.size();
 
-            Eigen::VectorXi idx(n);
+            VectorT idx(n);
             // idx contains now 0,1,...,v.size() - 1
-            std::iota(idx.data(), idx.data() + idx.size(), 0);
+            std::iota(idx.begin(), idx.end(), 0);
 
             // sort indexes based on comparing values in v
-            std::sort(idx.data(), idx.data() + idx.size(),
+            std::sort(idx.begin(), idx.end(),
                  [&v](int i1, int i2) {return v[i1] < v[i2];});
 
             return idx;
@@ -62,104 +60,48 @@ namespace microgbt {
         Dataset() = default;
 
         Dataset(const MatrixType& X, const Vector &y):
-        _sortedMatrixIdx(X.rows(), X.cols()),
-        _rowIndices(y.size()){
+        _sortedMatrixIdx(X.cols()) {
             _X = std::make_shared<MatrixType>(X);
             _y = std::make_shared<Vector>(y);
-            // By default, all rows are included in the dataset
-            std::iota(_rowIndices.begin(), _rowIndices.end(), 0);
-
             for ( long j = 0; j < X.cols(); j++) {
-                _sortedMatrixIdx.col(j) = sortIndices(j);
+                _sortedMatrixIdx[j] = Permutation(sortIndices(j));
             }
         }
 
 
         Dataset(Dataset const &dataset) = default;
 
-        /**
-         * Construct a Dataset, given a binary split gain and lef/right side parameter
-         * @param dataset
-         * @param bestGain
-         * @param side
-         */
-        Dataset(Dataset const &dataset, const SplitInfo &bestGain, SplitInfo::Side side):
-                _X(dataset.X()),_y(dataset.yptr()) {
-
-            _X = dataset.X();
-            _y = dataset.yptr();
-
-            VectorT localIds;
-            if (side == SplitInfo::Side::Left) {
-                localIds = bestGain.getLeftLocalIds();
-            } else {
-                localIds = bestGain.getRightLocalIds();
-            }
-
-            _rowIndices = VectorT(localIds.size());
-            VectorT otherRowIndices = dataset.rowIter();
-            for (size_t i = 0 ; i < localIds.size(); i++) {
-                _rowIndices[i] = otherRowIndices[localIds[i]];
-            }
-
-            int rows = _rowIndices.size(), cols = dataset.numFeatures();
-
-            _sortedMatrixIdx = SortedMatrixType(rows, cols);
-
-            #pragma omp parallel for schedule(static)
-            for ( long j = 0; j < cols; j++) {
-                _sortedMatrixIdx.col(j) = sortIndices(j);
-            }
-        }
-
         inline size_t nRows() const {
-            return this->_rowIndices.size();
-        }
-
-        inline VectorT rowIter() const {
-            return _rowIndices;
+            return _X->rows();
         }
 
-        inline size_t numFeatures() const {
+        inline long numFeatures() const {
             return this->_X->cols();
         }
 
         inline std::shared_ptr<MatrixType> X() const {
             return _X;
         }
 
-        inline std::shared_ptr<Vector> yptr() const {
-            return _y;
-        }
-
         inline Vector y() const {
-            Vector proj(_rowIndices.size());
-            for (size_t i = 0; i < proj.size(); i++) {
-                proj[i] = (*_y)[_rowIndices[i]];
-            }
-            return proj;
+            return *_y;
         }
 
         inline Eigen::RowVectorXd row(long rowIndex) const {
-            return _X->row(_rowIndices[rowIndex]);
+            return _X->row(rowIndex);
         }
 
         inline Eigen::RowVectorXd col(long colIndex) const {
-            Eigen::RowVectorXd column(_rowIndices.size());
-            auto fullColumn = _X->col(colIndex);
-            for (size_t i = 0; i < _rowIndices.size(); i++) {
-                column[i] = fullColumn[_rowIndices[i]];
-            }
-            return column;
+           return _X->col(colIndex);
         }
 
         /**
          * Returns a sorted vector of indices corresponding to a column
          * @param colIndex Index of column
          * @return
          */
-        inline Eigen::RowVectorXi sortedColumnIndices(long colIndex) const {
-            return _sortedMatrixIdx.col(colIndex);
+        inline Permutation sortedColumnIndices(long colIndex) const {
+            return _sortedMatrixIdx[colIndex];
         }
     };
 }
diff --git a/src/metrics/logloss.h b/src/metrics/logloss.h
@@ -12,15 +12,14 @@ namespace microgbt {
     /**
      * Log loss metric
      *
-     * Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
+     * Negative Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
      */
     class LogLoss :
             public Metric {
 
     private:
         // Numerical tolerance on boundary of log(x) and log(1-x) function in range [0,1]
         double _eps;
-
     public:
 
         LogLoss() {
@@ -34,10 +33,10 @@ namespace microgbt {
          * @return
          */
         inline double clip(double value) const {
-            if ( value > 1 - _eps )
+            if (value > 1 - _eps)
                 return 1 - _eps;
 
-            if ( value < _eps)
+            if (value < _eps)
                 return _eps;
 
             return value;
@@ -85,4 +84,4 @@ namespace microgbt {
         }
 
     };
-}
+}
diff --git a/src/metrics/rmse.h b/src/metrics/rmse.h
@@ -44,4 +44,4 @@ namespace microgbt {
         }
     };
 
-} // namespace microgbt
+} // namespace microgbt
diff --git a/src/trees/class_list.h b/src/trees/class_list.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <set>
+
+namespace microgbt {
+
+    using NodeId = long;
+
+    /**
+     * ClassList: stores copies of the per-sample gradients and Hessians, one node id per sample, and a per-node set of sample indices recorded as left-subtree candidates. NOTE(review): this header includes only <set> but also uses std::map, std::vector and the Vector alias - presumably supplied by whichever header includes it first; confirm.
+     */
+    class ClassList {
+
+        Vector _gradients; // copy of the gradient vector passed to the constructor
+        Vector _hessians; // copy of the Hessian vector passed to the constructor
+        std::vector<NodeId> _nodeIds; // node id per sample, indexed by sample position
+
+        // Node index to set of left subtree candidate samples
+        std::map<NodeId, std::set<long>> _leftCandidateSamples;
+
+    public:
+        // Copies both input vectors and creates one node id per gradient entry, all set to 0.
+        explicit ClassList(const Vector &gradients, const Vector &hessians):
+        _gradients(gradients), _hessians(hessians),
+        _nodeIds(gradients.size()){
+            std::fill(_nodeIds.begin(), _nodeIds.end(), 0);
+        }
+        // Removes every recorded left-candidate set; node ids, gradients and Hessians are untouched.
+        void clean() {
+            _leftCandidateSamples.clear();
+        }
+        // Returns the node id stored for sample `index`; the index is not bounds-checked.
+        NodeId nodeAt(long index) const {
+            return _nodeIds[index];
+        }
+        // Records sample `index` as a left candidate of `nodeId`; operator[] creates the node's set on first use.
+        void appendSampleToLeftSubTree(NodeId nodeId, long index) {
+            _leftCandidateSamples[nodeId].insert(index);
+        }
+        // Overwrites the node id stored for `sampleIndex`; the index is not bounds-checked.
+        void updateNodeId(long sampleIndex, NodeId newNodeId) {
+            _nodeIds[sampleIndex] = newNodeId;
+        }
+        // Returns a copy of the left-candidate set of `nodeId`; operator[] inserts an empty set for an unseen node id.
+        std::set<long> getLeft(NodeId nodeId) {
+            return _leftCandidateSamples[nodeId];
+        }
+        // Number of left candidates recorded for `nodeId`; also inserts an empty set for an unseen node id.
+        long getLeftSize(NodeId nodeId) {
+            return _leftCandidateSamples[nodeId].size();
+        }
+        // Total number of samples minus the left-candidate count of `nodeId`. NOTE(review): subtracts from the global sample count, not the node's own sample count - confirm this is intended.
+        long getRightSize(NodeId nodeId) {
+            return (long)_gradients.size() - (long)_leftCandidateSamples[nodeId].size();
+        }
+
+    };
+} // namespace microgbt
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		FROM zouzias/boost:1.71.0

		RUN apt-get -yq update && apt-get -yq install vim cmake python3-pip libeigen3-dev
-Original file line number
+Diff line change
@@ Expand Up / @@ -44,4 +44,4 @@ namespace microgbt { @@
             }
         };
-    } // namespace microgbt
+    } // namespace microgbt