From e2e4119923075975b0174a50e500f84b6ebb0707 Mon Sep 17 00:00:00 2001
From: Anastasios Zouzias
Date: Sun, 4 Aug 2019 15:28:12 +0200
Subject: [PATCH] [sliq] presorted vectors

---
 CMakeLists.txt                 |   4 +-
 docker-compose.yml             |  10 +
 docker/Dockerfile              |   3 +
 examples/test-boston.py        |   2 +-
 examples/test-flight.py        |   2 +-
 src/CMakeLists.txt             |   4 +-
 src/GBT.h                      |  23 +-
 src/dataset.h                  |  87 ++------
 src/metrics/logloss.h          |   9 +-
 src/metrics/rmse.h             |   2 +-
 src/trees/class_list.h         |  58 +++++
 src/trees/split_info.h         |  92 --------
 src/trees/tree.h               | 210 ++++++++++++++++--
 src/trees/tree_builder_state.h |  42 ++++
 src/trees/treenode.h           | 375 ++++++++++++++++++++++++---------
 src/utils.h                    |  30 ++-
 test/CMakeLists.txt            |   2 +-
 test/test_dataset.cpp          |  19 --
 test/test_split_info.cpp       |   9 -
 test/test_treenode.cpp         |   2 +-
 test/test_utils.cpp            |  17 ++
 21 files changed, 671 insertions(+), 331 deletions(-)
 create mode 100644 docker-compose.yml
 create mode 100644 docker/Dockerfile
 create mode 100644 src/trees/class_list.h
 delete mode 100644 src/trees/split_info.h
 create mode 100644 src/trees/tree_builder_state.h
 delete mode 100644 test/test_split_info.cpp
 create mode 100644 test/test_utils.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b6d5e0..7e68858 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,5 @@ include_directories( ${EIGEN3_INCLUDE_DIRS} )
 add_subdirectory(pybind11)
 pybind11_add_module(microgbtpy src/metrics/metric.h src/trees/tree.h
         src/GBT.h src/dataset.h src/metrics/logloss.h
-        src/trees/treenode.h src/trees/split_info.h
-        src/utils.h
-        src/python_api.cpp)
+        src/trees/treenode.h src/python_api.cpp)
 #########################
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..37f11d1
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,10 @@
+version: '3'
+
+services:
+  microgbt:
+    image: microgbt:0.0.1
+    build:
+      dockerfile: docker/Dockerfile
+      context: ./
+    volumes:
+      - .:/home/ubuntu
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..7119ab6
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,3 @@
+FROM zouzias/boost:1.71.0
+
+RUN apt-get -yq update && apt-get -yq install vim cmake python3-pip libeigen3-dev
diff --git a/examples/test-boston.py b/examples/test-boston.py
index 402e2ed..9b92e39 100755
--- a/examples/test-boston.py
+++ b/examples/test-boston.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import microgbtpy
 from math import sqrt
 import logging.config
diff --git a/examples/test-flight.py b/examples/test-flight.py
index ecdaa2a..c58da56 100755
--- a/examples/test-flight.py
+++ b/examples/test-flight.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import microgbtpy
 import logging.config
 import numpy as np
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e6e131f..5a3ddf4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,6 @@
-add_library(microgbt "" metrics/metric.h trees/tree.h trees/split_info.h
+add_library(microgbt "" metrics/metric.h trees/tree.h
         GBT.h dataset.h metrics/logloss.h trees/treenode.h metrics/rmse.h
-        utils.h trees/numerical_splliter.h trees/splitter.h)
+        utils.h trees/class_list.h trees/tree_builder_state.h)
 
 
 target_sources(
diff --git a/src/GBT.h b/src/GBT.h
index b7f3363..b8af918 100644
--- a/src/GBT.h
+++ b/src/GBT.h
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 #include "dataset.h"
 #include "trees/tree.h"
 #include "metrics/metric.h"
@@ -28,17 +29,16 @@ namespace microgbt {
 
         /**
          * Return a single decision tree given training data, gradient, Hessian and shrinkage rate
          *
-         * @param trainSet
+         * @param dataset
          * @param gradient
         * @param hessian
         * @param shrinkageRate
         */
-        Tree buildTree(const Dataset &trainSet, const Vector& previousPreds, const Vector &gradient,
-                       const Vector &hessian, double shrinkageRate) const {
-
+        Tree buildTree(const Dataset &dataset, const Vector &gradient, const Vector &hessian,
+                       double shrinkageRate) const {
             Tree tree = Tree(_lambda, _minSplitGain, _minTreeSize, _maxDepth);
-            tree.build(trainSet, previousPreds, gradient, hessian, shrinkageRate);
+            tree.build(dataset, gradient, hessian, shrinkageRate);
             return tree;
         }
@@ -128,16 +128,21 @@
                 auto startTimestamp = std::chrono::high_resolution_clock::now();
 
                 // Current predictions
-                Vector scores = predictDataset(trainSet);
+                Vector predictions = predictDataset(trainSet);
 
                 // Compute gradient and Hessian with respect to prior predictions
                 std::cout << "[Computing gradients/Hessians vectors]" << std::endl;
-                Vector gradient = _metric->gradients(scores, trainSet.y());
-                Vector hessian = _metric->hessian(scores);
+                Vector gradient = _metric->gradients(predictions, trainSet.y());
+                Vector hessian = _metric->hessian(predictions);
+
+                std::cout << "Predictions" << std::endl;
+                for (size_t i = 0; i < gradient.size(); i++){
+                    std::cout << predictions[i] << " - " << trainSet.y()[i] << " / ";
+                }
 
                 // Grow a new tree learner
                 std::cout << "[Building next tree...]" << std::endl;
-                Tree tree = buildTree(trainSet, scores, gradient, hessian, learningRate);
+                Tree tree = buildTree(trainSet, gradient, hessian, learningRate);
                 std::cout << "[Tree is built successfully]" << std::endl;
 
                 // Update the learning rate
diff --git a/src/dataset.h b/src/dataset.h
index 3cb5c10..bcb5d8b 100644
--- a/src/dataset.h
+++ b/src/dataset.h
@@ -4,14 +4,14 @@
 #include
 #include
 #include
-#include "trees/split_info.h"
+#include "utils.h"
 
 namespace microgbt {
 
     using Vector = std::vector<double>;
     using VectorT = std::vector<size_t>;
     using MatrixType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-    using SortedMatrixType = Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic>;
+    using SortedMatrixType = std::vector<Permutation>;
 
     /**
      * Represents a machine learning "design matrix" and target vector, (X, y)
      */
     class Dataset {
 
         SortedMatrixType _sortedMatrixIdx;
 
-        VectorT _rowIndices;
-
         /**
          * Return sorted indices from an Eigen vector
          * @param v
          * @return
         */
-        Eigen::VectorXi sortIndices(long colIndex) const{
+        VectorT sortIndices(long colIndex) const{
 
             // initialize original index locations
             Eigen::VectorXd v = col(colIndex);
             unsigned int n = v.size();
-            Eigen::VectorXi idx(n);
+            VectorT idx(n);
             // idx contains now 0,1,...,v.size() - 1
-            std::iota(idx.data(), idx.data() + idx.size(), 0);
+            std::iota(idx.begin(), idx.end(), 0);
 
             // sort indexes based on comparing values in v
-            std::sort(idx.data(), idx.data() + idx.size(),
+            std::sort(idx.begin(), idx.end(),
                       [&v](int i1, int i2) {return v[i1] < v[i2];});
 
             return idx;
         }
 
     public:
 
         Dataset() = default;
 
         Dataset(const MatrixType& X, const Vector &y):
-            _sortedMatrixIdx(X.rows(), X.cols()),
-            _rowIndices(y.size()){
+            _sortedMatrixIdx(X.cols()) {
 
             _X = std::make_shared<MatrixType>(X);
             _y = std::make_shared<Vector>(y);
 
-            // By default, all rows are included in the dataset
-            std::iota(_rowIndices.begin(), _rowIndices.end(), 0);
-
             for ( long j = 0; j < X.cols(); j++) {
-                _sortedMatrixIdx.col(j) = sortIndices(j);
+                _sortedMatrixIdx[j] = Permutation(sortIndices(j));
             }
         }
 
         Dataset(Dataset const &dataset) = default;
 
-        /**
-         * Construct a Dataset, given a binary split gain and left/right side parameter
-         * @param dataset
-         * @param bestGain
-         * @param side
-         */
-        Dataset(Dataset const &dataset, const SplitInfo &bestGain, SplitInfo::Side side):
-            _X(dataset.X()),_y(dataset.yptr()) {
-
-            _X = dataset.X();
-            _y = dataset.yptr();
-
-            VectorT localIds;
-            if (side == SplitInfo::Side::Left) {
-                localIds = bestGain.getLeftLocalIds();
-            } else {
-                localIds = bestGain.getRightLocalIds();
-            }
-
-            _rowIndices = VectorT(localIds.size());
-            VectorT otherRowIndices = dataset.rowIter();
-            for (size_t i = 0 ; i < localIds.size(); i++) {
-                _rowIndices[i] = otherRowIndices[localIds[i]];
-            }
-
-            int rows = _rowIndices.size(), cols = dataset.numFeatures();
-
-            _sortedMatrixIdx = SortedMatrixType(rows, cols);
-
-            for ( long j = 0; j < cols; j++) {
-                _sortedMatrixIdx.col(j) = sortIndices(j);
-            }
-        }
-
         inline size_t nRows() const {
-            return this->_rowIndices.size();
-        }
-
-        inline VectorT rowIter() const {
-            return _rowIndices;
+            return _X->rows();
         }
 
-        inline size_t numFeatures() const {
+        inline long numFeatures() const {
             return this->_X->cols();
         }
 
         inline std::shared_ptr<MatrixType> X() const {
             return _X;
         }
 
-        inline std::shared_ptr<Vector> yptr() const {
-            return _y;
-        }
-
         inline Vector y() const {
-            Vector proj(_rowIndices.size());
-            for (size_t i = 0; i < proj.size(); i++) {
-                proj[i] = (*_y)[_rowIndices[i]];
-            }
-            return proj;
+            return *_y;
         }
 
         inline Eigen::RowVectorXd row(long rowIndex) const {
-            return _X->row(_rowIndices[rowIndex]);
+            return _X->row(rowIndex);
         }
 
         inline Eigen::RowVectorXd col(long colIndex) const {
-            Eigen::RowVectorXd column(_rowIndices.size());
-            auto fullColumn = _X->col(colIndex);
-            for (size_t i = 0; i < _rowIndices.size(); i++) {
-                column[i] = fullColumn[_rowIndices[i]];
-            }
-            return column;
+            return _X->col(colIndex);
         }
 
         /**
          * @param colIndex Index of column
          * @return
          */
-        inline Eigen::RowVectorXi sortedColumnIndices(long colIndex) const {
-            return _sortedMatrixIdx.col(colIndex);
+        inline Permutation sortedColumnIndices(long colIndex) const {
+            return _sortedMatrixIdx[colIndex];
         }
     };
 }
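
Note on the change above: instead of re-sorting rows at every tree node, the Dataset now computes each feature's sort order once, up front, and wraps it in a Permutation (defined in src/utils.h further below) — the "pre-sorting" idea from SLIQ. A minimal, self-contained sketch of the argsort idiom used by sortIndices; the names here are illustrative, not the patch's code:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Illustrative standalone argsort: returns the indices that sort v ascending.
    std::vector<size_t> argsort(const std::vector<double> &v) {
        std::vector<size_t> idx(v.size());
        std::iota(idx.begin(), idx.end(), 0);          // 0, 1, ..., n-1
        std::sort(idx.begin(), idx.end(),
                  [&v](size_t i, size_t j) { return v[i] < v[j]; });
        return idx;
    }

    int main() {
        std::vector<double> feature{3.5, 1.2, 2.7};
        for (size_t i : argsort(feature))
            std::cout << i << " ";                     // prints: 1 2 0
        std::cout << std::endl;
        return 0;
    }

Computing this once per feature replaces the old per-node O(n log n) re-sort with a single up-front cost.
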
diff --git a/src/metrics/logloss.h b/src/metrics/logloss.h
index 9e84c3d..f9e13ee 100644
--- a/src/metrics/logloss.h
+++ b/src/metrics/logloss.h
@@ -12,7 +12,7 @@ namespace microgbt {
     /**
      * Log loss metric
      *
-     * Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
+     * Logistic loss (negative log-likelihood): y_i ln(1 + exp(-pred_i)) + (1 - y_i) ln(1 + exp(pred_i))
      */
     class LogLoss : public Metric {
 
@@ -20,7 +20,6 @@
     private:
         // Numerical tolerance on boundary of log(x) and log(1-x) function in range [0,1]
         double _eps;
-
     public:
 
         LogLoss() {
@@ -34,10 +33,10 @@
          * @return
          */
         inline double clip(double value) const {
-            if ( value > 1 - _eps )
+            if (value > 1 - _eps)
                 return 1 - _eps;
 
-            if ( value < _eps)
+            if (value < _eps)
                 return _eps;
 
             return value;
@@ -85,4 +84,4 @@
         }
     };
-}
\ No newline at end of file
+}
diff --git a/src/metrics/rmse.h b/src/metrics/rmse.h
index 5fef489..3858ce8 100644
--- a/src/metrics/rmse.h
+++ b/src/metrics/rmse.h
@@ -44,4 +44,4 @@ namespace microgbt {
 
         }
     };
-} // namespace microgbt
\ No newline at end of file
+} // namespace microgbt
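
For reference, the gradient and Hessian that LogLoss feeds to the tree builder (defined outside this hunk) follow from the loss above: with p = sigmoid(pred), the per-sample gradient is p - y and the Hessian is p * (1 - p). A small standalone check of those formulas — illustrative code, not the patch's:

    #include <cmath>
    #include <iostream>

    double sigmoid(double pred) { return 1.0 / (1.0 + std::exp(-pred)); }

    int main() {
        double pred = 0.3, y = 1.0;
        double p = sigmoid(pred);
        std::cout << "gradient = " << p - y << std::endl;         // negative: pushes prediction up
        std::cout << "hessian  = " << p * (1.0 - p) << std::endl; // always in (0, 0.25]
        return 0;
    }
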
diff --git a/src/trees/class_list.h b/src/trees/class_list.h
new file mode 100644
index 0000000..c313a24
--- /dev/null
+++ b/src/trees/class_list.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include
+
+namespace microgbt {
+
+    using NodeId = long;
+
+    /**
+     * ClassList (SLIQ): tracks, for each sample, the id of the tree node
+     * that the sample is currently assigned to
+     */
+    class ClassList {
+
+        Vector _gradients;
+        Vector _hessians;
+        std::vector<NodeId> _nodeIds;
+
+        // Node index to set of left subtree candidate samples
+        std::map<NodeId, std::set<long>> _leftCandidateSamples;
+
+    public:
+
+        explicit ClassList(const Vector &gradients, const Vector &hessians):
+            _gradients(gradients), _hessians(hessians),
+            _nodeIds(gradients.size()){
+            std::fill(_nodeIds.begin(), _nodeIds.end(), 0);
+        }
+
+        void clean() {
+            _leftCandidateSamples.clear();
+        }
+
+        NodeId nodeAt(long index) const {
+            return _nodeIds[index];
+        }
+
+        void appendSampleToLeftSubTree(NodeId nodeId, long index) {
+            _leftCandidateSamples[nodeId].insert(index);
+        }
+
+        void updateNodeId(long sampleIndex, NodeId newNodeId) {
+            _nodeIds[sampleIndex] = newNodeId;
+        }
+
+        std::set<long> getLeft(NodeId nodeId) {
+            return _leftCandidateSamples[nodeId];
+        }
+
+        long getLeftSize(NodeId nodeId) {
+            return _leftCandidateSamples[nodeId].size();
+        }
+
+        long getRightSize(NodeId nodeId) {
+            return (long)_gradients.size() - (long)_leftCandidateSamples[nodeId].size();
+        }
+
+    };
+} // namespace microgbt
\ No newline at end of file
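
A short usage sketch of the new ClassList, using only the methods defined above; it assumes src/ is on the include path and that dataset.h (which defines microgbt::Vector) is available:

    #include <cassert>
    #include "dataset.h"
    #include "trees/class_list.h"

    void classListSketch() {
        microgbt::Vector gradients{0.1, -0.2, 0.3};
        microgbt::Vector hessians{1.0, 1.0, 1.0};
        microgbt::ClassList classList(gradients, hessians);

        assert(classList.nodeAt(1) == 0);            // every sample starts at the root (node 0)
        classList.appendSampleToLeftSubTree(0, 1);   // stage sample 1 left of node 0
        assert(classList.getLeftSize(0) == 1);
        assert(classList.getRightSize(0) == 2);

        classList.updateNodeId(1, 7);                // after the split, sample 1 belongs to node 7
        assert(classList.nodeAt(1) == 7);

        classList.clean();                           // reset candidate sets before the next feature scan
    }
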
diff --git a/src/trees/split_info.h b/src/trees/split_info.h
deleted file mode 100644
index 18f0ea6..0000000
--- a/src/trees/split_info.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#pragma once
-#include
-#include
-#include
-
-
-namespace microgbt {
-
-    using VectorD = std::vector<double>;
-    using VectorT = std::vector<size_t>;
-
-    /**
-     * SplitInfo contains information of a binary tree split such as
-     * gain value, split numeric value on which best split gain is attained.
-     */
-    class SplitInfo {
-
-    private:
-
-        Eigen::RowVectorXi _sortedFeatureIndices;
-
-        /* Best gain and value (!?) */
-        double _bestGain = std::numeric_limits<double>::min(), _bestSplitNumericValue = 0.0;
-
-        size_t _bestSortedIndex = -1, _bestFeatureId = -1;
-
-    public:
-
-        enum Side{
-            Left,
-            Right
-        };
-
-        explicit SplitInfo() = default;
-
-        SplitInfo(double gain, double bestSplitNumericValue) {
-            _bestGain = gain;
-            _bestSplitNumericValue = bestSplitNumericValue;
-        }
-
-        SplitInfo(Eigen::RowVectorXi sortedFeatureIndices, double gain, double bestSplitNumericValue, size_t bestSortedIdx):
-            _sortedFeatureIndices(std::move(sortedFeatureIndices)) {
-            _bestGain = gain;
-            _bestSplitNumericValue = bestSplitNumericValue;
-            _bestSortedIndex = bestSortedIdx;
-        }
-
-        bool operator < (const SplitInfo& rhs) const { return this->_bestGain <= rhs.bestGain(); }
-
-        inline double bestGain() const {
-            return _bestGain;
-        }
-
-        inline double splitValue() const {
-            return _bestSplitNumericValue;
-        }
-
-        void setBestFeatureId(size_t bestFeatureId) {
-            _bestFeatureId = bestFeatureId;
-        }
-
-        inline size_t getBestFeatureId() const {
-            return _bestFeatureId;
-        }
-
-        VectorT getLeftLocalIds() const {
-            return VectorT(_sortedFeatureIndices.data(),
-                           _sortedFeatureIndices.data() + _bestSortedIndex);
-        }
-
-        VectorT getRightLocalIds() const {
-            return VectorT(_sortedFeatureIndices.data() + _bestSortedIndex,
-                           _sortedFeatureIndices.data() + _sortedFeatureIndices.size());
-        }
-
-        VectorD split(const VectorD &vector, const SplitInfo::Side &side) const {
-            VectorT rowIndices;
-            if (side == SplitInfo::Side::Left)
-                rowIndices = getLeftLocalIds();
-            else
-                rowIndices = getRightLocalIds();
-
-            VectorD splitVector;
-            std::transform(rowIndices.begin(), rowIndices.end(),
-                           std::back_inserter(splitVector),
-                           [&vector](size_t rowIndex){
-                               return vector[rowIndex];
-                           });
-
-            return splitVector;
-        }
-    };
-} // namespace microgbt
diff --git a/src/trees/tree.h b/src/trees/tree.h
index e352638..4bfdcf3 100644
--- a/src/trees/tree.h
+++ b/src/trees/tree.h
@@ -3,58 +3,219 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include "treenode.h"
-#include "split_info.h"
 #include "../metrics/metric.h"
+#include "class_list.h"
+#include "tree_builder_state.h"
 
 namespace microgbt {
 
+    using FeatureId = long;
+    using NodeId = long;
+    using GradientHessianPair = std::pair<double, double>;
+
+
     /**
-     * A Decision Tree with binary splits
+     * A binary decision tree that carries the gradient boosting statistics (gradients/Hessians) a la xgboost
      */
     class Tree {
 
-    private:
+        /** Maximum depth of tree */
         int _maxDepth;
 
+        long _minTreeSize;
+
         /** Gradient boosting parameters */
-        double _lambda, _minSplitGain, _minTreeSize;
+        double _lambda, _minSplitGain;
 
         /** Root of tree */
         std::shared_ptr<TreeNode> root;
 
+        std::vector<std::shared_ptr<TreeNode>> nodes;
+
+        /**
+         * Returns the pre-sorted sample indices (as a Permutation) for a given feature index 'featureId'.
+         *
+         * The sort order is computed once, in the Dataset constructor, and reused at every tree level.
+         *
+         * @param dataset Input design matrix and targets as Dataset
+         * @param featureId Feature / column of dataset
+         */
+        static Permutation sortSamplesByFeature(const Dataset &dataset,
+                                                int featureId) {
+
+            return dataset.sortedColumnIndices(featureId);
+        }
+
     public:
 
-        Tree(double lambda, double minSplitGain, double minTreeSize, int maxDepth) {
+        Tree(double lambda, double minSplitGain, size_t minTreeSize, int maxDepth):
+            nodes(0) {
             _lambda = lambda;
             _minSplitGain = minSplitGain;
             _maxDepth = maxDepth;
             _minTreeSize = minTreeSize;
         }
 
+        std::shared_ptr<TreeNode> newTreeNode(size_t size) {
+            long nodeId = nodes.size();
+            TreeNode node(nodeId, _lambda, size);
+            node.makeLeaf();
+            nodes.push_back(std::make_shared<TreeNode>(node));
+            return nodes[nodeId];
+        }
+
         /**
-         * Recursively (and greedily) build decision tree using 'optimal' binary splits
-         * based on gradient & Hessian vectors.
+         * Build a tree using 'optimal' binary splits
+         * based on gradient & Hessian vectors (following xgboost's formulation).
+         *
+         * The tree building approach is described in SLIQ: A fast scalable classifier for data mining
+         * (https://doi.org/10.1007/BFb0014141)
          *
-         * @param train_set Training dataset
-         * @param previousPreds Previous iteration predictions
-         * @param gradient Vector of gradients
-         * @param hessian Vector of second derivatives, Hessian
-         * @param shrinkage Shrinkage rate
+         * @param dataset Training dataset
+         * @param gradient Vector of gradients to fit on
+         * @param hessian Vector of second derivatives (Hessians) to fit on
+         * @param shrinkage Shrinkage additive boosting rate
          */
-        void build(const Dataset &trainSet, const Vector &previousPreds,
+        void build(const Dataset &dataset,
                    const Vector &gradient,
                    const Vector &hessian,
                    double shrinkage) {
-            this->root = std::unique_ptr<TreeNode>(new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
-            int depth = 0;
-            this->root->build(trainSet, previousPreds, gradient, hessian, shrinkage, depth);
+            long nSamples = dataset.nRows();
+            // Create the root node
+            root = newTreeNode(nSamples);
+
+            // SLIQ class list
+            ClassList classList(gradient, hessian);
+
+            // Sum the gradients / Hessians over all samples
+            double G = microgbt::sum(gradient);
+            double H = microgbt::sum(hessian);
+
+            // Root node setup: gradient/Hessian/weight
+            root->setLambda(_lambda);
+            root->setGradientSum(G);
+            root->setHessianSum(H);
+            root->setWeight(root->calc_leaf_weight());
+
+            // Build tree breadth first search (BFS) as in SLIQ,
+            // that is, go over each depth level of the tree
+            for (int depth = 0; depth < _maxDepth; depth++) {
+                std::cout << "[Working on depth '" << depth << "']" << std::endl;
+
+                // Go over all features to compute the optimal gain
+                for (FeatureId featureIdx = 0; featureIdx < dataset.numFeatures(); featureIdx++) {
+                    std::cout << "[Working on feature '" << featureIdx << "' out of " << dataset.numFeatures() << "]" << std::endl;
+
+                    // Instantiate the tree builder state (SLIQ)
+                    TreeBuilderState state(gradient, hessian);
+
+                    // Clean the list of candidate left indices per leaf node
+                    classList.clean();
+
+                    Permutation perm = sortSamplesByFeature(dataset, featureIdx);
+                    const Eigen::RowVectorXd featureValues = dataset.col(featureIdx);
+
+                    // Go over all pre-sorted sample indices: 'sampleIdx'
+                    for (NodeId i = 0; i < nSamples; i++) {
+                        size_t sampleIdx = perm(i);
+                        double sortedFeatureValue = featureValues[sampleIdx];
+                        double g = gradient[sampleIdx];
+                        double h = hessian[sampleIdx];
+                        NodeId leafId = classList.nodeAt(sampleIdx);
+
+                        // Calculate the gain of a possible split on leaf 'leafId';
+                        // if it improves on the best gain seen so far at 'leafId', mark it
+
+                        // Add gradient and Hessian to partial sums
+                        state.addToPartialSums(leafId, g, h);
+                        GradientHessianPair partialSums = state.partialSums(leafId);
+                        double gain = nodes[leafId]->calc_split_gain(partialSums.first, partialSums.second);
+
+                        // Assign sample with index sampleIdx to class list
+                        classList.appendSampleToLeftSubTree(leafId, sampleIdx);
+
+                        // If gain is better, mark it
+                        if (gain > nodes[leafId]->bestGain()
+                            && gain > 0
+                            && classList.getLeftSize(leafId) > _minTreeSize
+                            && classList.getRightSize(leafId) > _minTreeSize
+                            ) {
+                            nodes[leafId]->updateWeight(shrinkage);
+                            nodes[leafId]->setLeftSampleIds(classList.getLeft(leafId));
+                            nodes[leafId]->setBestSplitValue(sortedFeatureValue);
+                            nodes[leafId]->setBestGain(gain);
+                            nodes[leafId]->setBestSplitFeatureId(featureIdx);
+                            nodes[leafId]->setLeftGradientSum(partialSums.first);
+                            nodes[leafId]->setLeftHessianSum(partialSums.second);
+                        }
+                    }
+                }
+
+                // At this point, leaf nodes contain all information needed to decide
+                // whether a split will be performed or not
+
+                // Go over all leaves, say 'l', and
+                // create the left and right sub-trees based on the leaf info
+                size_t n = nodes.size();
+                for (size_t i = 0; i < n; i++) {
+
+                    if (nodes[i]->getSize() < _minTreeSize){
+                        nodes[i]->updateWeight(shrinkage);
+                        continue;
+                    }
+
+                    if (nodes[i]->getLeftSize() < _minTreeSize){
+                        nodes[i]->updateWeight(shrinkage);
+                        continue;
+                    }
+
+                    if (nodes[i]->getRightSize() < _minTreeSize) {
+                        nodes[i]->updateWeight(shrinkage);
+                        continue;
+                    }
+
+                    if (nodes[i]->bestGain() < _minSplitGain){
+                        nodes[i]->updateWeight(shrinkage);
+                        continue;
+                    }
+
+                    if (nodes[i]->isLeaf()) {
+                        nodes[i]->setLeftSubTree(newTreeNode(nodes[i]->getLeftSize()), shrinkage);
+                        nodes[i]->setRightSubTree(newTreeNode(nodes[i]->getRightSize()), shrinkage);
+                        nodes[i]->markInnerNode();
+                    }
+                }
+
+                // Go over all samples and update each one's node id:
+                // move it to the left or right child of its previous leaf
+                for (long i = 0; i < nSamples; i++) {
+                    NodeId leafId = classList.nodeAt(i);
+                    if (!nodes[leafId]->isLeaf()) {
+                        if (nodes[leafId]->isLeftAssigned(i)) {
+                            classList.updateNodeId(i, nodes[leafId]->getLeftSubTreeId());
+                        } else {
+                            classList.updateNodeId(i, nodes[leafId]->getRightSubTreeId());
+                        }
+                    }
+                }
+            }
+
+            // FIXME: remove those
+            // std::string output = toDigraph();
+            // stdout the tree structure
+            // std::cout << output;
         }
 
         /**
          * Return the score for a given sample, i.e. set of features
          *
          * @param sample
         * @return
         */
         double score(const Eigen::RowVectorXd &sample) const {
             return root->score(sample);
         }
+
+        /**
+         * Returns the tree represented as a directed graph using dot notation
+         *
+         * @return
+         */
+        std::string toDigraph() const {
+            std::stringstream ss;
+            ss << "digraph {" << std::endl;
+            for (auto& line: root->toDigraph()) {
+                ss << "\t" << line << ";" << std::endl;
+            }
+            ss << "}" << std::endl;
+            return ss.str();
+        }
     };
-} // namespace microgbt
\ No newline at end of file
+} // namespace microgbt
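
To summarize the scan in build() above: for one feature, samples are visited in presorted order while per-leaf prefix sums (G_l, H_l) accumulate, and every prefix is a candidate split. A stripped-down, single-node version of that scan — a hypothetical helper, not the patch's code; the real loop additionally routes samples through ClassList and TreeBuilderState:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Returns (best gain, split threshold) for one node and one feature.
    // 'order' is the presorted sample order for this feature; G/H are node totals.
    std::pair<double, double> bestSplit(const std::vector<double> &featureValues,
                                        const std::vector<size_t> &order,
                                        const std::vector<double> &gradient,
                                        const std::vector<double> &hessian,
                                        double G, double H, double lambda) {
        auto objective = [lambda](double g, double h) { return g * g / (h + lambda); };
        double Gl = 0, Hl = 0, bestGain = 0, bestValue = 0;
        for (size_t i : order) {
            Gl += gradient[i];
            Hl += hessian[i];
            // Gain of splitting here: left prefix vs. remaining right suffix (Eq 7 of xgboost)
            double gain = (objective(Gl, Hl) + objective(G - Gl, H - Hl) - objective(G, H)) / 2.0;
            if (gain > bestGain) {
                bestGain = gain;
                bestValue = featureValues[i];  // candidate split threshold
            }
        }
        return {bestGain, bestValue};
    }
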
diff --git a/src/trees/tree_builder_state.h b/src/trees/tree_builder_state.h
new file mode 100644
index 0000000..31551b6
--- /dev/null
+++ b/src/trees/tree_builder_state.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include
+#include
+
+namespace microgbt {
+
+    using NodeId = long;
+    using Vector = std::vector<double>;
+    using GradientHessianPair = std::pair<double, double>;
+
+    /**
+     * Maintains state (partial gradient / Hessian sums) over the tree building process
+     */
+    class TreeBuilderState {
+
+        std::shared_ptr<Vector> _gradient;
+        std::shared_ptr<Vector> _hessian;
+        std::map<NodeId, GradientHessianPair> _partialSums;
+
+    public:
+        TreeBuilderState(const Vector &gradient, const Vector &hessian) :
+                _gradient(std::make_shared<Vector>(gradient)),
+                _hessian(std::make_shared<Vector>(hessian)),
+                _partialSums() {
+        }
+
+        // Get partial gradient/Hessian sums per tree node
+        GradientHessianPair partialSums(NodeId nodeId) const {
+            if (_partialSums.find(nodeId) == _partialSums.end()){
+                return GradientHessianPair(0.0, 0.0);
+            }
+            else {
+                return _partialSums.at(nodeId);
+            }
+        }
+
+        void addToPartialSums(NodeId nodeId, double g, double h) {
+            GradientHessianPair sums = partialSums(nodeId);
+            _partialSums[nodeId] = GradientHessianPair(sums.first + g, sums.second + h);
+        }
+    };
+}
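
A sketch of how the builder exercises TreeBuilderState during the scan (assumes src/ is on the include path; the values are made up):

    #include <cassert>
    #include "trees/tree_builder_state.h"

    void stateSketch() {
        microgbt::Vector gradient{0.1, -0.2, 0.3};
        microgbt::Vector hessian{1.0, 1.0, 1.0};
        microgbt::TreeBuilderState state(gradient, hessian);

        assert(state.partialSums(0).first == 0.0);          // unseen node ids start at (0, 0)

        // Visiting samples 0 and 1 in sorted order accumulates the left-side sums
        state.addToPartialSums(0, gradient[0], hessian[0]);
        state.addToPartialSums(0, gradient[1], hessian[1]);
        assert(state.partialSums(0).second == 2.0);         // H_l after two samples
    }
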
diff --git a/src/trees/treenode.h b/src/trees/treenode.h
index 7acb310..2c559cf 100644
--- a/src/trees/treenode.h
+++ b/src/trees/treenode.h
@@ -1,6 +1,7 @@
 #pragma once
 #include
 #include
+#include
 #include
 #include
 #include
@@ -8,11 +9,10 @@
 #include
 #include
 #include
+#include
 #include "../dataset.h"
-#include "split_info.h"
-#include "../utils.h"
-#include "numerical_splliter.h"
+#include "class_list.h"
 
 namespace microgbt {
 
     /**
      */
     class TreeNode {
     private:
-        int _maxDepth;
-        double _lambda, _minSplitGain, _minTreeSize;
-        bool isLeaf = false;
-        std::unique_ptr<TreeNode> leftSubTree;
-        std::unique_ptr<TreeNode> rightSubTree;
-        long splitFeatureIndex;
+        NodeId _nodeId;
+        double _lambda;
+        std::shared_ptr<TreeNode> leftSubTree;
+        std::shared_ptr<TreeNode> rightSubTree;
+
+        // Feature id on which the best split happened on the current node
+        long _bestSplitFeatureId;
 
         /**
          * Numeric value on which a binary tree split took place
          */
-        double splitNumericValue;
-        double weight = 0.0;
+        double _bestSplitNumericValue, _weight = 0;
+
+        // Total gradient and Hessian sums at the current node
+        double _gradientSum, _hessianSum;
+
+        double _bestGain, _leftGradientSum, _leftHessianSum;
+
+        // Is TreeNode a leaf?
+        bool _isLeaf = true;
 
-        template<typename T, typename... Args>
-        std::unique_ptr<T> make_unique(Args&&... args)
-        {
-            return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-        }
+        // Number of samples assigned to the current node
+        long _size;
+
+        // Set of sample indices that correspond to the left subtree
+        std::set<long> _leftSampleIds;
+
+        explicit TreeNode(){
+            _nodeId = 0;
+            _lambda = 0;
+            _bestGain = std::numeric_limits<double>::min();
+            _bestSplitNumericValue = std::numeric_limits<double>::min();
+            _bestSplitFeatureId = -1;
+            _gradientSum = 0.0;
+            _hessianSum = 0.0;
+            _leftGradientSum = 0.0;
+            _leftHessianSum = 0.0;
+            _weight = 0.0;
+            _isLeaf = true;
+            leftSubTree = nullptr;
+            rightSubTree = nullptr;
+        }
 
     public:
 
-        TreeNode(double lambda, double minSplitGain, double minTreeSize, int maxDepth){
+        TreeNode(long nodeId, double lambda, size_t size){
+            _nodeId = nodeId;
+            _lambda = lambda;
+            _bestGain = std::numeric_limits<double>::lowest();
+            _bestSplitNumericValue = std::numeric_limits<double>::lowest();
+            _bestSplitFeatureId = -1;
+            _gradientSum = 0.0;
+            _hessianSum = 0.0;
+            _leftGradientSum = 0.0;
+            _leftHessianSum = 0.0;
+            _weight = 0.0;
+            _size = size;
+            _isLeaf = true;
+            leftSubTree = nullptr;
+            rightSubTree = nullptr;
+        }
+
+        TreeNode(const TreeNode &treeNode) {
+            _nodeId = treeNode._nodeId;
+            _lambda = treeNode._lambda;
+            _bestGain = treeNode._bestGain;
+            _bestSplitNumericValue = treeNode._bestSplitNumericValue;
+            _bestSplitFeatureId = treeNode._bestSplitFeatureId;
+            _gradientSum = treeNode._gradientSum;
+            _hessianSum = treeNode._hessianSum;
+            _leftGradientSum = treeNode._leftGradientSum;
+            _leftHessianSum = treeNode._leftHessianSum;
+            leftSubTree = treeNode.leftSubTree;
+            rightSubTree = treeNode.rightSubTree;
+            _isLeaf = treeNode._isLeaf;
+            _size = treeNode._size;
+            _leftSampleIds = treeNode._leftSampleIds;
+            _weight = treeNode._weight;
+        }
+
+        void markInnerNode() {
+            _isLeaf = false;
+        }
+
+        void makeLeaf() {
+            _isLeaf = true;
+        }
+
+        inline bool isLeaf() const{
+            return _isLeaf;
+        }
+
+        inline double bestGain() const {
+            return _bestGain;
+        }
+
+        void setBestGain(double gain) {
+            _bestGain = gain;
+        }
+
+        void setLeftSampleIds(const std::set<long>& leftSampleIds){
+            _leftSampleIds = leftSampleIds;
+        }
+
+        bool isLeftAssigned(long sampleId) {
+            return std::find(_leftSampleIds.begin(), _leftSampleIds.end(), sampleId) != _leftSampleIds.end();
+        }
+
+        void setLeftGradientSum(double value) {
+            _leftGradientSum = value;
+        }
+
+        void updateWeight(double shrinkage) {
+            _weight = calc_leaf_weight() * shrinkage;
+        }
+
+        void setLeftHessianSum(double value) {
+            _leftHessianSum = value;
+        }
+
+        void setBestSplitFeatureId(long id) {
+            _bestSplitFeatureId = id;
+        }
+
+        void setWeight(double weight) {
+            _weight = weight;
+        }
+
+        long getSize() const {
+            return _size;
+        }
+
+        void zero() {
+            _leftSampleIds.clear();
+        }
+
+        void setBestSplitValue(double bestSplitValue) {
+            _bestSplitNumericValue = bestSplitValue;
+        }
+
+        void setLambda(double lambda) {
             _lambda = lambda;
-            _minSplitGain = minSplitGain;
-            _maxDepth = maxDepth;
-            _minTreeSize = minTreeSize;
         }
+
+        void setGradientSum(double gradientSum) {
+            _gradientSum = gradientSum;
+        }
+
+        void setHessianSum(double hessianSum) {
+            _hessianSum = hessianSum;
+        }
+
+        inline double getRightGradientSum() const {
+            return _gradientSum - _leftGradientSum;
+        }
+
+        inline double getRightHessianSum() const {
+            return std::max(_hessianSum - _leftHessianSum, 0.0);
+        }
+
+        inline double getGradientSum() const {
+            return _gradientSum;
+        }
+
+        inline double getHessianSum() const {
+            return _hessianSum;
+        }
+
+        inline NodeId getLeftSubTreeId() const {
+            if (leftSubTree == nullptr) {
+                return -1;
+            }
+            return leftSubTree->getNodeId();
+        }
+
+        inline NodeId getRightSubTreeId() const {
+            if (rightSubTree == nullptr) {
+                return -1;
+            }
+            return rightSubTree->getNodeId();
+        }
+
+        inline NodeId getNodeId() const {
+            return _nodeId;
+        }
+
+        inline long getLeftSize() const {
+            return _leftSampleIds.size();
+        }
+
+        inline long getRightSize() const {
+            return _size - getLeftSize();
+        }
+
+        void setLeftSubTree(std::shared_ptr<TreeNode> treeNodePtr, double shrinkage) {
+            leftSubTree = treeNodePtr;
+            leftSubTree->setLambda(_lambda);
+            leftSubTree->setGradientSum(_leftGradientSum);
+            assert(_leftHessianSum > 0);
+            leftSubTree->setHessianSum(_leftHessianSum);
+            leftSubTree->_leftGradientSum = 0.0;
+            leftSubTree->_leftHessianSum = 0.0;
+            leftSubTree->makeLeaf();
+            leftSubTree->updateWeight(shrinkage);
+        }
+
+        void setRightSubTree(std::shared_ptr<TreeNode> treeNodePtr, double shrinkage) {
+            rightSubTree = treeNodePtr;
+            rightSubTree->setLambda(_lambda);
+            rightSubTree->setGradientSum(getRightGradientSum());
+            assert( getRightHessianSum() >= 0);
+            rightSubTree->setHessianSum(getRightHessianSum());
+            rightSubTree->_leftGradientSum = 0.0;
+            rightSubTree->_leftHessianSum = 0.0;
+            rightSubTree->makeLeaf();
+            rightSubTree->updateWeight(shrinkage);
+        }
 
         /**
-         * @param gradient
-         * @param hessian
+         * Returns the optimal weight of this leaf node, -G / (H + lambda)
+         * (Refer to Eq5 of Reference [1])
+         *
          * @return
          */
-        inline double calc_leaf_weight(const Vector &gradient,
-                                       const Vector &hessian) const {
-            return - par_simd_accumulate(gradient)
-                   / (par_simd_accumulate(hessian) + _lambda);
+        inline double calc_leaf_weight() const {
+            return - _gradientSum / (_hessianSum + _lambda);
         }
 
-        /**
-         * Recursively (and greedily) split a TreeNode based on
-         *
-         * Exact Greedy Algorithm for Split Finding:
-         * 1) For each tree node, enumerate over all features:
-         * 2) For each feature, sorted the instances by feature numeric value
-         * 3) Use a linear scan to decide the best split along that feature (if categorical perform Mean Target Encoding)
-         * 4) Take the best split solution (that maximises gain reduction) over all features
-         * 5) Recurse on the left and right side of the best split
-         *
-         * (Refer to Algorithm1 of Reference[1])
-         *
-         * @param trainSet
-         * @param previousPreds
-         * @param gradient
-         * @param hessian
-         * @param shrinkage
-         * @param depth
-         */
-        void build(const Dataset &trainSet,
-                   const Vector &previousPreds,
-                   const Vector &gradient,
-                   const Vector &hessian,
-                   double shrinkage,
-                   int depth) {
-
-            // Check if depth is reached
-            if (depth > _maxDepth) {
-                this->isLeaf = true;
-                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
-                return;
-            }
-
-            // Check if # of sample is too small
-            if ( trainSet.nRows() <= _minTreeSize) {
-                this->isLeaf = true;
-                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
-                return;
-            }
-
-            // Initialize a numeric splitter and find best split
-            std::unique_ptr splitter = make_unique(_lambda);
-            SplitInfo bestGain = splitter->findBestSplit(trainSet, gradient, hessian);
-
-            // Check if best gain is less than minimum split gain (threshold)
-            if (bestGain.bestGain() < this->_minSplitGain) {
-                this->isLeaf = true;
-                this->weight = this->calc_leaf_weight(gradient, hessian) * shrinkage;
-                return;
-            }
-
-            this->splitFeatureIndex = bestGain.getBestFeatureId();
-            this->splitNumericValue = bestGain.splitValue();
-
-
-            Dataset leftDataset(trainSet, bestGain, SplitInfo::Side::Left);
-            Vector leftGradient = bestGain.split(gradient, SplitInfo::Side::Left);
-            Vector leftHessian = bestGain.split(hessian, SplitInfo::Side::Left);
-            Vector leftPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Left);
-            this->leftSubTree = std::unique_ptr<TreeNode>(
-                    new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
-            leftSubTree->build(leftDataset, leftPreviousPreds, leftGradient, leftHessian, shrinkage, depth + 1);
-
-            Dataset rightDataset(trainSet, bestGain, SplitInfo::Side::Right);
-            Vector rightGradient = bestGain.split(gradient, SplitInfo::Side::Right);
-            Vector rightHessian = bestGain.split(hessian, SplitInfo::Side::Right);
-            Vector rightPreviousPreds = bestGain.split(previousPreds, SplitInfo::Side::Right);
-            this->rightSubTree = std::unique_ptr<TreeNode>(
-                    new TreeNode(_lambda, _minSplitGain, _minTreeSize, _maxDepth));
-            rightSubTree->build(rightDataset, rightPreviousPreds, rightGradient, rightHessian, shrinkage, depth + 1);
-        }
+        /**
+         * Returns the objective value for a given gradient and Hessian,
+         * using the node's lambda regularizer (gradient boosting parameters)
+         *
+         * @param gradient
+         * @param hessian
+         * @return
+         */
+        constexpr double objective(double gradient, double hessian) const {
+            return (gradient * gradient) / (hessian + _lambda);
+        }
+
+        /**
+         * Returns the gain of a specific binary tree split; the right-side and
+         * pre-split sums are derived from the node totals (_gradientSum, _hessianSum).
+         *
+         * Refer to Eq7 of Reference [1]
+         *
+         * @param G_l Gradient sum on the left split node
+         * @param H_l Hessian sum on the left split node
+         * @return
+         */
+        constexpr double calc_split_gain(double G_l, double H_l) const {
+            return (objective(G_l, H_l) + objective(_gradientSum - G_l, _hessianSum - H_l)
+                    - objective(_gradientSum, _hessianSum)) / 2.0; // TODO: minus \gamma
+        }
 
         /**
          * Return the score for a given sample, i.e. set of features
          *
          * @param sample
          * @return
          */
         double score(const Eigen::RowVectorXd &sample) const {
-            if (this->isLeaf) {
-                return this->weight;
-            } else if (sample[this->splitFeatureIndex] < this->splitNumericValue) {
-                return this->leftSubTree->score(sample);
+            if (_isLeaf) {
+                return _weight;
+            } else if (sample[_bestSplitFeatureId] < _bestSplitNumericValue) {
+                return leftSubTree->score(sample);
             } else {
-                return this->rightSubTree->score(sample);
+                return rightSubTree->score(sample);
             }
         }
+
+        /**
+         * Returns a list of strings containing all subtree information in DOT (a graph description language)
+         *
+         * See: https://en.wikipedia.org/wiki/DOT_(graph_description_language)
+         * @return
+         */
+        std::vector<std::string> toDigraph() const {
+            std::vector<std::string> output;
+            if (isLeaf()) {
+                return output;
+            }
+
+            NodeId id = this->getNodeId();
+            NodeId left = this->getLeftSubTreeId();
+            output.push_back(std::to_string(id) + " [label=\"" + print() + "\"]");
+            output.push_back(std::to_string(id) + " -> " + std::to_string(left));
+
+            NodeId right = this->getRightSubTreeId();
+            output.push_back(std::to_string(id) + " -> " + std::to_string(right));
+
+            auto leftCollection = leftSubTree->toDigraph();
+            auto rightCollection = rightSubTree->toDigraph();
+            std::copy(begin(leftCollection), end(leftCollection), std::back_inserter(output));
+            std::copy(begin(rightCollection), end(rightCollection), std::back_inserter(output));
+
+            return output;
+        }
+
+        std::string print() const {
+            std::stringstream ss;
+            ss << " | Node id: " << getNodeId();
+            ss << " | Best Gain: " << bestGain();
+            ss << " | Weight: " << calc_leaf_weight();
+            ss << " | Left size: " << getLeftSize();
+            ss << " | Size: " << getSize();
+            ss << " | G sum: " << getGradientSum();
+            ss << " | H sum: " << getHessianSum();
+            return ss.str();
+        }
     };
 } // namespace microgbt
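
A worked check of calc_split_gain above (Eq 7 of the xgboost paper, without the gamma complexity term flagged in the TODO), using made-up node statistics:

    #include <iostream>

    int main() {
        double lambda = 1.0;
        double Gl = -2.0, Hl = 3.0, Gr = 4.0, Hr = 5.0;
        double G = Gl + Gr, H = Hl + Hr;   // pre-split totals

        auto objective = [lambda](double g, double h) { return g * g / (h + lambda); };

        double gain = (objective(Gl, Hl) + objective(Gr, Hr) - objective(G, H)) / 2.0;
        std::cout << "gain = " << gain << std::endl;  // (1 + 16/6 - 4/9) / 2 = 1.6111...
        return 0;
    }
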
diff --git a/src/utils.h b/src/utils.h
index d3b1aca..0ece31b 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -6,14 +6,36 @@ namespace microgbt {
 
     using Vector = std::vector<double>;
 
-    static double par_simd_accumulate(const Vector& vector) {
+    static double sum(const Vector& vector) {
         size_t n = vector.size();
         double accumulate = 0.0;
-        #pragma omp simd reduction(+: accumulate)
         for (size_t i = 0; i < n; i ++){
-           accumulate += vector[i];
+            accumulate += vector[i];
         }
         return accumulate;
     }
-} // namespace microgbt
+
+    class Permutation {
+
+        std::vector<size_t> _perm, _inverse;
+
+    public:
+
+        explicit Permutation() = default;
+
+        explicit Permutation(const std::vector<size_t> &permVector):
+            _perm(permVector),
+            _inverse(_perm.size()){
+            for (size_t i = 0 ; i < _perm.size(); i++){
+                _inverse[_perm[i]] = i;
+            }
+        }
+
+        constexpr size_t operator()(size_t index) const{
+            return _perm[index];
+        }
+
+        constexpr size_t inverse(size_t index) const{
+            return _inverse[index];
+        }
+    };
+} // namespace microgbt
\ No newline at end of file
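
The inverse array above satisfies _inverse[_perm[i]] = i: operator() answers "which sample is i-th in sorted order", while inverse() answers "what rank does sample s have". A tiny property check (assumes src/ is on the include path):

    #include <cassert>
    #include <cstddef>
    #include <vector>
    #include "utils.h"

    void permutationSketch() {
        std::vector<size_t> order{1, 2, 0};
        microgbt::Permutation perm(order);
        for (size_t i = 0; i < order.size(); i++) {
            assert(perm.inverse(perm(i)) == i);  // inverse undoes the permutation
        }
    }
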
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index dc452bb..5743df8 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,12 +1,12 @@
 add_executable(
         unit_tests
         main.cpp
-        test_split_info.cpp
         test_metric_rmse.cpp
         test_dataset.cpp
         test_treenode.cpp
         test_tree.cpp
         test_gbt.cpp
+        test_utils.cpp
         ../src/metrics/metric.h ../src/trees/tree.h ../src/GBT.h
         ../src/dataset.h ../src/metrics/rmse.h)
 
 target_link_libraries(
diff --git a/test/test_dataset.cpp b/test/test_dataset.cpp
index c4858de..def9257 100644
--- a/test/test_dataset.cpp
+++ b/test/test_dataset.cpp
@@ -12,22 +12,3 @@ TEST(Dataset, DefaultConstructor)
     ASSERT_EQ(dataset.nRows(), m);
     ASSERT_EQ(dataset.numFeatures(), n);
 }
-
-TEST(Dataset, Constructor)
-{
-
-    size_t m = 2, n = 3;
-    Eigen::MatrixXd A = Eigen::MatrixXd::Zero(m, n);
-    microgbt::Vector y = {1.0, 2.0};
-    microgbt::Dataset dataset(A, y);
-
-    std::vector<size_t> left = {0};
-    std::vector<size_t> right = {1, 2};
-    microgbt::SplitInfo splitInfo(dataset.sortedColumnIndices(0), 0.0, 1.0, 1);
-
-    microgbt::Dataset leftDS(dataset, splitInfo, microgbt::SplitInfo::Left);
-
-    ASSERT_EQ(leftDS.nRows(), left.size());
-    ASSERT_EQ(leftDS.numFeatures(), n);
-
-}
diff --git a/test/test_split_info.cpp b/test/test_split_info.cpp
deleted file mode 100644
index dbc1538..0000000
--- a/test/test_split_info.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "gtest/gtest.h"
-#include "trees/split_info.h"
-
-TEST(microgbt, SplitInfo)
-{
-    microgbt::SplitInfo gain(0.0, 1.0);
-    ASSERT_NEAR(gain.bestGain(), 0.0, 1.0e-11);
-    ASSERT_NEAR(gain.splitValue(), 1.0, 1.0e-11);
-}
diff --git a/test/test_treenode.cpp b/test/test_treenode.cpp
index d202804..9bc86f1 100644
--- a/test/test_treenode.cpp
+++ b/test/test_treenode.cpp
@@ -3,6 +3,6 @@
 
 TEST(microgbt, TreeNode)
 {
-    microgbt::TreeNode treeNode(1.0, 2.0, 2, 10);
+    microgbt::TreeNode treeNode(1, 2.0, 0);
     ASSERT_TRUE(true);
 }
diff --git a/test/test_utils.cpp b/test/test_utils.cpp
new file mode 100644
index 0000000..165acbb
--- /dev/null
+++ b/test/test_utils.cpp
@@ -0,0 +1,17 @@
+#include
+#include "gtest/gtest.h"
+
+TEST(microgbt, Permutation)
+{
+    std::vector<size_t> permVec{1, 2, 0};
+    microgbt::Permutation perm(permVec);
+
+
+    ASSERT_EQ(perm(0), 1);
+    ASSERT_EQ(perm(1), 2);
+    ASSERT_EQ(perm(2), 0);
+
+    ASSERT_EQ(perm.inverse(0), 2);
+    ASSERT_EQ(perm.inverse(1), 0);
+    ASSERT_EQ(perm.inverse(2), 1);
+}
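
Putting the pieces together, a hedged end-to-end sketch of the API this patch leaves behind — toy data, illustrative parameter values, and paths assuming src/ is on the include path with Eigen available:

    #include <vector>
    #include <Eigen/Dense>
    #include "dataset.h"
    #include "trees/tree.h"

    int main() {
        Eigen::MatrixXd X(4, 1);
        X << 1.0, 2.0, 3.0, 4.0;
        std::vector<double> y{0.0, 0.0, 1.0, 1.0};
        microgbt::Dataset dataset(X, y);

        // Gradient/Hessian vectors as a Metric would produce them at one boosting round
        std::vector<double> gradient{0.5, 0.5, -0.5, -0.5};
        std::vector<double> hessian{0.25, 0.25, 0.25, 0.25};

        microgbt::Tree tree(1.0 /* lambda */, 0.0 /* minSplitGain */,
                            1 /* minTreeSize */, 3 /* maxDepth */);
        tree.build(dataset, gradient, hessian, 1.0 /* shrinkage */);

        double s = tree.score(X.row(0));  // score one sample through the fitted tree
        (void)s;
        return 0;
    }
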