Skip to content

Commit

Permalink
[sliq] presorted vectors
Browse files Browse the repository at this point in the history
  • Loading branch information
Anastasios Zouzias committed Oct 20, 2019
1 parent d5d5a91 commit 604b31b
Show file tree
Hide file tree
Showing 21 changed files with 671 additions and 333 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,5 @@ include_directories( ${EIGEN3_INCLUDE_DIRS} )
add_subdirectory(pybind11)

pybind11_add_module(microgbtpy src/metrics/metric.h src/trees/tree.h src/GBT.h src/dataset.h src/metrics/logloss.h
src/trees/treenode.h src/trees/split_info.h
src/utils.h
src/python_api.cpp)
src/trees/treenode.h src/python_api.cpp)
#########################
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Compose file format version.
version: '3'

services:
# Single service for the microgbt project.
microgbt:
# Name and tag given to the image.
image: microgbt:0.0.1
build:
# Dockerfile path, resolved relative to the build context below.
dockerfile: docker/Dockerfile
# Build context: the directory that holds this compose file.
context: ./
volumes:
# Bind-mount the compose file's directory at /home/ubuntu inside the container.
- .:/home/ubuntu
3 changes: 3 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Base image: zouzias/boost at tag 1.71.0.
FROM zouzias/boost:1.71.0

# Refresh the apt package index, then install vim, cmake, pip for Python 3 and the
# Eigen3 development package (-y answers prompts automatically, -q reduces output).
RUN apt-get -yq update && apt-get -yq install vim cmake python3-pip libeigen3-dev
2 changes: 1 addition & 1 deletion examples/test-boston.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import microgbtpy
from math import sqrt
import logging.config
Expand Down
2 changes: 1 addition & 1 deletion examples/test-flight.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import microgbtpy
import logging.config
import numpy as np
Expand Down
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
add_library(microgbt "" metrics/metric.h trees/tree.h trees/split_info.h
add_library(microgbt "" metrics/metric.h trees/tree.h
GBT.h dataset.h metrics/logloss.h trees/treenode.h metrics/rmse.h
utils.h trees/numerical_splliter.h trees/splitter.h)
utils.h trees/class_list.h trees/tree_builder_state.h)


target_sources(
Expand Down
24 changes: 14 additions & 10 deletions src/GBT.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <vector>
#include<iostream>
#include <memory>
#include <numeric>
#include "dataset.h"
#include "trees/tree.h"
#include "metrics/metric.h"
Expand All @@ -28,17 +29,16 @@ namespace microgbt {
/**
* Return a single decision tree given training data, gradient, Hessian and shrinkage rate
*
* @param trainSet
* @param dataset
* @param gradient
* @param hessian
* @param shrinkageRate
*/
Tree buildTree(const Dataset &trainSet, const Vector& previousPreds, const Vector &gradient,
const Vector &hessian, double shrinkageRate) const {

Tree buildTree(const Dataset &dataset, const Vector &gradient, const Vector &hessian,
double shrinkageRate) const {

Tree tree = Tree(_lambda, _minSplitGain, _minTreeSize, _maxDepth);
tree.build(trainSet, previousPreds, gradient, hessian, shrinkageRate);
tree.build(dataset, gradient, hessian, shrinkageRate);
return tree;
}

Expand Down Expand Up @@ -128,16 +128,21 @@ namespace microgbt {
auto startTimestamp = std::chrono::high_resolution_clock::now();

// Current predictions
Vector scores = predictDataset(trainSet);
Vector predictions = predictDataset(trainSet);

// Compute gradient and Hessian with respect to prior predictions
std::cout << "[Computing gradients/Hessians vectors]" << std::endl;
Vector gradient = _metric->gradients(scores, trainSet.y());
Vector hessian = _metric->hessian(scores);
Vector gradient = _metric->gradients(predictions, trainSet.y());
Vector hessian = _metric->hessian(predictions);

std::cout<< "Predictions" << std::endl;
for (size_t i = 0; i < gradient.size(); i++){
std::cout << predictions[i] << " - " << trainSet.y()[i] << " / ";
}

// Grow a new tree learner
std::cout << "[Building next tree...]" << std::endl;
Tree tree = buildTree(trainSet, scores, gradient, hessian, learningRate);
Tree tree = buildTree(trainSet, gradient, hessian, learningRate);
std::cout << "[Tree is built successfully]" << std::endl;

// Update the learning rate
Expand All @@ -158,7 +163,6 @@ namespace microgbt {
std::cout << "[Duration: " << duration << " millis] | [Train Loss]: " << trainLoss
<< " | [Valid Loss]: " << bestValidationLoss <<std::endl;


// Update best iteration / best validation error
if (currentValidationLoss < bestValidationLoss) {
bestValidationLoss = currentValidationLoss;
Expand Down
88 changes: 15 additions & 73 deletions src/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#include <Eigen/Dense>
#include <numeric>
#include <memory>
#include "trees/split_info.h"
#include "utils.h"

namespace microgbt {

using Vector = std::vector<double>;
using VectorT = std::vector<size_t>;
using MatrixType = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
using SortedMatrixType = Eigen::Matrix<int, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
using SortedMatrixType = std::vector<Permutation>;

/**
* Represents a machine learning "design matrix" and target vector, (X, y)
Expand All @@ -33,25 +33,23 @@ namespace microgbt {

SortedMatrixType _sortedMatrixIdx;

VectorT _rowIndices;

/**
* Return sorted indices from an Eigen vector
* @param v
* @return
*/
Eigen::VectorXi sortIndices(long colIndex) const{
VectorT sortIndices(long colIndex) const{

// initialize original index locations
Eigen::VectorXd v = col(colIndex);
unsigned int n = v.size();

Eigen::VectorXi idx(n);
VectorT idx(n);
// idx contains now 0,1,...,v.size() - 1
std::iota(idx.data(), idx.data() + idx.size(), 0);
std::iota(idx.begin(), idx.end(), 0);

// sort indexes based on comparing values in v
std::sort(idx.data(), idx.data() + idx.size(),
std::sort(idx.begin(), idx.end(),
[&v](int i1, int i2) {return v[i1] < v[i2];});

return idx;
Expand All @@ -62,104 +60,48 @@ namespace microgbt {
Dataset() = default;

Dataset(const MatrixType& X, const Vector &y):
_sortedMatrixIdx(X.rows(), X.cols()),
_rowIndices(y.size()){
_sortedMatrixIdx(X.cols()) {
_X = std::make_shared<MatrixType>(X);
_y = std::make_shared<Vector>(y);
// By default, all rows are included in the dataset
std::iota(_rowIndices.begin(), _rowIndices.end(), 0);

for ( long j = 0; j < X.cols(); j++) {
_sortedMatrixIdx.col(j) = sortIndices(j);
_sortedMatrixIdx[j] = Permutation(sortIndices(j));
}
}


Dataset(Dataset const &dataset) = default;

/**
* Construct a Dataset, given a binary split gain and left/right side parameter
* @param dataset
* @param bestGain
* @param side
*/
Dataset(Dataset const &dataset, const SplitInfo &bestGain, SplitInfo::Side side):
_X(dataset.X()),_y(dataset.yptr()) {

_X = dataset.X();
_y = dataset.yptr();

VectorT localIds;
if (side == SplitInfo::Side::Left) {
localIds = bestGain.getLeftLocalIds();
} else {
localIds = bestGain.getRightLocalIds();
}

_rowIndices = VectorT(localIds.size());
VectorT otherRowIndices = dataset.rowIter();
for (size_t i = 0 ; i < localIds.size(); i++) {
_rowIndices[i] = otherRowIndices[localIds[i]];
}

int rows = _rowIndices.size(), cols = dataset.numFeatures();

_sortedMatrixIdx = SortedMatrixType(rows, cols);

#pragma omp parallel for schedule(static)
for ( long j = 0; j < cols; j++) {
_sortedMatrixIdx.col(j) = sortIndices(j);
}
}

inline size_t nRows() const {
return this->_rowIndices.size();
}

inline VectorT rowIter() const {
return _rowIndices;
return _X->rows();
}

inline size_t numFeatures() const {
inline long numFeatures() const {
return this->_X->cols();
}

inline std::shared_ptr<MatrixType> X() const {
return _X;
}

inline std::shared_ptr<Vector> yptr() const {
return _y;
}

inline Vector y() const {
Vector proj(_rowIndices.size());
for (size_t i = 0; i < proj.size(); i++) {
proj[i] = (*_y)[_rowIndices[i]];
}
return proj;
return *_y;
}

inline Eigen::RowVectorXd row(long rowIndex) const {
return _X->row(_rowIndices[rowIndex]);
return _X->row(rowIndex);
}

inline Eigen::RowVectorXd col(long colIndex) const {
Eigen::RowVectorXd column(_rowIndices.size());
auto fullColumn = _X->col(colIndex);
for (size_t i = 0; i < _rowIndices.size(); i++) {
column[i] = fullColumn[_rowIndices[i]];
}
return column;
return _X->col(colIndex);
}

/**
* Returns a sorted vector of indices corresponding to a column
* @param colIndex Index of column
* @return
*/
inline Eigen::RowVectorXi sortedColumnIndices(long colIndex) const {
return _sortedMatrixIdx.col(colIndex);
inline Permutation sortedColumnIndices(long colIndex) const {
return _sortedMatrixIdx[colIndex];
}
};
}
9 changes: 4 additions & 5 deletions src/metrics/logloss.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@ namespace microgbt {
/**
* Log loss metric
*
* Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
* Negative Logistic loss: y_i ln(1 + exp(-pred_i)) + (1-y_i) ln( 1 + exp(pred_i))
*/
class LogLoss :
public Metric {

private:
// Numerical tolerance on boundary of log(x) and log(1-x) function in range [0,1]
double _eps;

public:

LogLoss() {
Expand All @@ -34,10 +33,10 @@ namespace microgbt {
* @return
*/
inline double clip(double value) const {
if ( value > 1 - _eps )
if (value > 1 - _eps)
return 1 - _eps;

if ( value < _eps)
if (value < _eps)
return _eps;

return value;
Expand Down Expand Up @@ -85,4 +84,4 @@ namespace microgbt {
}

};
}
}
2 changes: 1 addition & 1 deletion src/metrics/rmse.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,4 @@ namespace microgbt {
}
};

} // namespace microgbt
} // namespace microgbt
58 changes: 58 additions & 0 deletions src/trees/class_list.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#pragma once

#include <map>
#include <set>
#include <vector>

namespace microgbt {

    // Same alias that dataset.h declares. Repeating an identical alias at namespace
    // scope is legal C++ and lets this header compile regardless of include order.
    using Vector = std::vector<double>;

    using NodeId = long;

    /**
     * ClassList
     *
     * Per-sample bookkeeping used while a tree is being grown:
     *  - a copy of the gradient and Hessian vectors,
     *  - the id of the node each sample is currently assigned to (all samples start at node 0),
     *  - for every node id, the set of sample indices recorded as left-subtree candidates.
     *
     * Sample indices are not range-checked by any method of this class.
     */
    class ClassList {

        Vector _gradients;
        Vector _hessians;
        std::vector<NodeId> _nodeIds;

        // Node index to set of left subtree candidate samples
        std::map<NodeId, std::set<long>> _leftCandidateSamples;

        /**
         * Number of left candidates recorded for a node.
         * Uses find() so that querying an unknown node never inserts an empty entry.
         *
         * @param nodeId Node to look up
         * @return Count of recorded left candidates, 0 if the node has none
         */
        long leftCount(NodeId nodeId) const {
            const auto entry = _leftCandidateSamples.find(nodeId);
            if (entry == _leftCandidateSamples.end()) {
                return 0;
            }
            return static_cast<long>(entry->second.size());
        }

    public:

        /**
         * Copies the gradient/Hessian vectors and assigns every sample to node 0.
         *
         * @param gradients Per-sample gradients; its length defines the number of samples
         * @param hessians Per-sample Hessians
         */
        explicit ClassList(const Vector &gradients, const Vector &hessians):
                _gradients(gradients), _hessians(hessians),
                _nodeIds(gradients.size(), NodeId{0}) {
        }

        /**
         * Drops all recorded left-subtree candidates. Node assignments are kept.
         */
        void clean() {
            _leftCandidateSamples.clear();
        }

        /**
         * @param index Sample index (not range-checked)
         * @return Id of the node the sample is currently assigned to
         */
        NodeId nodeAt(long index) const {
            return _nodeIds[index];
        }

        /**
         * Records a sample as a left-subtree candidate of a node.
         * Recording the same sample twice for the same node has no further effect.
         *
         * @param nodeId Node whose left candidate set is extended
         * @param index Sample index to record
         */
        void appendSampleToLeftSubTree(NodeId nodeId, long index) {
            _leftCandidateSamples[nodeId].insert(index);
        }

        /**
         * Reassigns a sample to another node.
         *
         * @param sampleIndex Sample index (not range-checked)
         * @param newNodeId Node id the sample is assigned to from now on
         */
        void updateNodeId(long sampleIndex, NodeId newNodeId) {
            _nodeIds[sampleIndex] = newNodeId;
        }

        /**
         * @param nodeId Node to look up
         * @return Copy of the node's left candidate set; empty if the node has none
         */
        std::set<long> getLeft(NodeId nodeId) const {
            const auto entry = _leftCandidateSamples.find(nodeId);
            if (entry == _leftCandidateSamples.end()) {
                return {};
            }
            return entry->second;
        }

        /**
         * @param nodeId Node to look up
         * @return Number of left candidates recorded for the node
         */
        long getLeftSize(NodeId nodeId) const {
            return leftCount(nodeId);
        }

        /**
         * @param nodeId Node to look up
         * @return Total number of samples minus the node's left candidate count
         */
        long getRightSize(NodeId nodeId) const {
            // NOTE(review): this subtracts from the total sample count, not from the number
            // of samples currently assigned to nodeId -- confirm that is intended for
            // nodes other than the root.
            return static_cast<long>(_gradients.size()) - leftCount(nodeId);
        }

    };
} // namespace microgbt
Loading

0 comments on commit 604b31b

Please sign in to comment.