From 43631b873d973092818d7e5e540265e3fe08e986 Mon Sep 17 00:00:00 2001 From: nprokoptsev Date: Fri, 14 Feb 2025 15:21:48 -0800 Subject: [PATCH 1/9] add position_column support --- R-package/R/aliases.R | 1 + R-package/tests/testthat/test_lgb.Booster.R | 1 + docs/Parameters.rst | 12 ++ include/LightGBM/c_api.h | 6 + include/LightGBM/config.h | 9 ++ include/LightGBM/dataset.h | 50 ++++--- include/LightGBM/dataset_loader.h | 2 + python-package/lightgbm/basic.py | 2 + src/c_api.cpp | 14 +- src/io/config_auto.cpp | 8 ++ src/io/dataset.cpp | 2 +- src/io/dataset_loader.cpp | 40 +++++- src/io/metadata.cpp | 139 ++++++++++++++------ src/objective/rank_objective.hpp | 8 +- tests/cpp_tests/test_stream.cpp | 42 ++++-- tests/cpp_tests/testutils.cpp | 69 ++++++++-- tests/cpp_tests/testutils.h | 19 ++- tests/python_package_test/test_engine.py | 3 + 18 files changed, 325 insertions(+), 102 deletions(-) diff --git a/R-package/R/aliases.R b/R-package/R/aliases.R index d381b70c5b8e..af2bb907884b 100644 --- a/R-package/R/aliases.R +++ b/R-package/R/aliases.R @@ -29,6 +29,7 @@ , "two_round" , "use_missing" , "weight_column" + , "position_column" , "zero_as_missing" )]) } diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 9197fd7226af..d2e8aa6e1323 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -1076,6 +1076,7 @@ test_that("all parameters are stored correctly with save_model_to_string()", { , "[label_column: ]" , "[weight_column: ]" , "[group_column: ]" + , "[position_column: ]" , "[ignore_column: ]" , "[categorical_feature: ]" , "[forcedbins_filename: ]" diff --git a/docs/Parameters.rst b/docs/Parameters.rst index b44d90ecec10..a355204afe05 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -925,6 +925,18 @@ Dataset Parameters - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` + - ``position_column`` :raw-html:`🔗︎`, default = ``""``, type = int or string, aliases: ``position``, ``position_id`` + + - used to specify the position id column + + - use number for index, e.g. ``position=0`` means column\_0 is the position + + - add a prefix ``name:`` for column name, e.g. ``position=name:position_id`` + + - **Note**: works only in case of loading data directly from text file + + - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. 
when label is column\_0 and position\_id is column\_1, the correct parameter is ``position=0`` + - ``ignore_column`` :raw-html:`🔗︎`, default = ``""``, type = multi-int or string, aliases: ``ignore_feature``, ``blacklist`` - used to specify some ignoring columns in training diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 8f8137df1f48..5d2455cf5d94 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -169,6 +169,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateByReference(const DatasetHandle referenc * \param has_weights Whether the dataset has Metadata weights * \param has_init_scores Whether the dataset has Metadata initial scores * \param has_queries Whether the dataset has Metadata queries/groups + * \param has_positions Whether the dataset has Metadata positions/groups * \param nclasses Number of initial score classes * \param nthreads Number of external threads that will use the PushRows APIs * \param omp_max_threads Maximum number of OpenMP threads (-1 for default) @@ -178,6 +179,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, + int32_t has_positions, int32_t nclasses, int32_t nthreads, int32_t omp_max_threads); @@ -233,6 +235,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset, * \param weight Optional pointer to array with nrow weights * \param init_score Optional pointer to array with nrow*nclasses initial scores, in column format * \param query Optional pointer to array with nrow query values + * \param position Optional pointer to array with nrow position values * \param tid The id of the calling thread, from 0...N-1 threads * \return 0 when succeed, -1 when failure happens */ @@ -246,6 +249,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, const float* weight, const double* init_score, const int32_t* query, + const int32_t* position, int32_t tid); /*! @@ -288,6 +292,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, * \param weight Optional pointer to array with nindptr-1 weights * \param init_score Optional pointer to array with (nindptr-1)*nclasses initial scores, in column format * \param query Optional pointer to array with nindptr-1 query values + * \param position Optional pointer to array with nindptr-1 position values * \param tid The id of the calling thread, from 0...N-1 threads * \return 0 when succeed, -1 when failure happens */ @@ -304,6 +309,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle datase const float* weight, const double* init_score, const int32_t* query, + const int32_t* position, int32_t tid); /*! diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d5b56f0fd1fb..d7cd41ad5142 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -766,6 +766,15 @@ struct Config { // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` std::string group_column = ""; + // type = int or string + // alias = position, position_id, position_column + // desc = used to specify the position/position id column + // desc = use number for index, e.g. ``position=0`` means column\_0 is the position id + // desc = add a prefix ``name:`` for column name, e.g. 
``position=name:position_id`` + // desc = **Note**: works only in case of loading data directly from text file + // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and position\_id is column\_1, the correct parameter is ``position=0`` + std::string position_column = ""; + // type = multi-int or string // alias = ignore_feature, blacklist // desc = used to specify some ignoring columns in training diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index c2a4b62296f2..8ef36a573a48 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -76,8 +76,9 @@ class Metadata { * \param num_data Number of training data * \param weight_idx Index of weight column, < 0 means doesn't exists * \param query_idx Index of query id column, < 0 means doesn't exists + * \param position_idx Index of position id column, < 0 means doesn't exists */ - void Init(data_size_t num_data, int weight_idx, int query_idx); + void Init(data_size_t num_data, int weight_idx, int query_idx, int position_idx); /*! * \brief Allocate space for label, weight (if exists), initial score (if exists) and query (if exists) @@ -92,9 +93,10 @@ class Metadata { * \param has_weights Whether the metadata has weights * \param has_init_scores Whether the metadata has initial scores * \param has_queries Whether the metadata has queries + * \param has_positions Whether the metadata has positions * \param nclasses Number of classes for initial scores */ - void Init(data_size_t num_data, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, int32_t nclasses); + void Init(data_size_t num_data, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, int32_t has_positions, int32_t nclasses); /*! * \brief Partition label by used indices @@ -120,6 +122,7 @@ class Metadata { void SetQuery(const ArrowChunkedArray& array); void SetPosition(const data_size_t* position, data_size_t len); + void SetPosition(const ArrowChunkedArray& array); /*! * \brief Set initial scores @@ -186,6 +189,15 @@ class Metadata { queries_[idx] = static_cast(value); } + /*! + * \brief Set Position Id for one record + * \param idx Index of this record + * \param value Position Id value of this record + */ + inline void SetPositionAt(data_size_t idx, data_size_t value) { + positions_[idx] = static_cast(value); + } + /*! \brief Load initial scores from file */ void LoadInitialScore(const std::string& data_filename); @@ -197,13 +209,15 @@ class Metadata { * \param weights Pointer to weight data, or null * \param init_scores Pointer to init-score data, or null * \param queries Pointer to query data, or null + * \param positions Pointer to position data, or null */ void InsertAt(data_size_t start_index, data_size_t count, const float* labels, const float* weights, const double* init_scores, - const int32_t* queries); + const int32_t* queries, + const int32_t* positions); /*! * \brief Perform any extra operations after all data has been loaded @@ -233,24 +247,13 @@ class Metadata { } } - /*! - * \brief Get position IDs, if does not exist then return nullptr - * \return Pointer of position IDs - */ - inline const std::string* position_ids() const { - if (!position_ids_.empty()) { - return position_ids_.data(); - } else { - return nullptr; - } - } - /*! 
* \brief Get Number of different position IDs * \return number of different position IDs */ inline size_t num_position_ids() const { - return position_ids_.size(); + size_t max = *std::max_element(positions_.begin(), positions_.end()); + return max + 1; } /*! @@ -354,6 +357,11 @@ class Metadata { void SetInitScoresFromIterator(It first, It last); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); + /*! \brief Set positions from pointers to the first element and the end of an iterator. */ + template + void SetPositionsFromIterator(It first, It last); + /*! \brief Insert positions at the given index */ + void InsertPositions(const data_size_t* positions, data_size_t start_index, data_size_t len); /*! \brief Set queries from pointers to the first element and the end of an iterator. */ template void SetQueriesFromIterator(It first, It last); @@ -371,8 +379,6 @@ class Metadata { std::vector weights_; /*! \brief Positions data */ std::vector positions_; - /*! \brief Position identifiers */ - std::vector position_ids_; /*! \brief Query boundaries */ std::vector query_boundaries_; /*! \brief Query weights */ @@ -519,6 +525,7 @@ class Dataset { int32_t has_weights, int32_t has_init_scores, int32_t has_queries, + int32_t has_positions, int32_t nclasses, int32_t nthreads, int32_t omp_max_threads) { @@ -529,7 +536,7 @@ class Dataset { omp_max_threads_ = OMP_NUM_THREADS(); } - metadata_.Init(num_data, has_weights, has_init_scores, has_queries, nclasses); + metadata_.Init(num_data, has_weights, has_init_scores, has_queries, has_positions, nclasses); for (int i = 0; i < num_groups_; ++i) { feature_groups_[i]->InitStreaming(nthreads, omp_max_threads_); } @@ -623,8 +630,9 @@ class Dataset { const label_t* labels, const label_t* weights, const double* init_scores, - const data_size_t* queries) { - metadata_.InsertAt(start_index, count, labels, weights, init_scores, queries); + const data_size_t* queries, + const data_size_t* positions) { + metadata_.InsertAt(start_index, count, labels, weights, init_scores, queries, positions); } inline int RealFeatureIndex(int fidx) const { diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 73b8e7bfd071..817a6a4d7ffe 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -95,6 +95,8 @@ class DatasetLoader { int weight_idx_; /*! \brief index of group column */ int group_idx_; + /*! \brief index of position column */ + int position_idx_; /*! \brief Mapper from real feature index to used index*/ std::unordered_set ignore_features_; /*! 
\brief store feature names */ diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index ecf8749e7441..898b1ff874a7 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2042,7 +2042,9 @@ def get_params(self) -> Dict[str, Any]: "two_round", "use_missing", "weight_column", + "position_column", "zero_as_missing", + "position_column" ) return {k: v for k, v in self.params.items() if k in dataset_params} else: diff --git a/src/c_api.cpp b/src/c_api.cpp index b5b1d25e9e54..08993feea93e 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -323,6 +323,11 @@ class Booster { Log::Fatal( "Cannot change group_column after constructed Dataset handle."); } + if (new_param.count("position_column") && + new_config.position_column != old_config.position_column) { + Log::Fatal( + "Cannot change position_column after constructed Dataset handle."); + } if (new_param.count("ignore_column") && new_config.ignore_column != old_config.ignore_column) { Log::Fatal( @@ -1114,13 +1119,14 @@ int LGBM_DatasetInitStreaming(DatasetHandle dataset, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, + int32_t has_positions, int32_t nclasses, int32_t nthreads, int32_t omp_max_threads) { API_BEGIN(); auto p_dataset = reinterpret_cast(dataset); auto num_data = p_dataset->num_data(); - p_dataset->InitStreaming(num_data, has_weights, has_init_scores, has_queries, nclasses, nthreads, omp_max_threads); + p_dataset->InitStreaming(num_data, has_weights, has_init_scores, has_queries, has_positions, nclasses, nthreads, omp_max_threads); p_dataset->set_wait_for_manual_finish(true); API_END(); } @@ -1163,6 +1169,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, const float* weights, const double* init_scores, const int32_t* queries, + const int32_t* positions, int32_t tid) { API_BEGIN(); #ifdef LABEL_T_USE_DOUBLE @@ -1191,7 +1198,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, } OMP_THROW_EX(); - p_dataset->InsertMetadataAt(start_row, nrow, labels, weights, init_scores, queries); + p_dataset->InsertMetadataAt(start_row, nrow, labels, weights, init_scores, queries, positions); if (!p_dataset->wait_for_manual_finish() && (start_row + nrow == p_dataset->num_data())) { p_dataset->FinishLoad(); @@ -1245,6 +1252,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset, const float* weights, const double* init_scores, const int32_t* queries, + const int32_t* positions, int32_t tid) { API_BEGIN(); #ifdef LABEL_T_USE_DOUBLE @@ -1274,7 +1282,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset, } OMP_THROW_EX(); - p_dataset->InsertMetadataAt(static_cast(start_row), nrow, labels, weights, init_scores, queries); + p_dataset->InsertMetadataAt(static_cast(start_row), nrow, labels, weights, init_scores, queries, positions); if (!p_dataset->wait_for_manual_finish() && (start_row + nrow == static_cast(p_dataset->num_data()))) { p_dataset->FinishLoad(); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index e13cda831ea3..5631266c81bb 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -133,6 +133,8 @@ const std::unordered_map& Config::alias_table() { {"query_column", "group_column"}, {"query", "group_column"}, {"query_id", "group_column"}, + {"position", "position_column"}, + {"position_id", "position_column"}, {"ignore_feature", "ignore_column"}, {"blacklist", "ignore_column"}, {"cat_feature", "categorical_feature"}, @@ -274,6 +276,7 @@ const std::unordered_set& Config::parameter_set() { 
"label_column", "weight_column", "group_column", + "position_column", "ignore_column", "categorical_feature", "forcedbins_filename", @@ -552,6 +555,8 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"label_column", {"label"}}, {"weight_column", {"weight"}}, {"group_column", {"group", "group_id", "query_column", "query", "query_id"}}, + {"position_column", {"position", "position_id"}}, {"ignore_column", {"ignore_feature", "blacklist"}}, {"categorical_feature", {"cat_feature", "categorical_column", "cat_column", "categorical_features"}}, {"forcedbins_filename", {}}, @@ -1028,6 +1035,7 @@ const std::unordered_map& Config::ParameterTypes() { {"label_column", "string"}, {"weight_column", "string"}, {"group_column", "string"}, + {"position_column", "string"}, {"ignore_column", "vector"}, {"categorical_feature", "vector"}, {"forcedbins_filename", "string"}, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 01687d95c747..9e6e248cb361 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -39,7 +39,7 @@ Dataset::Dataset(data_size_t num_data) { CHECK_GT(num_data, 0); data_filename_ = "noname"; num_data_ = num_data; - metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC); + metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC, NO_SPECIFIC); is_finish_load_ = false; wait_for_manual_finish_ = false; group_bin_boundaries_.push_back(0); diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 9c8a0417b118..eb9299e94f95 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -22,6 +22,7 @@ DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& pre label_idx_ = 0; weight_idx_ = NO_SPECIFIC; group_idx_ = NO_SPECIFIC; + position_idx_ = NO_SPECIFIC; SetHeader(filename); store_raw_ = false; if (io_config.linear_tree) { @@ -165,6 +166,26 @@ void DatasetLoader::SetHeader(const char* filename) { ignore_features_.emplace(group_idx_); } } + // load position idx + if (config_.position_column.size() > 0) { + if (Common::StartsWith(config_.position_column, name_prefix)) { + std::string name = config_.position_column.substr(name_prefix.size()); + if (name2idx.count(name) > 0) { + position_idx_ = name2idx[name]; + Log::Info("Using column %s as position", name.c_str()); + } else { + Log::Fatal("Could not find position column %s in data file", name.c_str()); + } + } else { + if (!Common::AtoiAndCheck(config_.position_column.c_str(), &position_idx_)) { + Log::Fatal("position_column is not a number,\n" + "if you want to use a column name,\n" + "please add the prefix \"name:\" to the column name"); + } + Log::Info("Using column number %d as position", position_idx_); + } + ignore_features_.emplace(position_idx_); + } if (config_.categorical_feature.size() > 0) { if (Common::StartsWith(config_.categorical_feature, name_prefix)) { std::string names = config_.categorical_feature.substr(name_prefix.size()); @@ -241,7 +262,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->ResizeRaw(dataset->num_data_); } // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, position_idx_); // extract features ExtractFeaturesFromMemory(&text_data, parser.get(), dataset.get()); text_data.clear(); @@ -262,7 +283,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->ResizeRaw(dataset->num_data_); } // initialize label - dataset->metadata_.Init(dataset->num_data_, 
weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, position_idx_); Log::Info("Making second pass..."); // extract features ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get()); @@ -318,7 +339,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices); dataset->num_data_ = static_cast(text_data.size()); // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, position_idx_); dataset->CreateValid(train_data); if (dataset->has_raw()) { dataset->ResizeRaw(dataset->num_data_); @@ -332,7 +353,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, dataset->num_data_ = static_cast(text_reader.CountLine()); num_global_data = dataset->num_data_; // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, position_idx_); dataset->CreateValid(train_data); if (dataset->has_raw()) { dataset->ResizeRaw(dataset->num_data_); @@ -407,7 +428,8 @@ Dataset* DatasetLoader::LoadFromSerializedReference(const char* binary_data, siz int has_weights = config_.weight_column.size() > 0; int has_init_scores = num_classes > 0; int has_queries = config_.group_column.size() > 0; - dataset->metadata_.Init(num_data, has_weights, has_init_scores, has_queries, num_classes); + int has_positions = config_.position_column.size() > 0; + dataset->metadata_.Init(num_data, has_weights, has_init_scores, has_queries, has_positions, num_classes); Log::Info("Loaded reference dataset: %d features, %d num_data", dataset->num_features_, num_data); @@ -945,6 +967,9 @@ void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binar if (config_.group_column != "") { Log::Warning("Parameter group_column works only in case of loading data directly from text file. It will be ignored when loading from binary file."); } + if (config_.position_column != "") { + Log::Warning("Parameter position_column works only in case of loading data directly from text file. It will be ignored when loading from binary file."); + } if (config_.ignore_column != "") { Log::Warning("Parameter ignore_column works only in case of loading data directly from text file. 
It will be ignored when loading from binary file."); } @@ -1119,6 +1144,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, } CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); CHECK(group_idx_ < 0 || group_idx_ < dataset->num_total_features_); + CHECK(position_idx_ < 0 || position_idx_ < dataset->num_total_features_); // fill feature_names_ if not header if (feature_names_.empty()) { @@ -1300,6 +1326,8 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat dataset->metadata_.SetWeightAt(i, static_cast(inner_data.second)); } else if (inner_data.first == group_idx_) { dataset->metadata_.SetQueryAt(i, static_cast(inner_data.second)); + } else if (inner_data.first == position_idx_) { + dataset->metadata_.SetPositionAt(i, static_cast(inner_data.second)); } } } @@ -1430,6 +1458,8 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* dataset->metadata_.SetWeightAt(start_idx + i, static_cast(inner_data.second)); } else if (inner_data.first == group_idx_) { dataset->metadata_.SetQueryAt(start_idx + i, static_cast(inner_data.second)); + } else if (inner_data.first == position_idx_) { + dataset->metadata_.SetPositionAt(start_idx + i, static_cast(inner_data.second)); } } } diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index f6f07c434661..444e3a7dbcc1 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -39,7 +39,7 @@ void Metadata::Init(const char* data_filename) { Metadata::~Metadata() { } -void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { +void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx, int position_idx) { num_data_ = num_data; label_ = std::vector(num_data_); if (weight_idx >= 0) { @@ -60,17 +60,27 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { queries_ = std::vector(num_data_, 0); query_load_from_file_ = false; } + if (position_idx >= 0) { + if (!positions_.empty()) { + Log::Info("Using position id in data file, ignoring the additional position file"); + positions_.clear(); + } + positions_ = std::vector(num_data_, 0); + num_positions_ = num_data_; + position_load_from_file_ = false; + } } void Metadata::InitByReference(data_size_t num_data, const Metadata* reference) { int has_weights = reference->num_weights_ > 0; int has_init_scores = reference->num_init_score_ > 0; int has_queries = reference->num_queries_ > 0; + int has_positions = reference->num_positions_ > 0; int nclasses = reference->num_init_score_classes(); - Init(num_data, has_weights, has_init_scores, has_queries, nclasses); + Init(num_data, has_weights, has_init_scores, has_queries, has_positions, nclasses); } -void Metadata::Init(data_size_t num_data, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, int32_t nclasses) { +void Metadata::Init(data_size_t num_data, int32_t has_weights, int32_t has_init_scores, int32_t has_queries, int32_t has_positions, int32_t nclasses) { num_data_ = num_data; label_ = std::vector(num_data_); if (has_weights) { @@ -95,6 +105,13 @@ void Metadata::Init(data_size_t num_data, int32_t has_weights, int32_t has_init_ queries_.resize(num_data_, 0); query_load_from_file_ = false; } + if (has_positions) { + if (!positions_.empty()) { + Log::Fatal("Calling Init() on Metadata positions that have already been initialized"); + } + positions_.resize(num_data_, 0); + position_load_from_file_ = false; + } } void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t 
num_used_indices) {
@@ -164,6 +181,17 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
   } else {
     num_queries_ = 0;
   }
+
+  if (!fullset.positions_.empty()) {
+    positions_ = std::vector<data_size_t>(num_used_indices);
+    num_positions_ = num_used_indices;
+#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_used_indices >= 1024)
+    for (data_size_t i = 0; i < num_used_indices; ++i) {
+      positions_[i] = fullset.positions_[used_indices[i]];
+    }
+  } else {
+    num_positions_ = 0;
+  }
 }
 
 void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
@@ -218,13 +246,6 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector(), array.end());
 }
 
-void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
+template <typename It>
+void Metadata::SetPositionsFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (positions == nullptr || len == 0) {
+  // Clear positions on empty input
+  if (last - first == 0) {
     positions_.clear();
     num_positions_ = 0;
     return;
@@ -578,34 +608,27 @@ void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
 #ifdef USE_CUDA
   Log::Fatal("Positions in learning to rank is not supported in CUDA version yet.");
 #endif  // USE_CUDA
-  if (num_data_ != len) {
-    Log::Fatal("Positions size (%i) doesn't match data size (%i)", len, num_data_);
+  if (num_data_ != last - first) {
+    Log::Fatal("Length of positions differs from the length of #data");
   }
   if (positions_.empty()) {
     positions_.resize(num_data_);
-  } else {
-    Log::Warning("Overwriting positions in dataset.");
   }
   num_positions_ = num_data_;
-  position_load_from_file_ = false;
-
-  position_ids_.clear();
-  std::unordered_map<data_size_t, int> map_id2pos;
+
 #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_positions_ >= 1024)
   for (data_size_t i = 0; i < num_positions_; ++i) {
-    if (map_id2pos.count(positions[i]) == 0) {
-      int pos = static_cast<int>(map_id2pos.size());
-      map_id2pos[positions[i]] = pos;
-      position_ids_.push_back(std::to_string(positions[i]));
-    }
+    positions_[i] = first[i];
   }
+  position_load_from_file_ = false;
+}
 
-  Log::Debug("number of unique positions found = %ld", position_ids_.size());
+void Metadata::SetPosition(const data_size_t* position, data_size_t len) {
+  SetPositionsFromIterator(position, position + len);
 }
 
-  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_positions_ >= 1024)
-  for (data_size_t i = 0; i < num_positions_; ++i) {
-    positions_[i] = map_id2pos.at(positions[i]);
-  }
+void Metadata::SetPosition(const ArrowChunkedArray& array) {
+  SetPositionsFromIterator(array.begin<data_size_t>(), array.end<data_size_t>());
 }
 
 void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) {
@@ -625,6 +648,23 @@ void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index
   // CUDA is handled after all insertions are complete
 }
 
+void Metadata::InsertPositions(const data_size_t* positions, data_size_t start_index, data_size_t len) {
+  if (!positions) {
+    Log::Fatal("Passed null positions");
+  }
+  if (positions_.size() <= 0) {
+    Log::Fatal("Inserting position data into dataset with no positions");
+  }
+  if (static_cast<size_t>(start_index + len) > positions_.size()) {
+    Log::Fatal("Inserted position data is too large for dataset");
+  }
+
+  memcpy(positions_.data() + start_index, positions, sizeof(data_size_t) * len);
+
+  position_load_from_file_ = false;
+  // CUDA is handled after all insertions are complete
+}
+
 void
Metadata::LoadWeights() { num_weights_ = 0; std::string weight_filename(data_filename_); @@ -660,15 +700,11 @@ void Metadata::LoadPositions() { Log::Info("Loading positions from %s ...", position_filename.c_str()); num_positions_ = static_cast(reader.Lines().size()); positions_ = std::vector(num_positions_); - position_ids_ = std::vector(); - std::unordered_map map_id2pos; + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_positions_; ++i) { - std::string& line = reader.Lines()[i]; - if (map_id2pos.count(line) == 0) { - map_id2pos[line] = static_cast(position_ids_.size()); - position_ids_.push_back(line); - } - positions_[i] = map_id2pos.at(line); + int tmp_position = 0; + Common::Atoi(reader.Lines()[i].c_str(), &tmp_position); + positions_[i] = static_cast(tmp_position); } position_load_from_file_ = true; } @@ -760,7 +796,8 @@ void Metadata::InsertAt(data_size_t start_index, const float* labels, const float* weights, const double* init_scores, - const int32_t* queries) { + const int32_t* queries, + const int32_t* positions) { if (num_data_ < count + start_index) { Log::Fatal("Length of metadata is too long to append #data"); } @@ -774,6 +811,9 @@ void Metadata::InsertAt(data_size_t start_index, if (queries) { InsertQueries(queries, start_index, count); } + if (positions) { + InsertPositions(positions, start_index, count); + } } void Metadata::FinishLoad() { @@ -796,6 +836,8 @@ void Metadata::LoadFromMemory(const void* memory) { mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_weights_)); num_queries_ = *(reinterpret_cast(mem_ptr)); mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_queries_)); + num_positions_ = *(reinterpret_cast(mem_ptr)); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(num_positions_)); if (!label_.empty()) { label_.clear(); } label_ = std::vector(num_data_); @@ -817,6 +859,13 @@ void Metadata::LoadFromMemory(const void* memory) { (num_queries_ + 1)); query_load_from_file_ = true; } + if (num_positions_ > 0) { + if (!positions_.empty()) { positions_.clear(); } + positions_ = std::vector(num_positions_); + std::memcpy(positions_.data(), mem_ptr, sizeof(data_size_t) * num_positions_); + mem_ptr += VirtualFileWriter::AlignedSize(sizeof(data_size_t) * num_positions_); + position_load_from_file_ = true; + } CalculateQueryWeights(); } @@ -824,6 +873,7 @@ void Metadata::SaveBinaryToFile(BinaryWriter* writer) const { writer->AlignedWrite(&num_data_, sizeof(num_data_)); writer->AlignedWrite(&num_weights_, sizeof(num_weights_)); writer->AlignedWrite(&num_queries_, sizeof(num_queries_)); + writer->AlignedWrite(&num_positions_, sizeof(num_positions_)); writer->AlignedWrite(label_.data(), sizeof(label_t) * num_data_); if (!weights_.empty()) { writer->AlignedWrite(weights_.data(), sizeof(label_t) * num_weights_); @@ -832,6 +882,9 @@ void Metadata::SaveBinaryToFile(BinaryWriter* writer) const { writer->AlignedWrite(query_boundaries_.data(), sizeof(data_size_t) * (num_queries_ + 1)); } + if (!positions_.empty()) { + writer->AlignedWrite(positions_.data(), sizeof(data_size_t) * num_positions_); + } if (num_init_score_ > 0) { Log::Warning("Please note that `init_score` is not saved in binary file.\n" "If you need it, please set it again after loading Dataset."); @@ -841,7 +894,8 @@ void Metadata::SaveBinaryToFile(BinaryWriter* writer) const { size_t Metadata::SizesInByte() const { size_t size = VirtualFileWriter::AlignedSize(sizeof(num_data_)) + VirtualFileWriter::AlignedSize(sizeof(num_weights_)) + - 
VirtualFileWriter::AlignedSize(sizeof(num_queries_)); + VirtualFileWriter::AlignedSize(sizeof(num_queries_)) + + VirtualFileWriter::AlignedSize(sizeof(num_positions_)); size += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_data_); if (!weights_.empty()) { size += VirtualFileWriter::AlignedSize(sizeof(label_t) * num_weights_); @@ -850,6 +904,9 @@ size_t Metadata::SizesInByte() const { size += VirtualFileWriter::AlignedSize(sizeof(data_size_t) * (num_queries_ + 1)); } + if (!positions_.empty()) { + size += VirtualFileWriter::AlignedSize(sizeof(data_size_t) * num_positions_); + } return size; } diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 8227c7b65658..7411bea48be9 100644 --- a/src/objective/rank_objective.hpp +++ b/src/objective/rank_objective.hpp @@ -43,9 +43,9 @@ class RankingObjective : public ObjectiveFunction { // get positions positions_ = metadata.positions(); // get position ids - position_ids_ = metadata.position_ids(); // get number of different position ids num_position_ids_ = static_cast(metadata.num_position_ids()); + Log::Info("Number of different position ids: %d", num_position_ids_); // get boundaries query_boundaries_ = metadata.query_boundaries(); if (query_boundaries_ == nullptr) { @@ -118,9 +118,7 @@ class RankingObjective : public ObjectiveFunction { const label_t* weights_; /*! \brief Pointer of positions */ const data_size_t* positions_; - /*! \brief Pointer of position IDs */ - const std::string* position_ids_; - /*! \brief Pointer of label */ + /*! \brief number of position ids */ data_size_t num_position_ids_; /*! \brief Query boundaries */ const data_size_t* query_boundaries_; @@ -343,7 +341,7 @@ class LambdarankNDCG : public RankingObjective { Log::Debug(message_stream.str().c_str()); message_stream.str(""); for (int i = 0; i < num_position_ids_; ++i) { - message_stream << std::setw(15) << position_ids_[i] + message_stream << std::setw(15) << i << std::setw(15) << pos_biases_[i]; Log::Debug(message_stream.str().c_str()); message_stream.str(""); diff --git a/tests/cpp_tests/test_stream.cpp b/tests/cpp_tests/test_stream.cpp index a656af1e2fe9..4b54276868b5 100644 --- a/tests/cpp_tests/test_stream.cpp +++ b/tests/cpp_tests/test_stream.cpp @@ -26,7 +26,8 @@ void test_stream_dense( const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups, + const std::vector* positions) { Log::Info("Streaming %d rows dense data with a batch size of %d", nrows, batch_count); DatasetHandle dataset_handle = nullptr; Dataset* dataset = nullptr; @@ -34,6 +35,7 @@ void test_stream_dense( int has_weights = weights != nullptr; int has_init_scores = init_scores != nullptr; int has_queries = groups != nullptr; + int has_positions = positions != nullptr; bool succeeded = true; std::string exceptionText(""); @@ -79,7 +81,7 @@ void test_stream_dense( &dataset_handle); EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSampledColumn result code: " << result; - result = LGBM_DatasetInitStreaming(dataset_handle, has_weights, has_init_scores, has_queries, nclasses, 1, -1); + result = LGBM_DatasetInitStreaming(dataset_handle, has_weights, has_init_scores, has_queries, has_positions, nclasses, 1, -1); EXPECT_EQ(0, result) << "LGBM_DatasetInitStreaming result code: " << result; break; } @@ -104,7 +106,8 @@ void test_stream_dense( labels, weights, init_scores, - groups); + groups, + positions); dataset->FinishLoad(); @@ -112,7 +115,8 @@ void test_stream_dense( labels, 
weights, init_scores, - groups); + groups, + positions); } catch (std::exception& ex) { succeeded = false; @@ -142,7 +146,8 @@ void test_stream_sparse( const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups, + const std::vector* positions) { Log::Info("Streaming %d rows sparse data with a batch size of %d", nrows, batch_count); DatasetHandle dataset_handle = nullptr; Dataset* dataset = nullptr; @@ -150,6 +155,7 @@ void test_stream_sparse( int has_weights = weights != nullptr; int has_init_scores = init_scores != nullptr; int has_queries = groups != nullptr; + int has_positions = positions != nullptr; bool succeeded = true; std::string exceptionText(""); @@ -197,7 +203,7 @@ void test_stream_sparse( EXPECT_EQ(0, result) << "LGBM_DatasetCreateFromSampledColumn result code: " << result; dataset = static_cast(dataset_handle); - dataset->InitStreaming(nrows, has_weights, has_init_scores, has_queries, nclasses, 2, -1); + dataset->InitStreaming(nrows, has_weights, has_init_scores, has_queries, has_positions, nclasses, 2, -1); break; } @@ -222,7 +228,8 @@ void test_stream_sparse( labels, weights, init_scores, - groups); + groups, + positions); dataset->FinishLoad(); @@ -230,7 +237,8 @@ void test_stream_sparse( labels, weights, init_scores, - groups); + groups, + positions); } catch (std::exception& ex) { succeeded = false; @@ -266,10 +274,14 @@ TEST(Stream, PushDenseRowsWithMetadata) { unused_init_scores.resize(noriginalrows * nclasses); std::vector unused_groups; unused_groups.assign(noriginalrows, 1); + std::vector unused_positions; + unused_positions.assign(noriginalrows, 1); result = LGBM_DatasetSetField(ref_dataset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result; result = LGBM_DatasetSetField(ref_dataset_handle, "group", unused_groups.data(), noriginalrows, 2); EXPECT_EQ(0, result) << "LGBM_DatasetSetField group result code: " << result; + result = LGBM_DatasetSetField(ref_dataset_handle, "position", unused_positions.data(), noriginalrows, 2); + EXPECT_EQ(0, result) << "LGBM_DatasetSetField position result code: " << result; // Now use the reference dataset schema to make some testable Datasets with N rows each int32_t nrows = 1000; @@ -279,9 +291,10 @@ TEST(Stream, PushDenseRowsWithMetadata) { std::vector weights; std::vector init_scores; std::vector groups; + std::vector positions; Log::Info("Creating random data"); - TestUtils::CreateRandomDenseData(nrows, ncols, nclasses, &features, &labels, &weights, &init_scores, &groups); + TestUtils::CreateRandomDenseData(nrows, ncols, nclasses, &features, &labels, &weights, &init_scores, &groups, &positions); const std::vector batch_counts = { 1, nrows / 100, nrows / 10, nrows }; const std::vector creation_types = { 0, 1 }; @@ -290,7 +303,7 @@ TEST(Stream, PushDenseRowsWithMetadata) { for (size_t j = 0; j < batch_counts.size(); ++j) { auto type = creation_types[i]; auto batch_count = batch_counts[j]; - test_stream_dense(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &features, &labels, &weights, &init_scores, &groups); + test_stream_dense(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &features, &labels, &weights, &init_scores, &groups, &positions); } } @@ -317,10 +330,14 @@ TEST(Stream, PushSparseRowsWithMetadata) { unused_init_scores.resize(noriginalrows * nclasses); std::vector unused_groups; 
unused_groups.assign(noriginalrows, 1); + std::vector unused_positions; + unused_positions.assign(noriginalrows, 1); result = LGBM_DatasetSetField(ref_dataset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result; result = LGBM_DatasetSetField(ref_dataset_handle, "group", unused_groups.data(), noriginalrows, 2); EXPECT_EQ(0, result) << "LGBM_DatasetSetField group result code: " << result; + result = LGBM_DatasetSetField(ref_dataset_handle, "position", unused_positions.data(), noriginalrows, 2); + EXPECT_EQ(0, result) << "LGBM_DatasetSetField position result code: " << result; // Now use the reference dataset schema to make some testable Datasets with N rows each int32_t nrows = 1000; @@ -332,10 +349,11 @@ TEST(Stream, PushSparseRowsWithMetadata) { std::vector weights; std::vector init_scores; std::vector groups; + std::vector positions; Log::Info("Creating random data"); float sparse_percent = .1f; - TestUtils::CreateRandomSparseData(nrows, ncols, nclasses, sparse_percent, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups); + TestUtils::CreateRandomSparseData(nrows, ncols, nclasses, sparse_percent, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups, &positions); const std::vector batch_counts = { 1, nrows / 100, nrows / 10, nrows }; const std::vector creation_types = { 0, 1 }; @@ -344,7 +362,7 @@ TEST(Stream, PushSparseRowsWithMetadata) { for (size_t j = 0; j < batch_counts.size(); ++j) { auto type = creation_types[i]; auto batch_count = batch_counts[j]; - test_stream_sparse(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups); + test_stream_sparse(type, ref_dataset_handle, nrows, ncols, nclasses, batch_count, &indptr, &indices, &vals, &labels, &weights, &init_scores, &groups, &positions); } } diff --git a/tests/cpp_tests/testutils.cpp b/tests/cpp_tests/testutils.cpp index b2dde31b3755..b6dc84541fcd 100644 --- a/tests/cpp_tests/testutils.cpp +++ b/tests/cpp_tests/testutils.cpp @@ -42,7 +42,8 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups, + std::vector* positions) { Random rand(42); features->reserve(nrows * ncols); @@ -52,7 +53,7 @@ namespace LightGBM { } } - CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups); + CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups, positions); } /*! @@ -69,7 +70,8 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups, + std::vector* positions) { Random rand(42); indptr->reserve(static_cast(nrows + 1)); indices->reserve(static_cast(sparse_percent * nrows * ncols)); @@ -87,7 +89,7 @@ namespace LightGBM { indptr->push_back(static_cast(indices->size() - 1)); } - CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups); + CreateRandomMetadata(nrows, nclasses, labels, weights, init_scores, groups, positions); } /*! 
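
The next hunk extends the random-metadata generator: query ids advance with roughly 5% probability per row, and the within-query position counter restarts whenever a new query begins. A minimal Python sketch of the same generation scheme (hypothetical helper names, standard library only):

    import random

    def make_random_groups_and_positions(nrows, seed=42):
        # Mirrors CreateRandomMetadata: a new query starts with ~5% probability,
        # and positions count 0, 1, 2, ... within each query.
        rng = random.Random(seed)
        groups, positions = [], []
        group, position = 0, 0
        for _ in range(nrows):
            if rng.random() > 0.95:
                group += 1
                position = 0  # restart positions for the new query
            groups.append(group)
            positions.append(position)
            position += 1
        return groups, positions

    groups, positions = make_random_groups_and_positions(1000)
    assert len(groups) == len(positions) == 1000
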
@@ -98,7 +100,8 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups, + std::vector* positions) { Random rand(42); labels->reserve(nrows); if (weights) { @@ -110,8 +113,12 @@ namespace LightGBM { if (groups) { groups->reserve(nrows); } + if (positions) { + positions->reserve(nrows); + } int32_t group = 0; + int32_t position = 0; for (int32_t row = 0; row < nrows; row++) { labels->push_back(rand.NextFloat()); @@ -126,9 +133,14 @@ namespace LightGBM { if (groups) { if (rand.NextFloat() > 0.95) { group++; + position = 0; } groups->push_back(group); } + if (positions) { + positions->push_back(position); + position++; + } } } @@ -141,7 +153,8 @@ namespace LightGBM { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups, + const std::vector* positions) { int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result; @@ -170,6 +183,11 @@ namespace LightGBM { groups_ptr = groups->data(); } + const int32_t* positions_ptr = nullptr; + if (positions) { + positions_ptr = positions->data(); + } + auto start_time = std::chrono::steady_clock::now(); for (int32_t i = 0; i < nrows; i += batch_count) { @@ -187,6 +205,7 @@ namespace LightGBM { weights_ptr, init_scores_ptr, groups_ptr, + positions_ptr, 0); EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsWithMetadata result code: " << result; if (result != 0) { @@ -201,6 +220,9 @@ namespace LightGBM { if (groups_ptr) { groups_ptr += batch_count; } + if (positions_ptr) { + positions_ptr += batch_count; + } } auto cur_time = std::chrono::steady_clock::now(); @@ -217,7 +239,8 @@ namespace LightGBM { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups, + const std::vector* positions) { int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result; @@ -240,6 +263,11 @@ namespace LightGBM { groups_ptr = groups->data(); } + const int32_t* positions_ptr = nullptr; + if (positions) { + positions_ptr = positions->data(); + } + auto start_time = std::chrono::steady_clock::now(); // Use multiple threads to test concurrency @@ -263,6 +291,7 @@ namespace LightGBM { weights_ptr, init_scores, groups_ptr, + positions_ptr, thread_count, t); threads.push_back(std::move(th)); @@ -291,6 +320,7 @@ namespace LightGBM { const float* weights_ptr, const std::vector* init_scores, const int32_t* groups_ptr, + const int32_t* positions_ptr, int32_t thread_count, int32_t thread_id) { int32_t threadChunkSize = nrows / thread_count; @@ -305,6 +335,9 @@ namespace LightGBM { if (groups_ptr) { groups_ptr += threadChunkSize * thread_id; } + if (positions_ptr) { + positions_ptr += threadChunkSize * thread_id; + } for (int32_t i = startIndex; i < stopIndex; i += batch_count) { // Since init_scores are in a column format, but need to be pushed as rows, we have to extract each batch @@ -330,6 +363,7 @@ namespace LightGBM { weights_ptr, init_scores_ptr, groups_ptr, + positions_ptr, thread_id); EXPECT_EQ(0, result) << "LGBM_DatasetPushRowsByCSRWithMetadata result code: " << result; if (result != 0) { @@ -344,6 +378,9 @@ namespace LightGBM { if (groups_ptr) { groups_ptr += batch_count; } + if (positions_ptr) { + positions_ptr += batch_count; + } } } @@ -352,7 
+389,8 @@ namespace LightGBM { const std::vector* ref_labels, const std::vector* ref_weights, const std::vector* ref_init_scores, - const std::vector* ref_groups) { + const std::vector* ref_groups, + const std::vector* ref_positions) { const float* labels = metadata->label(); auto nTotal = static_cast(ref_labels->size()); for (auto i = 0; i < nTotal; i++) { @@ -418,6 +456,21 @@ namespace LightGBM { } else if (ref_groups) { FAIL() << "Expected non-null query_boundaries"; } + + const int32_t* positions = metadata->positions(); + if (positions) { + if (!ref_positions) { + FAIL() << "Expected null positions"; + } + for (auto i = 0; i < nTotal; i++) { + EXPECT_EQ(ref_positions->at(i), positions[i]) << "Inserted data: " << ref_positions->at(i); + if (ref_positions->at(i) != positions[i]) { + FAIL() << "Mismatched positions"; // This forces an immediate failure, which EXPECT_EQ does not + } + } + } else if (ref_positions) { + FAIL() << "Expected non-null positions"; + } } const double* TestUtils::CreateInitScoreBatch(std::vector* init_score_batch, diff --git a/tests/cpp_tests/testutils.h b/tests/cpp_tests/testutils.h index 158b38e0e24b..1f121dda8ed4 100644 --- a/tests/cpp_tests/testutils.h +++ b/tests/cpp_tests/testutils.h @@ -32,7 +32,8 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups, + std::vector* positions); /*! * Creates a CSR sparse Dataset of random values. @@ -47,7 +48,8 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups, + std::vector* positions); /*! * Creates a batch of Metadata of random values. @@ -57,7 +59,8 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups, + std::vector* positions); /*! * Pushes nrows of data to a Dataset in batches of batch_count. @@ -71,7 +74,8 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups, + const std::vector* positions); /*! * Pushes nrows of data to a Dataset in batches of batch_count. @@ -86,7 +90,8 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups, + const std::vector* positions); /*! * Validates metadata against reference vectors. 
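
The checks above assert that streamed positions round-trip byte-for-byte through the Metadata. The same invariant can be exercised from the public Python API as a cross-check; a small sketch (arbitrary data, assuming a LightGBM build that accepts the ``position`` argument of ``Dataset``):

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(42)
    X = rng.random((100, 5))
    y = rng.random(100)
    group = [50, 50]  # two queries of 50 documents each
    # positions restart at 0 within each query, as in the C++ test generators
    positions = np.tile(np.arange(50, dtype=np.int32), 2)

    ds = lgb.Dataset(X, label=y, group=group, position=positions).construct()
    np.testing.assert_array_equal(ds.get_position(), positions)
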
@@ -95,7 +100,8 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups, + const std::vector* positions); static const double* CreateInitScoreBatch(std::vector* init_score_batch, int32_t index, @@ -117,6 +123,7 @@ class TestUtils { const float* weights_ptr, const std::vector* init_scores, const int32_t* groups_ptr, + const int32_t* positions_ptr, int32_t thread_count, int32_t thread_id); }; diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 47d1f5e3349d..fac53b99bf2b 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1654,6 +1654,7 @@ def test_all_expected_params_are_written_out_to_model_text(tmp_path): "[label_column: ]", "[weight_column: ]", "[group_column: ]", + "[position_column: ]", "[ignore_column: ]", "[categorical_feature: ]", "[forcedbins_filename: ]", @@ -3432,6 +3433,7 @@ def test_dataset_update_params(rng): "label_column": 0, "weight_column": 0, "group_column": 0, + "position_column": 0, "ignore_column": 0, "min_data_in_leaf": 10, "linear_tree": False, @@ -3456,6 +3458,7 @@ def test_dataset_update_params(rng): "label_column": 1, "weight_column": 1, "group_column": 1, + "position_column": 1, "ignore_column": 1, "forcedbins_filename": "/some/path/forcedbins.json", "min_data_in_leaf": 2, From c020801f6ff8542325e50b98bd4d47de1dbf6679 Mon Sep 17 00:00:00 2001 From: nprokoptsev Date: Sat, 15 Feb 2025 13:48:39 -0800 Subject: [PATCH 2/9] fix --- docs/Parameters.rst | 6 +++--- src/io/config_auto.cpp | 3 ++- src/io/dataset_loader.cpp | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index a355204afe05..24843e89ca79 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -925,11 +925,11 @@ Dataset Parameters - **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` - - ``position_column`` :raw-html:`🔗︎`, default = ``""``, type = int or string, aliases: ``position``, ``position_id`` +- ``position_column`` :raw-html:`🔗︎`, default = ``""``, type = int or string, aliases: ``position``, ``position_id``, ``position_column`` - - used to specify the position id column + - used to specify the position/position id column - - use number for index, e.g. ``position=0`` means column\_0 is the position + - use number for index, e.g. ``position=0`` means column\_0 is the position id - add a prefix ``name:`` for column name, e.g. 
``position=name:position_id`` diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 5631266c81bb..61d409f97a67 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -135,6 +135,7 @@ const std::unordered_map& Config::alias_table() { {"query_id", "group_column"}, {"position", "position_column"}, {"position_id", "position_column"}, + {"position_column", "position_column"}, {"ignore_feature", "ignore_column"}, {"blacklist", "ignore_column"}, {"cat_feature", "categorical_feature"}, @@ -889,7 +890,7 @@ const std::unordered_map>& Config::paramet {"label_column", {"label"}}, {"weight_column", {"weight"}}, {"group_column", {"group", "group_id", "query_column", "query", "query_id"}}, - {"position_column", {"position", "position_id"}}, + {"position_column", {"position", "position_id", "position_column"}}, {"ignore_column", {"ignore_feature", "blacklist"}}, {"categorical_feature", {"cat_feature", "categorical_column", "cat_column", "categorical_features"}}, {"forcedbins_filename", {}}, diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index eb9299e94f95..3f33fe0c7ab6 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -1385,6 +1385,8 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat dataset->metadata_.SetWeightAt(i, static_cast(inner_data.second)); } else if (inner_data.first == group_idx_) { dataset->metadata_.SetQueryAt(i, static_cast(inner_data.second)); + } else if (inner_data.first == position_idx_) { + dataset->metadata_.SetPositionAt(i, static_cast(inner_data.second)); } } } From a90a3d41f8f400773aa3c2cd30f240140143d514 Mon Sep 17 00:00:00 2001 From: nprokoptsev Date: Sat, 15 Feb 2025 15:59:17 -0800 Subject: [PATCH 3/9] fix --- python-package/lightgbm/basic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 898b1ff874a7..b51faeefde14 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2043,8 +2043,7 @@ def get_params(self) -> Dict[str, Any]: "use_missing", "weight_column", "position_column", - "zero_as_missing", - "position_column" + "zero_as_missing" ) return {k: v for k, v in self.params.items() if k in dataset_params} else: From 46992dda504fd3b7dbe5b305bf90f5a6514afa9f Mon Sep 17 00:00:00 2001 From: nprokoptsev Date: Sat, 15 Feb 2025 21:44:10 -0800 Subject: [PATCH 4/9] fixes --- include/LightGBM/dataset.h | 11 +++++++++-- src/objective/rank_objective.hpp | 1 - 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 8ef36a573a48..fa2f570baacc 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -252,8 +252,15 @@ class Metadata { * \return number of different position IDs */ inline size_t num_position_ids() const { - size_t max = *std::max_element(positions_.begin(), positions_.end()); - return max + 1; + if (!positions_.empty()) { + size_t max = *std::max_element(positions_.begin(), positions_.end()); + return max + 1; + } + else + { + return 0; + } + } /*! 
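
With the ``position_ids_`` mapping table removed, ``num_position_ids()`` now returns ``max(position) + 1`` rather than the number of distinct values, so positions are expected to be small non-negative integers that index directly into the per-position bias table. A worked example of the new semantics (plain Python, illustrative only):

    positions = [0, 2, 2, 5]
    num_position_ids = (max(positions) + 1) if positions else 0
    assert num_position_ids == 6  # ids 1, 3 and 4 are unused but still counted

Gaps in the id range therefore cost a few unused entries in ``pos_biases_`` but should be otherwise harmless.
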
diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp
index 7411bea48be9..18f4423f6b3d 100644
--- a/src/objective/rank_objective.hpp
+++ b/src/objective/rank_objective.hpp
@@ -45,7 +45,6 @@ class RankingObjective : public ObjectiveFunction {
     // get position ids
     // get number of different position ids
     num_position_ids_ = static_cast<data_size_t>(metadata.num_position_ids());
-    Log::Info("Number of different position ids: %d", num_position_ids_);
     // get boundaries
     query_boundaries_ = metadata.query_boundaries();
     if (query_boundaries_ == nullptr) {

From 556c6512553e55f682e44e00dfb207df1c8d4aee Mon Sep 17 00:00:00 2001
From: nprokoptsev
Date: Sun, 16 Feb 2025 13:40:57 -0800
Subject: [PATCH 5/9] fix some lint errors

---
 include/LightGBM/c_api.h   | 2 +-
 include/LightGBM/dataset.h | 5 +----
 src/io/metadata.cpp        | 1 -
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h
index 5d2455cf5d94..b5cfe53e552c 100644
--- a/include/LightGBM/c_api.h
+++ b/include/LightGBM/c_api.h
@@ -249,7 +249,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset,
                                                        const float* weight,
                                                        const double* init_score,
                                                        const int32_t* query,
-                                                       const int32_t* position, 
+                                                       const int32_t* position,
                                                        int32_t tid);
 
 /*!
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index fa2f570baacc..4577ef21d424 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -255,12 +255,9 @@ class Metadata {
     if (!positions_.empty()) {
       size_t max = *std::max_element(positions_.begin(), positions_.end());
       return max + 1;
-    }
-    else
-    {
+    } else {
       return 0;
     }
-
   }
 
   /*!
diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp
index 444e3a7dbcc1..a05ff3a1d084 100644
--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@@ -266,7 +266,6 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {

From: nprokoptsev
Date: Sun, 16 Feb 2025 13:58:32 -0800
Subject: [PATCH 6/9] fix ruff errors

---
 python-package/lightgbm/basic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index b51faeefde14..62afe435fc44 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -2043,7 +2043,7 @@ def get_params(self) -> Dict[str, Any]:
                 "use_missing",
                 "weight_column",
                 "position_column",
-                "zero_as_missing"
+                "zero_as_missing",
             )
             return {k: v for k, v in self.params.items() if k in dataset_params}
         else:

From 0b66061135d3c7564ea8b1763ef87f3cf2356444 Mon Sep 17 00:00:00 2001
From: nprokoptsev
Date: Wed, 26 Feb 2025 18:43:40 -0800
Subject: [PATCH 7/9] add tests

---
 tests/python_package_test/test_engine.py | 78 ++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index fac53b99bf2b..c517f64af780 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -847,6 +847,84 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
     positions_from_get = lgb_train.get_position()
     np.testing.assert_array_equal(positions_from_get, positions)
 
+# Appends queries and positions to the dataset file
+def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
+    queries = []
+    query_id = 0
+    with open(file_query_in, "r") as f:
+        for line in f:
+            query_count = int(line.strip())
+            queries.extend([query_id] * query_count)
+            query_id += 1
+    with open(file_dataset_in, "r") as f_in:
+        with open(out_path, "w") as f_out:
+            if positions is not None:
+                for line, query, position in zip(f_in, queries, positions):
+                    f_out.write(f"{line.strip()} 301:{query} 302:{position}\n")
+            else:
+                for line, query in zip(f_in, queries):
+                    # adding dummy position
+                    f_out.write(f"{line.strip()} 301:{query} 302:1\n")
+
+
+@pytest.mark.skipif(
+    getenv("TASK", "") == "cuda", reason="Positions in learning to rank is not supported in CUDA version yet"
+)
+def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
+    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
+
+    # simulate position bias for the train dataset and put the train dataset with biased labels to temp directory
+    positions = simulate_position_bias(
+        str(rank_example_dir / "rank.train"),
+        str(rank_example_dir / "rank.train.query"),
+        str(tmp_path / "rank.intermediate"),
+        baseline_feature=34,
+    )
+
+    # append queries and positions to the dataset file. They will have 301 and 302 feature indexes
+    append_queries_and_positions_to_file(str(tmp_path / "rank.intermediate"), str(rank_example_dir / "rank.train.query"), positions, str(tmp_path / "rank.train"))
+    append_queries_and_positions_to_file(str(rank_example_dir / "rank.test"), str(rank_example_dir / "rank.test.query"), None, str(tmp_path / "rank.test"))
+
+    # Training with single file
+    params = {
+        "objective": "lambdarank",
+        "verbose": -1,
+        "eval_at": [3],
+        "metric": "ndcg",
+        "bagging_freq": 1,
+        "bagging_fraction": 0.9,
+        "min_data_in_leaf": 50,
+        "min_sum_hessian_in_leaf": 5.0,
+        "group_column": 301,
+        "position_column": 302,
+        "label_column": 0
+    }
+
+    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params)
+    lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"), params=params)]
+    gbm_unbiased_with_single_file = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
+
+    # Training with query files and list of positions
+    params = {
+        "objective": "lambdarank",
+        "verbose": -1,
+        "eval_at": [3],
+        "metric": "ndcg",
+        "bagging_freq": 1,
+        "bagging_fraction": 0.9,
+        "min_data_in_leaf": 50,
+        "min_sum_hessian_in_leaf": 5.0,
+        # ignore position and group column
+        "ignore_column": "301,302"
+    }
+    copyfile(str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query"))
+    copyfile(str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query"))
+    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params, position=positions)
+    lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"))]
+    gbm_unbiased_with_multiple_files = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
+    # the performance of the unbiased LambdaMART when using query files and list of positions should match the performance of the unbiased LambdaMART when using single file with group and position columns
+    assert gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"] == gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
+
 
 def test_early_stopping():
     X, y = load_breast_cancer(return_X_y=True)

From 9c38551f18d6fb8a29f4848e8f07a4a79680064e Mon Sep 17 00:00:00 2001
From: nprokoptsev
Date: Wed, 26 Feb 2025 18:55:47 -0800
Subject: [PATCH 8/9] ruff format new test

---
 tests/python_package_test/test_engine.py | 51 ++++++++++++++++++------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index c517f64af780..9c8c27bdd793 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -847,8 +847,11 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
     positions_from_get = lgb_train.get_position()
     np.testing.assert_array_equal(positions_from_get, positions)
 
+
 # Appends queries and positions to the dataset file
-def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
+def append_queries_and_positions_to_file(
+    file_dataset_in, file_query_in, positions, out_path
+):
     queries = []
     query_id = 0
     with open(file_query_in, "r") as f:
@@ -868,7 +871,8 @@ def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
 
 
 @pytest.mark.skipif(
-    getenv("TASK", "") == "cuda", reason="Positions in learning to rank is not supported in CUDA version yet"
+    getenv("TASK", "") == "cuda",
+    reason="Positions in learning to rank is not supported in CUDA version yet",
 )
 def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
     rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
@@ -882,8 +886,18 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
     )
 
     # append queries and positions to the dataset file. They will have 301 and 302 feature indexes
-    append_queries_and_positions_to_file(str(tmp_path / "rank.intermediate"), str(rank_example_dir / "rank.train.query"), positions, str(tmp_path / "rank.train"))
-    append_queries_and_positions_to_file(str(rank_example_dir / "rank.test"), str(rank_example_dir / "rank.test.query"), None, str(tmp_path / "rank.test"))
+    append_queries_and_positions_to_file(
+        str(tmp_path / "rank.intermediate"),
+        str(rank_example_dir / "rank.train.query"),
+        positions,
+        str(tmp_path / "rank.train"),
+    )
+    append_queries_and_positions_to_file(
+        str(rank_example_dir / "rank.test"),
+        str(rank_example_dir / "rank.test.query"),
+        None,
+        str(tmp_path / "rank.test"),
+    )
 
     # Training with single file
     params = {
@@ -897,12 +911,14 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
         "min_sum_hessian_in_leaf": 5.0,
         "group_column": 301,
         "position_column": 302,
-        "label_column": 0
+        "label_column": 0,
     }
 
     lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params)
     lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"), params=params)]
-    gbm_unbiased_with_single_file = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
+    gbm_unbiased_with_single_file = lgb.train(
+        params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
+    )
 
     # Training with query files and list of positions
     params = {
@@ -915,15 +931,26 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
         "min_data_in_leaf": 50,
         "min_sum_hessian_in_leaf": 5.0,
         # ignore position and group column
-        "ignore_column": "301,302"
+        "ignore_column": "301,302",
     }
-    copyfile(str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query"))
-    copyfile(str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query"))
-    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params, position=positions)
+    copyfile(
+        str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query")
+    )
+    copyfile(
+        str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query")
+    )
+    lgb_train = lgb.Dataset(
+        str(tmp_path / "rank.train"), params=params, position=positions
+    )
     lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"))]
-    gbm_unbiased_with_multiple_files = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
+    gbm_unbiased_with_multiple_files = lgb.train(
+        params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
+    )
     # the performance of the unbiased LambdaMART when using query files and list of positions should match the performance of the unbiased LambdaMART when using single file with group and position columns
-    assert gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"] == gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
+    assert (
+        gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"]
+        == gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
+    )
 
 
 def test_early_stopping():

From 93ba7e18caea0efd3bf370cfdee6ac743c7f75eb Mon Sep 17 00:00:00 2001
From: nprokoptsev
Date: Wed, 26 Feb 2025 19:07:04 -0800
Subject: [PATCH 9/9] ruff format test

---
 tests/python_package_test/test_engine.py | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 9c8c27bdd793..2e21d0221f4a 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -849,9 +849,7 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
 
 
 # Appends queries and positions to the dataset file
-def append_queries_and_positions_to_file(
-    file_dataset_in, file_query_in, positions, out_path
-):
+def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
     queries = []
     query_id = 0
     with open(file_query_in, "r") as f:
@@ -916,9 +914,7 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
 
     lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params)
     lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"), params=params)]
-    gbm_unbiased_with_single_file = lgb.train(
-        params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
-    )
+    gbm_unbiased_with_single_file = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
 
     # Training with query files and list of positions
     params = {
@@ -933,19 +929,11 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
         # ignore position and group column
         "ignore_column": "301,302",
     }
-    copyfile(
-        str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query")
-    )
-    copyfile(
-        str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query")
-    )
-    lgb_train = lgb.Dataset(
-        str(tmp_path / "rank.train"), params=params, position=positions
-    )
+    copyfile(str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query"))
+    copyfile(str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query"))
+    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params, position=positions)
     lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"))]
-    gbm_unbiased_with_multiple_files = lgb.train(
-        params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
-    )
+    gbm_unbiased_with_multiple_files = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
     # the performance of the unbiased LambdaMART when using query files and list of positions should match the performance of the unbiased LambdaMART when using single file with group and position columns
     assert (
         gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"]
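Taken together, the series lets the group/query ids and presentation positions for unbiased LambdaMART live inside the training text file itself, instead of a separate .query file plus a position= array. A minimal usage sketch (file names are illustrative; the column indices 301/302 mirror the test above, and per the docs note position_column works only when loading data directly from a text file):

    import lightgbm as lgb

    params = {
        "objective": "lambdarank",
        "label_column": 0,       # label stored in column 0
        "group_column": 301,     # query ids stored under feature index 301
        "position_column": 302,  # positions stored under feature index 302
    }

    # single-file training: queries and positions are read from the file itself
    train_set = lgb.Dataset("rank.train", params=params)
    booster = lgb.train(params, train_set, num_boost_round=50)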