diff --git a/RegressionTree.py b/RegressionTree.py index cd84f7a..c3bed9c 100644 --- a/RegressionTree.py +++ b/RegressionTree.py @@ -2,212 +2,224 @@ # input is a dataframe of features # the corresponding y value(called labels here) is the scores for each document -import pandas as pd -import numpy as np -from multiprocessing import Pool from itertools import repeat +from multiprocessing import Pool + +import numpy as np +import pandas as pd import scipy import scipy.optimize node_id = 0 + def get_splitting_points(args): - # given a list - # return a list of possible splitting values - attribute, col = args - attribute.sort() - possible_split = [] - for i in range(len(attribute)-1): - if attribute[i] != attribute[i+1]: - possible_split.append(np.mean((attribute[i],attribute[i+1]))) - return possible_split, col + # given a list + # return a list of possible splitting values + attribute, col = args + attribute.sort() + possible_split = [] + for i in range(len(attribute) - 1): + if attribute[i] != attribute[i + 1]: + possible_split.append(np.mean((attribute[i], attribute[i + 1]))) + return possible_split, col + # create a dictionary, key is the attribute number, value is whole list of possible splits for that column def find_best_split_parallel(args): - best_ls = 1000000 - best_split = None - best_children = None - split_point, data, label = args - key,possible_split = split_point - - for split in possible_split: - children = split_children(data, label, key, split) - - #weighted average of left and right ls - ls = len(children[1])*least_square(children[1])/len(label) + len(children[3])*least_square(children[3])/len(label) - if ls < best_ls: - best_ls = ls - best_split = (key, split) - best_children = children - return best_ls, best_split, best_children - -def find_best_split(data, label, split_points): - # split_points is a dictionary of possible splitting values - # return the best split - best_ls = 1000000 - best_split = None - best_children = None - pool = Pool() - 
for ls, split, children in pool.map(find_best_split_parallel, zip(split_points.items(), repeat(data), repeat(label))): - if ls < best_ls: - best_ls = ls - best_split = split - best_children = children - pool.close() - - - - - return best_split, best_children # return a tuple(attribute, value) + best_ls = 1000000 + best_split = None + best_children = None + split_point, data, label = args + key, possible_split = split_point + + for split in possible_split: + children = split_children(data, label, key, split) + + # weighted average of left and right ls + ls = len(children[1]) * least_square(children[1]) / len(label) + len(children[3]) * least_square( + children[3]) / len(label) + if ls < best_ls: + best_ls = ls + best_split = (key, split) + best_children = children + return best_ls, best_split, best_children + + +def find_best_split(data, label, split_points): + # split_points is a dictionary of possible splitting values + # return the best split + best_ls = 1000000 + best_split = None + best_children = None + pool = Pool() + for ls, split, children in pool.map(find_best_split_parallel, + zip(split_points.items(), repeat(data), repeat(label))): + if ls < best_ls: + best_ls = ls + best_split = split + best_children = children + pool.close() + + return best_split, best_children # return a tuple(attribute, value) + def split_children(data, label, key, split): - left_index = [index for index in xrange(len(data.iloc[:,key])) if data.iloc[index,key] < split] - right_index = [index for index in xrange(len(data.iloc[:,key])) if data.iloc[index,key] >= split] - left_data = data.iloc[left_index,:] - right_data = data.iloc[right_index,:] - left_label = [label[i] for i in left_index] - right_label =[label[i] for i in right_index] - - return left_data, left_label, right_data, right_label - -def least_square(label): - if not len(label): - return 0 - return (np.sum(label)**2)/len(set(label)) - - -def create_leaf(label): - global node_id - node_id += 1 - leaf = {'splittng_feature': 
None, - 'left': None, - 'right':None, - 'is_leaf':True, - 'index':node_id} - leaf['value'] = round(np.mean(label),3) - return leaf - -def find_splits_parallel(args): - var_space, label, col = args - # var_space = data.iloc[:,col].tolist() - return scipy.optimize.fminbound(error_function, min(var_space), max(var_space), args = (col, var_space, label), full_output = 1) - # return, - # if not min_error or error < min_error: - # min_error = error - # split_var = col - # min_split = split - -def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth = 0): - remaining_features = all_pos_split - #stopping conditions - if sum([len(v)!= 0 for v in remaining_features.values()]) == 0: - # If there are no remaining features to consider, make current node a leaf node - return create_leaf(label) - # #Additional stopping condition (limit tree depth) - elif current_depth > max_depth: - return create_leaf(label) - - - ####### - min_error = None - split_var = None - min_split = None - - var_spaces = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])] - cols = [col for col in xrange(data.shape[1])] - pool = Pool() - for split, error, ierr, numf in pool.map(find_splits_parallel, zip(var_spaces, repeat(label), cols)): - if not min_error or error < min_error: - min_error = error - split_var = col - min_split = split - pool.close() - - splitting_feature = (split_var, min_split) - children = split_children(data, label, split_var, min_split) - - left_data, left_label, right_data, right_label = children - if len(left_label) == 0 or len(right_label) == 0: - return create_leaf(label) - - left_least_square = least_square(left_label) - - # Create a leaf node if the split is "perfect" - if left_least_square < ideal_ls: - return create_leaf(left_label) - if least_square(right_label) < ideal_ls: - return create_leaf(right_label) - - # recurse on children - left_tree = create_tree(left_data, remaining_features, left_label, max_depth, ideal_ls, current_depth +1) - 
right_tree = create_tree(right_data, remaining_features, right_label, max_depth, ideal_ls, current_depth +1) - return {'is_leaf' : False, - 'value' : None, - 'splitting_feature': splitting_feature, - 'left' : left_tree, - 'right' : right_tree, - 'index' : None} + left_index = [index for index in range(len(data.iloc[:, key])) if data.iloc[index, key] < split] + right_index = [index for index in range(len(data.iloc[:, key])) if data.iloc[index, key] >= split] + left_data = data.iloc[left_index, :] + right_data = data.iloc[right_index, :] + left_label = [label[i] for i in left_index] + right_label = [label[i] for i in right_index] + + return left_data, left_label, right_data, right_label + + +def least_square(label: list): + if not len(label): + return 0 + return (np.sum(label) ** 2) / len(set(label)) + + +def create_leaf(label: list): + global node_id + node_id += 1 + leaf = { + 'splittng_feature': None, 'left': None, 'right': None, 'is_leaf': True, 'index': node_id, + 'value': round(np.mean(label), 3) + } + return leaf + + +def find_splits_parallel(args: tuple): + var_space, label, col = args + # var_space = data.iloc[:,col].tolist() + return scipy.optimize.fminbound( + error_function, min(var_space), max(var_space), args=(col, var_space, label), + full_output=True + ) + + +# return, +# if not min_error or error < min_error: +# min_error = error +# split_var = col +# min_split = split + +def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth=0): + remaining_features = all_pos_split + # stopping conditions + if sum([len(v) != 0 for v in remaining_features.values()]) == 0: + # If there are no remaining features to consider, make current node a leaf node + return create_leaf(label) + # #Additional stopping condition (limit tree depth) + elif current_depth > max_depth: + return create_leaf(label) + + ####### + min_error = None + split_var = None + min_split = None + + var_spaces = [data.iloc[:, col].tolist() for col in range(data.shape[1])] + 
cols = [col for col in range(data.shape[1])] + pool = Pool() + for col, split, error, ierr, numf in enumerate( + pool.map(find_splits_parallel, zip(var_spaces, repeat(label), cols))): + if not min_error or error < min_error: + min_error = error + split_var = col + min_split = split + pool.close() + + splitting_feature = (split_var, min_split) + children = split_children(data, label, split_var, min_split) + + left_data, left_label, right_data, right_label = children + if len(left_label) == 0 or len(right_label) == 0: + return create_leaf(label) + + left_least_square = least_square(left_label) + + # Create a leaf node if the split is "perfect" + if left_least_square < ideal_ls: + return create_leaf(left_label) + if least_square(right_label) < ideal_ls: + return create_leaf(right_label) + + # recurse on children + left_tree = create_tree(left_data, remaining_features, left_label, max_depth, ideal_ls, current_depth + 1) + right_tree = create_tree(right_data, remaining_features, right_label, max_depth, ideal_ls, current_depth + 1) + return { + 'is_leaf': False, + 'value': None, + 'splitting_feature': splitting_feature, + 'left': left_tree, + 'right': right_tree, + 'index': None + } + def error_function(split_point, split_var, data, label): - data1 = [] - data2 = [] - for i in xrange(len(data)): - temp_dat = data[i] - if temp_dat <= split_point: - data1.append(label[i]) - else: - data2.append(label[i]) - return least_square(data1) + least_square(data2) - - -def make_prediction(tree, x, annotate = False): - if tree['is_leaf']: - if annotate: - print "At leaf, predicting %s" % tree['value'] - return tree['value'] - else: - # the splitting value of x. 
- split_feature_value = x[tree['splitting_feature'][0]] - if annotate: - print "Split on %s = %s" % (tree['splitting_feature'], split_feature_value) - if split_feature_value < tree['splitting_feature'][1]: - return make_prediction(tree['left'], x, annotate) - else: - return make_prediction(tree['right'], x, annotate) + data1 = [] + data2 = [] + for i in range(len(data)): + temp_dat = data[i] + if temp_dat <= split_point: + data1.append(label[i]) + else: + data2.append(label[i]) + return least_square(data1) + least_square(data2) + + +def make_prediction(tree, x, annotate=False): + if tree['is_leaf']: + if annotate: + print("At leaf, predicting %s" % tree['value']) + return tree['value'] + else: + # the splitting value of x. + split_feature_value = x[tree['splitting_feature'][0]] + if annotate: + print("Split on %s = %s" % (tree['splitting_feature'], split_feature_value)) + if split_feature_value < tree['splitting_feature'][1]: + return make_prediction(tree['left'], x, annotate) + else: + return make_prediction(tree['right'], x, annotate) -class RegressionTree: - def __init__(self, training_data, labels, max_depth=5, ideal_ls=100): - self.training_data = training_data - self.labels = labels - self.max_depth = max_depth - self.ideal_ls = ideal_ls - self.tree = None - - def fit(self): - global node_id - node_id = 0 - all_pos_split = {} - pool = Pool() - splitting_data = [self.training_data.iloc[:,col].tolist() for col in xrange(self.training_data.shape[1])] - cols = [col for col in xrange(self.training_data.shape[1])] - for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)): - all_pos_split[col] = dat - pool.close() - self.tree = create_tree(self.training_data, all_pos_split, self.labels, self.max_depth, self.ideal_ls) - - - def predict(self, test): - prediction = np.array([make_prediction(self.tree, x) for x in test]) - return prediction -if __name__ == '__main__': - #read in data, label - data = pd.read_excel("mlr06.xls") - test = [[478, 184, 40, 
74, 11, 31], [1000,10000,10000,10000,10000,1000,100000]] - label = data['X7'] - del data['X7'] +class RegressionTree: + def __init__(self, training_data, labels, max_depth=5, ideal_ls=100): + self.training_data = training_data + self.labels = labels + self.max_depth = max_depth + self.ideal_ls = ideal_ls + self.tree = None + + def fit(self): + global node_id + node_id = 0 + all_pos_split = {} + pool = Pool() + splitting_data = [self.training_data.iloc[:, col].tolist() for col in range(self.training_data.shape[1])] + cols = [col for col in range(self.training_data.shape[1])] + for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)): + all_pos_split[col] = dat + pool.close() + self.tree = create_tree(self.training_data, all_pos_split, self.labels, self.max_depth, self.ideal_ls) + + def predict(self, test): + prediction = np.array([make_prediction(self.tree, x) for x in test]) + return prediction - model = RegressionTree(data, label) - model.fit() - print model.predict(test) +if __name__ == '__main__': + # read in data, label + data = pd.read_excel("mlr06.xls") + test = [[478, 184, 40, 74, 11, 31], [1000, 10000, 10000, 10000, 10000, 1000, 100000]] + label = data['X7'] + del data['X7'] + + model = RegressionTree(data, label) + model.fit() + print(model.predict(test)) diff --git a/lambdamart.py b/lambdamart.py index f67e66f..72106c3 100644 --- a/lambdamart.py +++ b/lambdamart.py @@ -1,356 +1,365 @@ +import pickle +from multiprocessing import Pool + import numpy as np -import math -import random -import copy +import pandas as pd from sklearn.tree import DecisionTreeRegressor -from multiprocessing import Pool + from RegressionTree import RegressionTree -import pandas as pd -import pickle -def dcg(scores): - """ - Returns the DCG value of the list of scores. 
- Parameters - ---------- - scores : list - Contains labels in a certain ranked order - - Returns - ------- - DCG_val: int - This is the value of the DCG on the given scores - """ - return np.sum([ - (np.power(2, scores[i]) - 1) / np.log2(i + 2) - for i in xrange(len(scores)) - ]) - -def dcg_k(scores, k): - """ - Returns the DCG value of the list of scores and truncates to k values. - Parameters - ---------- - scores : list - Contains labels in a certain ranked order - k : int - In the amount of values you want to only look at for computing DCG - - Returns - ------- - DCG_val: int - This is the value of the DCG on the given scores - """ - return np.sum([ - (np.power(2, scores[i]) - 1) / np.log2(i + 2) - for i in xrange(len(scores[:k])) - ]) - -def ideal_dcg(scores): - """ - Returns the Ideal DCG value of the list of scores. - Parameters - ---------- - scores : list - Contains labels in a certain ranked order - - Returns - ------- - Ideal_DCG_val: int - This is the value of the Ideal DCG on the given scores - """ - scores = [score for score in sorted(scores)[::-1]] - return dcg(scores) + +def dcg(scores: list): + """ + Returns the DCG value of the list of scores. + Parameters + ---------- + scores : list + Contains labels in a certain ranked order + + Returns + ------- + DCG_val: int + This is the value of the DCG on the given scores + """ + return np.sum([ + (np.power(2, scores[i]) - 1) / np.log2(i + 2) + for i in range(len(scores)) + ]) + + +def dcg_k(scores: list, k: int): + """ + Returns the DCG value of the list of scores and truncates to k values. 
+ Parameters + ---------- + scores : list + Contains labels in a certain ranked order + k : int + In the amount of values you want to only look at for computing DCG + + Returns + ------- + DCG_val: int + This is the value of the DCG on the given scores + """ + return np.sum([ + (np.power(2, scores[i]) - 1) / np.log2(i + 2) + for i in range(len(scores[:k])) + ]) + + +def ideal_dcg(scores: list): + """ + Returns the Ideal DCG value of the list of scores. + Parameters + ---------- + scores : list + Contains labels in a certain ranked order + + Returns + ------- + Ideal_DCG_val: int + This is the value of the Ideal DCG on the given scores + """ + scores = [score for score in sorted(scores)[::-1]] + return dcg(scores) + def ideal_dcg_k(scores, k): - """ - Returns the Ideal DCG value of the list of scores and truncates to k values. - Parameters - ---------- - scores : list - Contains labels in a certain ranked order - k : int - In the amount of values you want to only look at for computing DCG - - Returns - ------- - Ideal_DCG_val: int - This is the value of the Ideal DCG on the given scores - """ - scores = [score for score in sorted(scores)[::-1]] - return dcg_k(scores, k) - -def single_dcg(scores, i, j): - """ - Returns the DCG value at a single point. - Parameters - ---------- - scores : list - Contains labels in a certain ranked order - i : int - This points to the ith value in scores - j : int - This sets the ith value in scores to be the jth rank - - Returns - ------- - Single_DCG: int - This is the value of the DCG at a single point - """ - return (np.power(2, scores[i]) - 1) / np.log2(j + 2) - -def compute_lambda(args): - """ - Returns the lambda and w values for a given query. - Parameters - ---------- - args : zipped value of true_scores, predicted_scores, good_ij_pairs, idcg, query_key - Contains a list of the true labels of documents, list of the predicted labels of documents, - i and j pairs where true_score[i] > true_score[j], idcg values, and query keys. 
- - Returns - ------- - lambdas : numpy array - This contains the calculated lambda values - w : numpy array - This contains the computed w values - query_key : int - This is the query id these values refer to - """ - - true_scores, predicted_scores, good_ij_pairs, idcg, query_key = args - num_docs = len(true_scores) - sorted_indexes = np.argsort(predicted_scores)[::-1] - rev_indexes = np.argsort(sorted_indexes) - true_scores = true_scores[sorted_indexes] - predicted_scores = predicted_scores[sorted_indexes] - - lambdas = np.zeros(num_docs) - w = np.zeros(num_docs) - - single_dcgs = {} - for i,j in good_ij_pairs: - if (i,i) not in single_dcgs: - single_dcgs[(i,i)] = single_dcg(true_scores, i, i) - single_dcgs[(i,j)] = single_dcg(true_scores, i, j) - if (j,j) not in single_dcgs: - single_dcgs[(j,j)] = single_dcg(true_scores, j, j) - single_dcgs[(j,i)] = single_dcg(true_scores, j, i) - - - for i,j in good_ij_pairs: - z_ndcg = abs(single_dcgs[(i,j)] - single_dcgs[(i,i)] + single_dcgs[(j,i)] - single_dcgs[(j,j)]) / idcg - rho = 1 / (1 + np.exp(predicted_scores[i] - predicted_scores[j])) - rho_complement = 1.0 - rho - lambda_val = z_ndcg * rho - lambdas[i] += lambda_val - lambdas[j] -= lambda_val - - w_val = rho * rho_complement * z_ndcg - w[i] += w_val - w[j] += w_val - - return lambdas[rev_indexes], w[rev_indexes], query_key + """ + Returns the Ideal DCG value of the list of scores and truncates to k values. + Parameters + ---------- + scores : list + Contains labels in a certain ranked order + k : int + In the amount of values you want to only look at for computing DCG + + Returns + ------- + Ideal_DCG_val: int + This is the value of the Ideal DCG on the given scores + """ + scores = [score for score in sorted(scores)[::-1]] + return dcg_k(scores, k) + + +def single_dcg(scores: list, i: int, j: int): + """ + Returns the DCG value at a single point. 
+ Parameters + ---------- + scores : list + Contains labels in a certain ranked order + i : int + This points to the ith value in scores + j : int + This sets the ith value in scores to be the jth rank + + Returns + ------- + Single_DCG: int + This is the value of the DCG at a single point + """ + return (np.power(2, scores[i]) - 1) / np.log2(j + 2) + + +def compute_lambda(args: tuple): + """ + Returns the lambda and w values for a given query. + Parameters + ---------- + args : zipped value of true_scores, predicted_scores, good_ij_pairs, idcg, query_key + Contains a list of the true labels of documents, list of the predicted labels of documents, + i and j pairs where true_score[i] > true_score[j], idcg values, and query keys. + + Returns + ------- + lambdas : numpy array + This contains the calculated lambda values + w : numpy array + This contains the computed w values + query_key : int + This is the query id these values refer to + """ + + true_scores, predicted_scores, good_ij_pairs, idcg, query_key = args + num_docs = len(true_scores) + sorted_indexes = np.argsort(predicted_scores)[::-1] + rev_indexes = np.argsort(sorted_indexes) + true_scores = true_scores[sorted_indexes] + predicted_scores = predicted_scores[sorted_indexes] + + lambdas = np.zeros(num_docs) + w = np.zeros(num_docs) + + single_dcgs = {} + for i, j in good_ij_pairs: + if (i, i) not in single_dcgs: + single_dcgs[(i, i)] = single_dcg(true_scores, i, i) + single_dcgs[(i, j)] = single_dcg(true_scores, i, j) + if (j, j) not in single_dcgs: + single_dcgs[(j, j)] = single_dcg(true_scores, j, j) + single_dcgs[(j, i)] = single_dcg(true_scores, j, i) + + for i, j in good_ij_pairs: + z_ndcg = abs(single_dcgs[(i, j)] - single_dcgs[(i, i)] + single_dcgs[(j, i)] - single_dcgs[(j, j)]) / idcg + rho = 1 / (1 + np.exp(predicted_scores[i] - predicted_scores[j])) + rho_complement = 1.0 - rho + lambda_val = z_ndcg * rho + lambdas[i] += lambda_val + lambdas[j] -= lambda_val + + w_val = rho * rho_complement * 
z_ndcg + w[i] += w_val + w[j] += w_val + + return lambdas[rev_indexes], w[rev_indexes], query_key + def group_queries(training_data, qid_index): - """ - Returns a dictionary that groups the documents by their query ids. - Parameters - ---------- - training_data : Numpy array of lists - Contains a list of document information. Each document's format is [relevance score, query index, feature vector] - qid_index : int - This is the index where the qid is located in the training data - - Returns - ------- - query_indexes : dictionary - The keys were the different query ids and teh values were the indexes in the training data that are associated of those keys. - """ - query_indexes = {} - index = 0 - for record in training_data: - query_indexes.setdefault(record[qid_index], []) - query_indexes[record[qid_index]].append(index) - index += 1 - return query_indexes - -def get_pairs(scores): - """ - Returns pairs of indexes where the first value in the pair has a higher score than the second value in the pair. - Parameters - ---------- - scores : list of int - Contain a list of numbers - - Returns - ------- - query_pair : list of pairs - This contains a list of pairs of indexes in scores. - """ - - query_pair = [] - for query_scores in scores: - temp = sorted(query_scores, reverse=True) - pairs = [] - for i in xrange(len(temp)): - for j in xrange(len(temp)): - if temp[i] > temp[j]: - pairs.append((i,j)) - query_pair.append(pairs) - return query_pair + """ + Returns a dictionary that groups the documents by their query ids. + Parameters + ---------- + training_data : Numpy array of lists + Contains a list of document information. Each document's format is [relevance score, query index, feature vector] + qid_index : int + This is the index where the qid is located in the training data + + Returns + ------- + query_indexes : dictionary + The keys were the different query ids and teh values were the indexes in the training data that are associated of those keys. 
+ """ + query_indexes = {} + index = 0 + for record in training_data: + query_indexes.setdefault(record[qid_index], []) + query_indexes[record[qid_index]].append(index) + index += 1 + return query_indexes + + +def get_pairs(scores: list): + """ + Returns pairs of indexes where the first value in the pair has a higher score than the second value in the pair. + Parameters + ---------- + scores : list of int + Contain a list of numbers + + Returns + ------- + query_pair : list of pairs + This contains a list of pairs of indexes in scores. + """ + + query_pair = [] + for query_scores in scores: + temp = sorted(query_scores, reverse=True) + pairs = [] + for i in range(len(temp)): + for j in range(len(temp)): + if temp[i] > temp[j]: + pairs.append((i, j)) + query_pair.append(pairs) + return query_pair + class LambdaMART: - def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1, tree_type='sklearn'): - """ - This is the constructor for the LambdaMART object. - Parameters - ---------- - training_data : list of int - Contain a list of numbers - number_of_trees : int (default: 5) - Number of trees LambdaMART goes through - learning_rate : float (default: 0.1) - Rate at which we update our prediction with each tree - tree_type : string (default: "sklearn") - Either "sklearn" for using Sklearn implementation of the tree of "original" - for using our implementation - """ - - if tree_type != 'sklearn' and tree_type != 'original': - raise ValueError('The "tree_type" must be "sklearn" or "original"') - self.training_data = training_data - self.number_of_trees = number_of_trees - self.learning_rate = learning_rate - self.trees = [] - self.tree_type = tree_type - - def fit(self): - """ - Fits the model on the training data. 
- """ - - predicted_scores = np.zeros(len(self.training_data)) - query_indexes = group_queries(self.training_data, 1) - query_keys = query_indexes.keys() - true_scores = [self.training_data[query_indexes[query], 0] for query in query_keys] - good_ij_pairs = get_pairs(true_scores) - tree_data = pd.DataFrame(self.training_data[:, 2:7]) - labels = self.training_data[:, 0] - - # ideal dcg calculation - idcg = [ideal_dcg(scores) for scores in true_scores] - - for k in xrange(self.number_of_trees): - print 'Tree %d' % (k) - lambdas = np.zeros(len(predicted_scores)) - w = np.zeros(len(predicted_scores)) - pred_scores = [predicted_scores[query_indexes[query]] for query in query_keys] - - pool = Pool() - for lambda_val, w_val, query_key in pool.map(compute_lambda, zip(true_scores, pred_scores, good_ij_pairs, idcg, query_keys), chunksize=1): - indexes = query_indexes[query_key] - lambdas[indexes] = lambda_val - w[indexes] = w_val - pool.close() - - if self.tree_type == 'sklearn': - # Sklearn implementation of the tree - tree = DecisionTreeRegressor(max_depth=50) - tree.fit(self.training_data[:,2:], lambdas) - self.trees.append(tree) - prediction = tree.predict(self.training_data[:,2:]) - predicted_scores += prediction * self.learning_rate - elif self.tree_type == 'original': - # Our implementation of the tree - tree = RegressionTree(tree_data, lambdas, max_depth=10, ideal_ls= 0.001) - tree.fit() - prediction = tree.predict(self.training_data[:,2:]) - predicted_scores += prediction * self.learning_rate - - def predict(self, data): - """ - Predicts the scores for the test dataset. - Parameters - ---------- - data : Numpy array of documents - Numpy array of documents with each document's format is [query index, feature vector] - - Returns - ------- - predicted_scores : Numpy array of scores - This contains an array or the predicted scores for the documents. 
- """ - data = np.array(data) - query_indexes = group_queries(data, 0) - predicted_scores = np.zeros(len(data)) - for query in query_indexes: - results = np.zeros(len(query_indexes[query])) - for tree in self.trees: - results += self.learning_rate * tree.predict(data[query_indexes[query], 1:]) - predicted_scores[query_indexes[query]] = results - return predicted_scores - - def validate(self, data, k): - """ - Predicts the scores for the test dataset and calculates the NDCG value. - Parameters - ---------- - data : Numpy array of documents - Numpy array of documents with each document's format is [relevance score, query index, feature vector] - k : int - this is used to compute the NDCG@k - - Returns - ------- - average_ndcg : float - This is the average NDCG value of all the queries - predicted_scores : Numpy array of scores - This contains an array or the predicted scores for the documents. - """ - data = np.array(data) - query_indexes = group_queries(data, 1) - average_ndcg = [] - predicted_scores = np.zeros(len(data)) - for query in query_indexes: - results = np.zeros(len(query_indexes[query])) - for tree in self.trees: - results += self.learning_rate * tree.predict(data[query_indexes[query], 2:]) - predicted_sorted_indexes = np.argsort(results)[::-1] - t_results = data[query_indexes[query], 0] - t_results = t_results[predicted_sorted_indexes] - predicted_scores[query_indexes[query]] = results - dcg_val = dcg_k(t_results, k) - idcg_val = ideal_dcg_k(t_results, k) - ndcg_val = (dcg_val / idcg_val) - average_ndcg.append(ndcg_val) - average_ndcg = np.nanmean(average_ndcg) - return average_ndcg, predicted_scores - - def save(self, fname): - """ - Saves the model into a ".lmart" file with the name given as a parameter. - Parameters - ---------- - fname : string - Filename of the file you want to save - - """ - pickle.dump(self, open('%s.lmart' % (fname), "wb"), protocol=2) - - def load(self, fname): - """ - Loads the model from the ".lmart" file given as a parameter. 
- Parameters - ---------- - fname : string - Filename of the file you want to load - - """ - model = pickle.load(open(fname , "rb")) - self.training_data = model.training_data - self.number_of_trees = model.number_of_trees - self.tree_type = model.tree_type - self.learning_rate = model.learning_rate - self.trees = model.trees \ No newline at end of file + def __init__(self, training_data: list = None, number_of_trees=5, learning_rate=0.1, tree_type='sklearn'): + """ + This is the constructor for the LambdaMART object. + Parameters + ---------- + training_data : list of int + Contain a list of numbers + number_of_trees : int (default: 5) + Number of trees LambdaMART goes through + learning_rate : float (default: 0.1) + Rate at which we update our prediction with each tree + tree_type : string (default: "sklearn") + Either "sklearn" for using Sklearn implementation of the tree of "original" + for using our implementation + """ + + if tree_type != 'sklearn' and tree_type != 'original': + raise ValueError('The "tree_type" must be "sklearn" or "original"') + self.training_data: list = training_data + self.number_of_trees = number_of_trees + self.learning_rate = learning_rate + self.trees = [] + self.tree_type = tree_type + + def fit(self): + """ + Fits the model on the training data. 
+ """ + + predicted_scores = np.zeros(len(self.training_data)) + query_indexes = group_queries(self.training_data, 1) + query_keys = query_indexes.keys() + true_scores = [self.training_data[query_indexes[query], 0] for query in query_keys] + good_ij_pairs = get_pairs(true_scores) + tree_data = pd.DataFrame(self.training_data[:, 2:7]) + labels = self.training_data[:, 0] + + # ideal dcg calculation + idcg = [ideal_dcg(scores) for scores in true_scores] + + for k in range(self.number_of_trees): + print('Tree %d' % (k)) + lambdas = np.zeros(len(predicted_scores)) + w = np.zeros(len(predicted_scores)) + pred_scores = [predicted_scores[query_indexes[query]] for query in query_keys] + + pool = Pool() + for lambda_val, w_val, query_key in pool.map(compute_lambda, + zip(true_scores, pred_scores, good_ij_pairs, idcg, query_keys), + chunksize=1): + indexes = query_indexes[query_key] + lambdas[indexes] = lambda_val + w[indexes] = w_val + pool.close() + + if self.tree_type == 'sklearn': + # Sklearn implementation of the tree + tree = DecisionTreeRegressor(max_depth=50) + tree.fit(self.training_data[:, 2:], lambdas) + self.trees.append(tree) + prediction = tree.predict(self.training_data[:, 2:]) + predicted_scores += prediction * self.learning_rate + elif self.tree_type == 'original': + # Our implementation of the tree + tree = RegressionTree(tree_data, lambdas, max_depth=10, ideal_ls=0.001) + tree.fit() + prediction = tree.predict(self.training_data[:, 2:]) + predicted_scores += prediction * self.learning_rate + + def predict(self, data): + """ + Predicts the scores for the test dataset. + Parameters + ---------- + data : Numpy array of documents + Numpy array of documents with each document's format is [query index, feature vector] + + Returns + ------- + predicted_scores : Numpy array of scores + This contains an array or the predicted scores for the documents. 
+ """ + data = np.array(data) + query_indexes = group_queries(data, 0) + predicted_scores = np.zeros(len(data)) + for query in query_indexes: + results = np.zeros(len(query_indexes[query])) + for tree in self.trees: + results += self.learning_rate * tree.predict(data[query_indexes[query], 1:]) + predicted_scores[query_indexes[query]] = results + return predicted_scores + + def validate(self, data, k): + """ + Predicts the scores for the test dataset and calculates the NDCG value. + Parameters + ---------- + data : Numpy array of documents + Numpy array of documents with each document's format is [relevance score, query index, feature vector] + k : int + this is used to compute the NDCG@k + + Returns + ------- + average_ndcg : float + This is the average NDCG value of all the queries + predicted_scores : Numpy array of scores + This contains an array or the predicted scores for the documents. + """ + data = np.array(data) + query_indexes = group_queries(data, 1) + average_ndcg = [] + predicted_scores = np.zeros(len(data)) + for query in query_indexes: + results = np.zeros(len(query_indexes[query])) + for tree in self.trees: + results += self.learning_rate * tree.predict(data[query_indexes[query], 2:]) + predicted_sorted_indexes = np.argsort(results)[::-1] + t_results = data[query_indexes[query], 0] + t_results = t_results[predicted_sorted_indexes] + predicted_scores[query_indexes[query]] = results + dcg_val = dcg_k(t_results, k) + idcg_val = ideal_dcg_k(t_results, k) + ndcg_val = (dcg_val / idcg_val) + average_ndcg.append(ndcg_val) + average_ndcg = np.nanmean(average_ndcg) + return average_ndcg, predicted_scores + + def save(self, fname): + """ + Saves the model into a ".lmart" file with the name given as a parameter. + Parameters + ---------- + fname : string + Filename of the file you want to save + + """ + pickle.dump(self, open('%s.lmart' % (fname), "wb"), protocol=2) + + def load(self, fname): + """ + Loads the model from the ".lmart" file given as a parameter. 
import numpy as np
import pandas as pd


def get_data(file_loc):
    """
    Parses a ranking data file into a numeric array.

    Each non-blank line is expected to look like
    ``<score> qid:<query id> <index>:<value> ... # optional comment``.
    Anything after the first " #" is ignored.

    Parameters
    ----------
    file_loc : string
        Path of the file to read

    Returns
    -------
    data : Numpy array
        One row per document: [score, query id, feature values...]
    """
    data = []
    # The context manager closes the file even if a line fails to parse.
    with open(file_loc, 'r') as f:
        for line in f:
            fields = line.split(' #')[0].split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            row = [int(fields[0]), int(fields[1].split(':')[1])]
            # Feature tokens are "<index>:<value>"; only the value is kept.
            row.extend(float(el.split(':')[1]) for el in fields[2:])
            data.append(row)
    return np.array(data)


def group_queries(data):
    """
    Groups row positions by query id.

    Parameters
    ----------
    data : sequence of documents
        Each document holds its query id at index 1

    Returns
    -------
    query_indexes : dict
        Maps every query id to the list of row positions (in input order)
        of the documents that belong to it
    """
    query_indexes = {}
    for index, record in enumerate(data):
        query_indexes.setdefault(record[1], []).append(index)
    return query_indexes
def _evaluate_folds(folds, tree_type, model_name_format):
    """
    Trains, saves and validates one LambdaMART model per fold.

    Parameters
    ----------
    folds : list of (index, directory) pairs
        ``directory`` must contain "train.txt" and "test.txt"; ``index`` is
        used in the progress message and in the saved model's file name
    tree_type : string
        Tree implementation passed to LambdaMART ('sklearn' or 'original')
    model_name_format : string
        %-format with one integer slot, used to name each saved model

    Returns
    -------
    mean_ndcg : float
        NDCG@10 averaged over the folds that were actually run
    """
    total_ndcg = 0.0
    for i, fold_dir in folds:
        print('start Fold ' + str(i))
        training_data = get_data('%s/train.txt' % (fold_dir))
        test_data = get_data('%s/test.txt' % (fold_dir))
        # presumably (training data, number of trees, learning rate,
        # tree type) -- confirm against the LambdaMART constructor.
        model = LambdaMART(training_data, 300, 0.001, tree_type)
        model.fit()
        model.save(model_name_format % (i))
        # model = LambdaMART()
        # model.load('lambdamart_model.lmart')
        average_ndcg, predicted_scores = model.validate(test_data, 10)
        print(average_ndcg)
        total_ndcg += average_ndcg
    # Divide by the real fold count; a fixed 5.0 under-reports the mean
    # whenever fewer than five folds are evaluated.
    return total_ndcg / len(folds)


def main():
    """
    Evaluates LambdaMART with both tree implementations and prints the
    average NDCG@10 of each run.
    """
    # for i in [1, 2, 3, 4, 5]:
    sklearn_folds = list(enumerate(['example_data']))
    sklearn_ndcg = _evaluate_folds(sklearn_folds, 'sklearn',
                                   'lambdamart_model_%d')
    print('Sklearn average ndcg at 10 is: ' + str(sklearn_ndcg))

    original_folds = [(i, 'Fold%d' % (i)) for i in [1, 2, 3, 4, 5]]
    # NOTE(review): the saved file names also look swapped relative to the
    # tree type ('original' trees are stored under "..._sklearn_..."); they
    # are left unchanged so previously saved models keep their names.
    original_ndcg = _evaluate_folds(original_folds, 'original',
                                    'lambdamart_model_sklearn_%d')
    print('Original average ndcg at 10 is: ' + str(original_ndcg))


# Legacy debugging snippet (Python 2 print syntax), kept for reference:
# print 'NDCG score: %f' % (average_ndcg)
# query_indexes = group_queries(test_data)
# index = query_indexes.keys()[0]
# testdata = [test_data[i][0] for i in query_indexes[index]]
# pred = [predicted_scores[i] for i in query_indexes[index]]
# output = pd.DataFrame({"True label": testdata, "prediction": pred})
# output = output.sort('prediction',ascending = False)
# output.to_csv("outdemo.csv", index =False)
# print output
# # for i in query_indexes[index]:
# # 	print test_data[i][0], predicted_scores[i]

if __name__ == '__main__':
    main()