-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathensemble_linear_train.py
executable file
·129 lines (96 loc) · 3.93 KB
/
ensemble_linear_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/python3.6
import os
import pickle
import re
import sys
import yaml
from glob import glob
from collections import OrderedDict
from typing import Any, List
import numpy as np
import pandas as pd
from scipy import optimize
from scipy.stats import describe
from tqdm import tqdm
from sklearn.metrics import fbeta_score
from debug import dprint
IN_KERNEL = os.environ.get('KAGGLE_WORKING_DIR') is not None
INPUT_PATH = '../input/imet-2019-fgvc6/' if IN_KERNEL else '../input/'
NUM_ATTEMPTS = 100
NUM_FOLDS = 5
NUM_CLASSES = 1103
THRESHOLDS_PATH = '../yml/' if not IN_KERNEL else '../input/imet-yaml/yml/'
ADD_THRESHOLD = False
def parse_labels(s: str) -> np.array:
res = np.zeros(NUM_CLASSES)
res[list(map(int, s.split()))] = 1
return res
if __name__ == '__main__':
np.set_printoptions(linewidth=120)
if len(sys.argv) < 4:
print(f'usage: {sys.argv[0]} result.txt predict1.npy ...')
sys.exit()
level2_fold = 0
# load data
fold_num = np.load('folds.npy')
train_df = pd.read_csv(INPUT_PATH + 'train.csv')
all_labels = np.vstack(list(map(parse_labels, train_df.attribute_ids)))
# build dataset
all_predicts_list, all_thresholds = [], []
predicts = sorted(sys.argv[2:])
for filename in predicts:
assert 'level1_train_' in filename
m = re.match(r'(.*)_f(\d)_e\d+.*\.npy', filename)
assert m
model_path = m.group(1)
predict = np.zeros((train_df.shape[0], NUM_CLASSES))
for fold in range(NUM_FOLDS):
filenames = glob(f'{model_path}_f{fold}_*.npy')
if len(filenames) != 1:
dprint(filenames)
assert False # the model must be unique in this fold
filename = filenames[0]
print('reading', filename)
# load data
data = np.load(filename)
if ADD_THRESHOLD:
# read threshold
filename = os.path.basename(filename)
assert filename.startswith('level1_train_') and filename.endswith('.npy')
with open(os.path.join(THRESHOLDS_PATH, filename[13:-4] + '.yml')) as f:
threshold = yaml.load(f, Loader=yaml.SafeLoader)['threshold']
all_thresholds.append(threshold)
data = data + threshold
if np.min(data) < 0 or np.max(data) > 1:
print('invalid range of data:', describe(predict.flatten()))
assert False
predict[fold_num == fold] = data
all_predicts_list.append(predict)
all_predicts = np.dstack(all_predicts_list)
dprint(all_predicts.shape)
dprint(all_labels.shape)
gold_threshold = np.mean(all_thresholds) if ADD_THRESHOLD else 0
for class_ in tqdm(range(NUM_CLASSES)):
x_train = all_predicts[:, class_]
y_train = all_labels[:, class_]
dims = x_train.shape[1]
def loss_function(weights: np.ndarray) -> float:
y_pred = np.matmul(x_train, weights[:-1])
y_pred += weights[-1]
y_pred = (y_pred > gold_threshold).astype(int)
if np.sum(y_pred) == 0:
res = 0
else:
res = fbeta_score(y_train, y_pred, beta=2)
return -res
weights = optimize.minimize(loss_function,
[1 / dims] * dims + [0],
method= 'Nelder-Mead', #'SLSQP',
# constraints=({'type': 'eq','fun': lambda w: 1-sum(w)}),
# bounds=[(0.0, 1.0)] * len(level1_train_predicts),
options = {'ftol':1e-10},
)['x']
best_score = -loss_function(weights)
# print('class', class_, 'weights', weights, 'f2', best_score)
with open(sys.argv[1], 'a') as f:
f.write(f'class={class_} weights={weights} f2={best_score}\n')