knn_predict.py
#!/usr/bin/python3.6
""" Makes predictions based on distances calculated by KNN. """
import os
import sys
from collections import defaultdict
from typing import DefaultDict, List

import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
from tqdm import tqdm

from debug import dprint


def GAP(predicts: np.ndarray, confs: np.ndarray, targets: np.ndarray) -> float:
    ''' Computes GAP@1 '''
    if len(targets.shape) != 1:
        dprint(targets.shape)
        assert False

    assert predicts.shape == confs.shape

    res, true_pos = 0.0, 0
    num_predicts_per_sample = confs.shape[1]
    num_targets = len(targets)

    predicts = predicts.flatten()
    confs = confs.flatten()
    targets = np.repeat(targets.reshape(-1, 1), num_predicts_per_sample, axis=1)
    targets = targets.flatten()

    dprint(predicts)
    dprint(targets)

    sorting_idx = np.argsort(-confs)
    predicts = predicts[sorting_idx]
    targets = targets[sorting_idx]

    for i, (p, t) in enumerate(zip(predicts, targets)):
        rel = int(p == t)
        true_pos += rel
        res += true_pos / (i + 1) * rel

    res /= num_targets    # TODO: incorrect, not all test images depict landmarks
    return res
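
# Quick sanity check for GAP (illustrative numbers, not from the competition):
#   GAP(np.array([[3], [2]]), np.array([[0.9], [0.5]]), np.array([3, 1]))
# sorts both predictions by confidence; only the first one is correct, so the
# running precision sums to 1.0 over 2 targets and the call returns 0.5.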


if __name__ == "__main__":
    if len(sys.argv) != 3 or sys.argv[1] not in ['--val', '--test']:
        print(f'usage: {sys.argv[0]} --val|--test <distances.npz>')
        sys.exit()

    ''' Algorithm:
    1. define level-2 train and validation sets
    2. for every sample from the validation set, find the K nearest samples from the train set
    3. make a prediction about classes
    4. calculate the metric
    5. take the full train set
    6. for every sample from the test set, find the K nearest samples from the full train set
    7. make a prediction
    8. generate a submission
    '''
    predict_test = sys.argv[1] == '--test'

    # load distances info
    dist_file = np.load(sys.argv[2], allow_pickle=True)
    distances, indices = dist_file['distances'], dist_file['indices']

    dprint(distances.shape)
    dprint(indices.shape)
    dprint(np.max(indices.flatten()))

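    # for validation the neighbours were searched train-vs-train, so the
    # nearest neighbour of every sample is presumably the sample itself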
    if not predict_test:
        distances = np.delete(distances, 0, axis=1)
        indices = np.delete(indices, 0, axis=1)

    # load dataframe
    full_train_df = pd.read_csv('../data/train.csv')
    full_train_df.drop(columns='url', inplace=True)

    # train_df = pd.read_csv('../data/splits/10_samples_92740_classes_fold_0_train.csv')
    # val_dataset_mask = ~full_train_df.id.isin(train_df.id)
    # knn_train_df = full_train_df.loc[val_dataset_mask]
    # dprint(knn_train_df.shape)

    if predict_test:
        df = pd.read_csv('../data/test.csv')
        # df = df.loc[df.id.apply(lambda img: os.path.exists(os.path.join(
        #     f'../data/test/{img}.jpg')))]
        print('test df after filtering', df.shape)
    else:
        df = full_train_df

    # make a prediction about classes
    num_predicts = indices.shape[1]
    predicts = np.zeros((len(df), num_predicts), dtype=int)
    confs = np.zeros((len(df), num_predicts))

    for i in range(num_predicts):
        idx = indices[:, i]
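        # after dropping 'url' above, column 1 of full_train_df is landmark_id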
        predicts[:, i] = full_train_df.iloc[idx, 1]
        confs[:, i] = distances[:, i]

    # print('calculating class centroids')
    # best_predicts, best_confs = [], []
    #
    # for i in tqdm(range(confs.shape[0])):
    #     preds: DefaultDict[int, List[float]] = defaultdict(list)
    #
    #     for pred, conf in zip(predicts[i], confs[i]):
    #         preds[pred].append(conf)
    #
    #     for j, (pred, conf) in enumerate(preds.items()):
    #         predicts[i, j] = pred
    #         confs[i, j] = np.mean(conf)
    #
    #     centroids = [(class_, np.mean(distances)) for class_, distances in preds.items()]
    #     best = max(centroids, key=lambda x: x[1])
    #
    #     best_predicts.append(best[0])
    #     best_confs.append(best[1])
    #
    # predicts = np.array(best_predicts).reshape(-1, 1)
    # confs = np.array(best_confs).reshape(-1, 1)
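    # keep only the nearest neighbour as the single prediction per sample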
    predicts = predicts[:, 0].reshape(-1, 1)
    confs = confs[:, 0].reshape(-1, 1)

    if not predict_test:
        print('calculating GAP')
        gap = GAP(predicts, confs, df.landmark_id.values)
        dprint(gap)
    else:
        print('generating submission')
        sub = df
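        # build a 'landmarks' column of space-separated "class_id confidence"
        # pairs, skipping zero-confidence predictions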
        combine = lambda t: " ".join(f'{lm} {conf}' for lm, conf in zip(t[0], t[1]) if conf)
        sub['landmarks'] = list(map(combine, zip(predicts, confs)))

        sample_sub = pd.read_csv('../data/recognition_sample_submission.csv')
        sample_sub = sample_sub.set_index('id')
        sub = sub.set_index('id')
        sample_sub.update(sub)

        name = os.path.splitext(os.path.basename(sys.argv[2]))[0]
        sample_sub.to_csv(f'{name}.csv')
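
# Example invocations (the .npz filenames here are hypothetical):
#   ./knn_predict.py --val distances_val.npz     # prints GAP on the train set
#   ./knn_predict.py --test distances_test.npz   # writes distances_test.csv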