-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
81 lines (64 loc) · 2.42 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import csv
def load_csv_data(data_path, sub_sample=False):
    """Load a labelled CSV file and return labels, features and event ids.

    The file is read with one header row skipped. Column 0 is used as the
    event id, column 1 as the class label and columns 2 onwards as features.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to read.
    sub_sample : bool, optional (default=False)
        If True, keep only every 50th row of labels, features and ids.

    Returns
    -------
    yb : np.array
        Labels as floats: -1 where the label column equals 'b', 1 otherwise.
    input_data : np.array
        Feature matrix taken from columns 2 onwards.
    ids : np.array
        Integer event ids taken from column 0.
    """
    # The label column holds strings, so it is read on its own as text.
    # atleast_1d keeps a single-row file from collapsing to a 0-d array.
    y = np.atleast_1d(
        np.genfromtxt(data_path, delimiter=",", skip_header=1, dtype=str, usecols=1)
    )
    # Second pass reads everything as floats; the label column is not used
    # from this array. atleast_2d keeps a single-row file two-dimensional.
    x = np.atleast_2d(np.genfromtxt(data_path, delimiter=",", skip_header=1))

    # `np.int` was removed from NumPy (1.24); the builtin int is equivalent.
    ids = x[:, 0].astype(int)
    input_data = x[:, 2:]

    # convert class labels from strings to binary (-1, 1)
    yb = np.ones(len(y))
    yb[y == 'b'] = -1

    # sub-sample
    if sub_sample:
        yb = yb[::50]
        input_data = input_data[::50]
        ids = ids[::50]

    return yb, input_data, ids
def train_test_split(data, y, test_size=0.3, random_state=23):
    """Split features and labels into a train part and a test part.

    Parameters
    ----------
    data : np.array
        numpy array of shape (n, m)
            n - number of objects
            m - number of features
    y : np.array
        numpy array of shape (n, ) of labels
    test_size : float from 0 to 1, optional (default=0.3)
        Fraction of the rows that goes to the test part.
    random_state : int or None, optional (default=23)
        Seed for shuffling. If None, rows keep their original order.
        Note that a non-None value re-seeds NumPy's global random generator.

    Returns
    -------
    X_train, y_train, X_test, y_test : np.array
        Train features, train labels, test features, test labels.
    """
    # Attach the labels as the last column so that features and labels
    # are permuted together by a single row shuffle.
    stacked = np.concatenate([data, y[:, None]], axis=1)

    if random_state is not None:
        np.random.seed(random_state)
        np.random.shuffle(stacked)

    # Number of leading rows assigned to the train part (truncated to int).
    n_train = int(data.shape[0] * (1 - test_size))
    train_part, test_part = stacked[:n_train], stacked[n_train:]

    return (
        train_part[:, :-1],
        train_part[:, -1],
        test_part[:, :-1],
        test_part[:, -1],
    )
def predict_labels(data, weights):
    """Return -1/1 class predictions for a data matrix and a weight vector.

    The linear scores ``data . weights`` are thresholded at zero: scores
    less than or equal to zero become -1, strictly positive scores become 1.
    """
    scores = np.dot(data, weights)

    # Both masks are taken from the raw scores before anything is overwritten.
    non_positive = scores <= 0
    positive = scores > 0

    scores[non_positive] = -1
    scores[positive] = 1
    return scores
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in .csv format for submission to Kaggle or AIcrowd.

    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file gets an 'Id,Prediction' header followed by one row per
    (id, prediction) pair; both values are written as integers.
    """
    # newline='' hands line-ending control to the csv module. Without it,
    # platforms that translate newlines on write (e.g. Windows) produce an
    # extra blank line after every row.
    with open(name, 'w', newline='') as csvfile:
        writer = csv.DictWriter(
            csvfile, delimiter=",", fieldnames=['Id', 'Prediction']
        )
        writer.writeheader()
        writer.writerows(
            {'Id': int(event_id), 'Prediction': int(label)}
            for event_id, label in zip(ids, y_pred)
        )