-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
executable file
·58 lines (45 loc) · 1.63 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from sklearn import preprocessing
from sklearn import ensemble
from sklearn import metrics
import pandas as pd
import numpy as np
import os
import joblib
from . import dispatcher
# Run configuration, read from the environment when the module is loaded.
TRAINING_DATA = os.environ.get("TRAINING_DATA")
TEST_DATA = os.environ.get("TEST_DATA")
# FOLD stays None when the variable is unset so that merely importing this
# module cannot fail; main() reports missing settings before doing any work.
FOLD = int(os.environ["FOLD"]) if "FOLD" in os.environ else None
MODEL = os.environ.get("MODEL")

# Validation fold -> folds whose rows are used for training.
FOLD_MAPPING = {
    0: [1, 2, 3, 4],
    1: [0, 2, 3, 4],
    2: [0, 1, 3, 4],
    3: [0, 1, 2, 4],
    4: [0, 1, 2, 3],
}

# Columns removed from the frames before they are handed to the model.
_NON_FEATURE_COLUMNS = ["id", "target", "kfold"]


def _check_config():
    """Raise a descriptive error if the environment configuration is unusable.

    Raises:
        RuntimeError: if TRAINING_DATA, TEST_DATA, FOLD or MODEL is not set.
        ValueError: if FOLD is not a key of FOLD_MAPPING.
    """
    settings = {
        "TRAINING_DATA": TRAINING_DATA,
        "TEST_DATA": TEST_DATA,
        "FOLD": FOLD,
        "MODEL": MODEL,
    }
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            "missing required environment variable(s): " + ", ".join(missing)
        )
    if FOLD not in FOLD_MAPPING:
        raise ValueError(
            f"FOLD must be one of {sorted(FOLD_MAPPING)}, got {FOLD}"
        )


def _split_by_fold(df, fold):
    """Split *df* into training and validation parts for validation fold *fold*.

    Rows whose ``kfold`` value is listed in ``FOLD_MAPPING[fold]`` form the
    training part; rows whose ``kfold`` equals *fold* form the validation part.

    Returns:
        ``(train_df, valid_df, ytrain, yvalid)`` where the frames no longer
        contain the ``id``, ``target`` and ``kfold`` columns and the ``y``
        arrays hold the corresponding ``target`` values.
    """
    train_df = df[df.kfold.isin(FOLD_MAPPING[fold])]
    valid_df = df[df.kfold == fold]

    ytrain = train_df.target.values
    yvalid = valid_df.target.values

    train_df = train_df.drop(_NON_FEATURE_COLUMNS, axis=1)
    valid_df = valid_df.drop(_NON_FEATURE_COLUMNS, axis=1)
    # Make sure both frames present their columns in the same order.
    valid_df = valid_df[train_df.columns]
    return train_df, valid_df, ytrain, yvalid


def _label_encode(train_df, valid_df, df_test):
    """Label-encode every column of *train_df* and *valid_df* in place.

    Each encoder is fitted on the values of the column taken from the
    training, validation and test frames together, so a value that appears in
    any of the three receives a code.

    NOTE(review): every remaining column is encoded, not only string columns;
    columns mixing strings with missing values may not be sortable by
    LabelEncoder -- confirm the input data has no NaNs.

    Returns:
        dict mapping column name to its fitted ``LabelEncoder``.
    """
    label_encoders = {}
    for c in train_df.columns:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(
            train_df[c].values.tolist()
            + valid_df[c].values.tolist()
            + df_test[c].values.tolist()
        )
        # Plain column assignment replaces the column, so the encoded feature
        # takes the encoder's integer dtype.  ``.loc[:, c] = ...`` writes into
        # the existing column in place on pandas >= 2.0 and can leave the
        # original object dtype behind.
        train_df[c] = lbl.transform(train_df[c].values.tolist())
        valid_df[c] = lbl.transform(valid_df[c].values.tolist())
        label_encoders[c] = lbl
    return label_encoders


def main():
    """Train MODEL on every fold except FOLD, score it on FOLD, save artifacts.

    Prints the ROC AUC computed on the validation fold and writes the fitted
    label encoders, the fitted model and the feature column index to
    ``models/``.
    """
    _check_config()
    # Look the model up first so an unknown MODEL name fails before any data
    # is loaded.
    clf = dispatcher.MODELS[MODEL]

    df = pd.read_csv(TRAINING_DATA)
    df_test = pd.read_csv(TEST_DATA)

    train_df, valid_df, ytrain, yvalid = _split_by_fold(df, FOLD)
    label_encoders = _label_encode(train_df, valid_df, df_test)

    # Create the output directory before training so the run cannot finish
    # fitting and then fail while saving.
    os.makedirs("models", exist_ok=True)

    # data is ready
    # training
    clf.fit(train_df, ytrain)
    # Column 1 of predict_proba is scored against the validation targets.
    pred = clf.predict_proba(valid_df)[:, 1]
    print(metrics.roc_auc_score(yvalid, pred))

    joblib.dump(label_encoders, f"models/{MODEL}_{FOLD}_label_encoders.pkl")
    joblib.dump(clf, f"models/{MODEL}_{FOLD}.pkl")
    joblib.dump(train_df.columns, f"models/{MODEL}_{FOLD}_columns.pkl")


if __name__ == "__main__":
    main()