-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path015_SPLITTING_DATA.py
37 lines (25 loc) · 1.2 KB
/
015_SPLITTING_DATA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Split the classification dataset into a pre-training sample plus training,
# validation and test sets, and write each one as a CSV file under
# DATA/CLASSIFICATION/<SEED>/.

# Run Utils.py inside this module's namespace. The names `pd`,
# `train_test_split` and `create_dir` used below are not defined in this
# script, so they presumably come from Utils.py -- TODO confirm.
# NOTE: exec runs whatever Utils.py contains; only use with a trusted file.
# The context manager closes the file handle once the source has been read.
with open("Utils.py") as utils_source:
    exec(utils_source.read(), globals())

dir_data = 'DATA/CLASSIFICATION/'
data = pd.read_csv(dir_data + "dataset.csv")

SEED = 741

# Guard for the case where the assignment above is removed and no SEED has
# been defined beforehand. While `SEED = 741` is present, the NameError
# branch cannot run and the `else` branch always reports the seed.
try:
    SEED
except NameError:
    SEED = 123
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print('SEED does not exists and will be set automatically to %s' % SEED)
else:
    print('The SEED is %s' % SEED)

# Wrap the shape in a 1-tuple so the whole tuple is formatted by one %s.
print('The dimension of complete dataset is %s' % (data.shape,))

# Every output of this run goes into a seed-specific sub-directory.
dir_dest = dir_data + str(SEED) + '/'
create_dir(dir_dest)

# Pre-training sample. Assuming scikit-learn semantics for train_test_split
# (TODO confirm), test_size=0.9 leaves 10% of the rows in the first result.
# NOTE(review): this split and the train/test split below both start from the
# full `data`; the pre-training rows are not explicitly carved out of the
# training portion. Verify that this is intended.
variable_sub_dataset, modeling_dataset = train_test_split(
    data, test_size=0.9, random_state=SEED)
variable_sub_dataset.to_csv(dir_dest + 'pre_training_set.csv', index=False)
# `modeling_dataset` (the remaining rows) is intentionally not written to
# disk. If that export is ever restored, SEED must be wrapped in str() before
# being concatenated into the file name.

# Hold out a test set from the full data, then split the remainder into
# training and validation sets, reusing the same seed for reproducibility.
training_data, test_set = train_test_split(
    data, test_size=0.2, random_state=SEED)
training_set, validation_set = train_test_split(
    training_data, test_size=0.1, random_state=SEED)

training_set.to_csv(dir_dest + 'training_set.csv', index=False)
validation_set.to_csv(dir_dest + 'validation_set.csv', index=False)
test_set.to_csv(dir_dest + 'test_set.csv', index=False)