-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen-metafeatures.py
163 lines (127 loc) · 5.38 KB
/
gen-metafeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# FILE: gen-metafeatures.py
# AUTHOR: Julia Sloan
# DATE: Jan. 2021
import pandas as pd
import numpy as np
from copy import deepcopy
import scipy.special as sp
import sys
import os
# Sentinel stored in a quotient metafeature wherever the denominator is 0.
# It is sys.maxsize scaled down by a factor of 10**8 and truncated to an int,
# so it is a large finite value that stays far below sys.maxsize itself.
BIG_M = int(sys.maxsize / 10**8)
# determine the number of final metafeatures
# note num_cols = num features
def get_num_metas(num_cols, is_mul, is_div, is_add, is_sub):
num_metas = 0
if is_mul:
num_metas += int(sp.comb(num_cols, 2) + num_cols)
if is_div: # account for both num/dem combinations, (i.e. a/b and b/a)
num_metas += 2 * int(sp.comb(num_cols, 2))
if is_add:
num_metas += int(sp.comb(num_cols, 2))
if is_sub:
num_metas += int(sp.comb(num_cols, 2))
return num_metas
def prepend_line(file, line):
    """Insert `line`, followed by a newline, at the beginning of a text file.

    The new content is first written to a scratch file named `file + '.dum'`
    and then moved over the original.

    Parameters:
        file: path of the text file to modify.
        line: text of the new first line, without a trailing newline.

    Raises:
        OSError: if `file` cannot be read or the scratch file cannot be
            written or moved into place.
    """
    dummy = file + '.dum'
    try:
        # open original file in read mode and dummy in write mode
        with open(file, 'r') as read_obj, open(dummy, 'w') as write_obj:
            # new first line, then the original contents unchanged
            write_obj.write(line + '\n')
            write_obj.writelines(read_obj)
        # A single replace instead of remove + rename: there is no moment at
        # which `file` is missing, and it overwrites an existing target on
        # every platform.
        os.replace(dummy, file)
    except Exception:
        # do not leave a half-written scratch file behind
        if os.path.exists(dummy):
            os.remove(dummy)
        raise
def make_csv(res, col_names, csv_name):
    """Write a 2D result array to a .csv file with a header row.

    Parameters:
        res:       2D array-like of values, one row per sample.
        col_names: column titles, one per column of `res`.
        csv_name:  path of the .csv file to create.
    """
    # convert 2D array to dataframe and add column titles
    final = pd.DataFrame(res, columns=col_names)
    # index=False: write only the data columns, no row-index column
    final.to_csv(csv_name, index=False)
def make_in(res, in_name):
    """Write a 2D array to a tab-separated .in text file with a size header.

    The first line of the file is "<num samples>\t<num features>"; every
    following line is one row of `res` in np.savetxt's default number format.

    Parameters:
        res:     2D array of numeric values, one row per sample.
        in_name: path of the .in file to create.
    """
    # first line for the .in file (num samples, num features)
    num_samples = np.shape(res)[0]
    num_feats_all = np.shape(res)[1]
    first_line = str(num_samples) + '\t' + str(num_feats_all)
    # Let savetxt emit the size line itself; comments='' suppresses the
    # default '# ' prefix so the header is written verbatim. This avoids
    # writing the file and then rewriting it just to add one line.
    np.savetxt(in_name, res, delimiter="\t", header=first_line, comments='')
# Command-line driver.
#
# usage: python gen-metafeatures.py <data.csv> [mul] [div] [add] [sub]
#
# Reads <data.csv> (column 0 is the target, the remaining columns are the
# features), builds one metafeature column per feature pair for every
# requested operation, and writes target + original features + metafeatures
# to "<data>-METAFEATURES.csv".


def _safe_divide(numer, denom):
    """Return numer / denom element-wise, with BIG_M wherever denom is 0."""
    quotient = np.full(len(denom), BIG_M, dtype=float)
    nonzero = np.asarray(denom != 0, dtype=bool)
    # only the rows with a non-zero denominator are actually divided
    quotient[nonzero] = numer[nonzero] / denom[nonzero]
    return quotient


if len(sys.argv) < 2:
    sys.exit("usage: python gen-metafeatures.py <data.csv> [mul] [div] [add] [sub]")

# access command-line arguments (operations)
file = sys.argv[1]
ops = sys.argv[2:]
is_mul = 'mul' in ops
is_div = 'div' in ops
is_add = 'add' in ops
is_sub = 'sub' in ops

# a mistyped operation would otherwise silently produce no metafeatures
unknown_ops = [op for op in ops if op not in ('mul', 'div', 'add', 'sub')]
if unknown_ops:
    print("Warning: ignoring unrecognised operation(s): " + ", ".join(unknown_ops),
          file=sys.stderr)

# get data from spreadsheet
df = pd.read_csv(file)

# all column names; metafeature names are appended to this list below
names = list(df.columns.values)
# feature names, excluding the target (at column 0)
feat_names = names[1:]

# convert dataframe to np array, isolate target values from features
arr = df.to_numpy()
# first column (the target values), kept 2D so it can be stacked as a column
target_vals = arr[:, :1]
# all features as a 2D np array
feats = arr[:, 1:]
num_rows, num_cols = feats.shape

# get number of metafeatures based on inputs
num_metas = get_num_metas(num_cols, is_mul, is_div, is_add, is_sub)
# initialize metafeatures np array
metas = np.zeros((num_rows, num_metas))

# go through all combos of columns, adding to the metafeatures array;
# i is the next free metafeature column and advances in step with `names`
i = 0
for c1 in range(num_cols):
    for c2 in range(c1, num_cols):
        name1, name2 = str(feat_names[c1]), str(feat_names[c2])
        col1, col2 = feats[:, c1], feats[:, c2]

        if is_mul:  # the only operation that also pairs a feature with itself
            names.append(name1 + "*" + name2)
            metas[:, i] = col1 * col2
            i += 1

        if c1 == c2:  # remaining operations exclude the same-feature case
            continue

        if is_div:  # both orientations: c1/c2 and c2/c1
            names.append(name1 + "/" + name2)
            metas[:, i] = _safe_divide(col1, col2)
            i += 1
            names.append(name2 + "/" + name1)
            metas[:, i] = _safe_divide(col2, col1)
            i += 1

        if is_add:
            names.append(name1 + "+" + name2)
            metas[:, i] = col1 + col2
            i += 1

        if is_sub:
            names.append(name1 + "-" + name2)
            metas[:, i] = col1 - col2
            i += 1

# target first, then the original features, then the metafeatures
res = np.hstack((target_vals, feats, metas))

# create .csv file with metafeatures; splitext strips whatever extension the
# input has instead of assuming it is exactly four characters long
make_csv(res, names, os.path.splitext(file)[0] + '-METAFEATURES.csv')

# output message to tell user program was successful
print("\nDone! Check directory of " + file + " for output")