-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgeneralize.py
109 lines (99 loc) · 3.42 KB
/
generalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import collections
# Remove duplicate elements from a list (first-seen order is kept); the ' ?' entry is dropped as well.
def RemDup(lis):
    """Return the elements of ``lis`` without repeats, in first-seen order.

    The literal ' ?' (leading space included) is never kept, even when it
    occurs only once.  NOTE(review): ' ?' is presumably the dataset's
    missing-value marker -- confirm against the loader.
    """
    kept = []
    for element in lis:
        # Skip anything already collected, and always skip the ' ?' entry.
        if element in kept or element == ' ?':
            continue
        kept.append(element)
    return kept
# Hierarchical hash tables: level number -> {generalized label: [labels it replaces]}
# Country hierarchy: level -> {parent label: [labels covered by that parent]}.
# generalizeCountry compares these strings verbatim with record values, so the
# leading space and the exact spellings (e.g. " Columbia", " Hong") must not
# be "corrected".
HHTCountry = {
    1: {
        " North America": [
            " United-States",
            " Puerto-Rico",
            " Canada",
            " Outlying-US(Guam-USVI-etc)",
            " Cuba",
            " Honduras",
            " Jamaica",
            " Mexico",
            " Dominican-Republic",
            " Haiti",
            " Guatemala",
            " Nicaragua",
            " El-Salvador",
            " Trinadad&Tobago",
        ],
        " South America": [
            " Ecuador",
            " Peru",
            " Columbia",
        ],
        " Europe": [
            " England",
            " Germany",
            " Greece",
            " Italy",
            " Poland",
            " Portugal",
            " Ireland",
            " France",
            " Hungary",
            " Scotland",
            " Yugoslavia",
            " Holand-Netherlands",
        ],
        " Asia": [
            " India",
            " Japan",
            " South",
            " China",
            " Iran",
            " Philippines",
            " Vietnam",
            " Laos",
            " Taiwan",
            " Thailand",
            " Hong",
            " Cambodia",
        ],
    },
    # Top level: every continent collapses into the root label.
    2: {
        " *": [" North America", " South America", " Asia", " Europe"],
    },
}
# Workclass hierarchy: level -> {parent label: [labels covered by that parent]}.
# Strings are compared verbatim with record values (leading space included).
HHTWorkclass = {
    1: {
        " Charity": [" Without-pay"],
        " Unemployed": [" Never-worked"],
        " Entrepreneur": [
            " Private",
            " Self-emp-not-inc",
            " Self-emp-inc",
        ],
        " Central-gov": [" Federal-gov", " State-gov"],
        " Territory-gov": [" Local-gov"],
    },
    2: {
        " Non-gov": [" Charity", " Unemployed", " Entrepreneur"],
        " gov": [" Central-gov", " Territory-gov"],
    },
    # Top level: both sectors collapse into the root label.
    3: {
        " *": [" Non-gov", " gov"],
    },
}
# Marital-status hierarchy: level -> {parent label: [labels covered by it]}.
# Strings are compared verbatim with record values (leading space included).
HHTMarital = {
    1: {
        " Married": [
            " Married-civ-spouse",
            " Separated",
            " Married-spouse-absent",
            " Married-AF-spouse",
        ],
        " Unmarried": [
            " Divorced",
            " Never-married",
            " Widowed",
        ],
    },
    # Top level: both groups collapse into the root label.
    2: {
        " *": [" Married", " Unmarried"],
    },
}
# Sex hierarchy: a single level in which both values collapse into the root
# label.  Strings are compared verbatim with record values (leading space
# included).
HHTSex = {
    1: {
        " *": [" Male", " Female"],
    },
}
def _generalize(data, field, hierarchy, drop_missing_first=True):
    """Collapse the ``field`` values found in ``data`` into one shared label.

    The distinct values are lifted through ``hierarchy`` one level at a time,
    in ascending level order.  At each level every value listed under a
    parent label is replaced by that parent, and the result is passed through
    ``RemDup`` (which removes repeats and any ' ?' entry).  Lifting stops as
    soon as at most one label is left.

    Args:
        data: iterable of records, each indexable by ``field``.
        field: key of the attribute to generalize.
        hierarchy: mapping ``level -> {parent label: [child labels]}``.
        drop_missing_first: when true, ' ?' entries are removed before any
            lifting, so they never force a more general label on their own.

    Returns:
        The single label left after lifting (the original value itself when
        all records already agree).  ' ?' when nothing but ' ?' was present.
        When the values still disagree after the last level -- i.e. some
        value is not listed in ``hierarchy`` -- the first parent label of the
        highest level is returned instead of failing with ``KeyError``.

    Raises:
        IndexError: if ``data`` yields no records (same exception type the
            previous implementation produced for empty input).
    """
    # Distinct values in first-seen order; ' ?' is still included here.
    distinct = []
    for record in data:
        value = record[field]
        if value not in distinct:
            distinct.append(value)
    if not distinct:
        raise IndexError("cannot generalize '%s': data is empty" % field)

    if drop_missing_first:
        distinct = RemDup(distinct)

    for level in sorted(hierarchy):
        if len(distinct) <= 1:
            break
        # child label -> parent label for this level; the first group that
        # lists a child wins, matching a top-to-bottom scan of the groups.
        parent_of = {}
        for parent, children in hierarchy[level].items():
            for child in children:
                parent_of.setdefault(child, parent)
        distinct = RemDup([parent_of.get(value, value) for value in distinct])

    if not distinct:
        # Only ' ?' entries were present and they have been filtered out.
        return ' ?'
    if len(distinct) == 1:
        return distinct[0]
    # Hierarchy exhausted without agreement: fall back to the top-level label.
    return next(iter(hierarchy[max(hierarchy)]))


def generalizeCountry(data):
    """Return one label covering the "nativeCountry" value of every record.

    Values are lifted through ``HHTCountry`` until they coincide; if all
    records already share one value, that value is returned unchanged.

    NOTE(review): unlike the other generalize* functions, ' ?' is not removed
    before lifting here, so a ' ?' next to a single country still lifts that
    country by one level.  This matches the previous behavior; confirm
    whether it is intended.

    Raises:
        IndexError: if ``data`` is empty.
    """
    return _generalize(data, "nativeCountry", HHTCountry,
                       drop_missing_first=False)


def generalizeWorkclass(data):
    """Return one label covering the "workclass" value of every record.

    ' ?' entries are ignored; the remaining values are lifted through
    ``HHTWorkclass`` until they coincide.  Returns ' ?' when every record
    holds ' ?'.

    Raises:
        IndexError: if ``data`` is empty.
    """
    return _generalize(data, "workclass", HHTWorkclass)


def generalizeMaritalStat(data):
    """Return one label covering the "maritalStatus" value of every record.

    ' ?' entries are ignored; the remaining values are lifted through
    ``HHTMarital`` until they coincide.  Returns ' ?' when every record
    holds ' ?'.

    Raises:
        IndexError: if ``data`` is empty.
    """
    return _generalize(data, "maritalStatus", HHTMarital)


def generalizeSex(data):
    """Return one label covering the "sex" value of every record.

    ' ?' entries are ignored; the remaining values are lifted through
    ``HHTSex`` until they coincide.  Returns ' ?' when every record holds
    ' ?'.

    Raises:
        IndexError: if ``data`` is empty.
    """
    return _generalize(data, "sex", HHTSex)