-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_names.py
56 lines (39 loc) · 1.47 KB
/
parse_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from enum import Enum
import pandas as pd
import json
import math
from utils import clean_arabic_chars, clean_irabs
class Gender(Enum):
    """Gender label attached to a parsed name.

    The member *values* are the strings stored in the generated JSON.
    """

    male = "MALE"
    female = "FEMALE"
    # Used when neither gender marker is present for a row.
    unknown = "UNKNOWN"
def parse_csv(input_filename):
    """Parse a CSV file of names into a dict keyed by the cleaned name.

    The name is read from the ``first_name`` column if present, otherwise
    from ``Naam``.  Each name is normalised with ``clean_arabic_chars`` and
    then ``clean_irabs`` (both imported from ``utils``).

    Gender is derived from two optional marker columns: a non-empty
    ``Pesar`` cell yields ``Gender.male``; otherwise a non-empty ``Dokhtar``
    cell yields ``Gender.female``; otherwise ``Gender.unknown``.

    Args:
        input_filename: Path of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        A dict mapping each cleaned name to
        ``{"name": <cleaned name>, "gender": <Gender value string>}``.
        When several rows clean to the same name, the last row wins.
        Returns ``None`` if the file has neither a ``first_name`` nor a
        ``Naam`` column.
    """
    df = pd.read_csv(input_filename)

    if "first_name" in df.columns:
        name_column = "first_name"
    elif "Naam" in df.columns:
        name_column = "Naam"
    else:
        # Unrecognised layout: kept as None so existing callers that test
        # for it continue to work.
        return None

    # Column presence does not change per row, so check it once.
    has_male_column = "Pesar" in df.columns
    has_female_column = "Dokhtar" in df.columns

    json_data = {}
    for _, row in df.iterrows():
        raw_name = row[name_column]
        # An empty cell is read by pandas as NaN (a float); there is no
        # name to clean or to key on, so skip the row.
        if pd.isna(raw_name):
            continue
        name = clean_irabs(clean_arabic_chars(raw_name))

        # pd.notna works for any cell type.  math.isnan raised TypeError
        # when the marker cell held a non-numeric value such as a string.
        if has_male_column and pd.notna(row["Pesar"]):
            gender = Gender.male
        elif has_female_column and pd.notna(row["Dokhtar"]):
            gender = Gender.female
        else:
            gender = Gender.unknown

        json_data[name] = {"name": name, "gender": gender.value}
    return json_data
if __name__ == "__main__":
    # Parse both source files, merge them, and write the result to
    # names.json, then report how many names each step produced.
    names_1_jsondata = parse_csv(input_filename="names_1.csv")
    names_2_jsondata = parse_csv(input_filename="names_2.csv")

    # parse_csv returns None when a file has no recognised name column.
    # Stop with a clear message instead of failing later on `{**None}`
    # or `len(None)` with an opaque TypeError.
    for source, parsed in (
        ("names_1.csv", names_1_jsondata),
        ("names_2.csv", names_2_jsondata),
    ):
        if parsed is None:
            raise SystemExit(
                f"{source}: no 'first_name' or 'Naam' column found."
            )

    # On duplicate names the entry from the second file overrides the first.
    names_json_data = {**names_1_jsondata, **names_2_jsondata}

    # ensure_ascii=False keeps non-ASCII names readable in the output file.
    names = json.dumps(names_json_data, ensure_ascii=False)
    filename = "names.json"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(names)

    print(f"First file contains {len(names_1_jsondata)} names.")
    print(f"Second file contains {len(names_2_jsondata)} names.")
    print(f"Finally {len(names_json_data)} distinct names parsed.")