-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnormalise_owners.py
59 lines (44 loc) · 1.7 KB
/
normalise_owners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
from collections import defaultdict
from glob import glob
from pathlib import Path
import chardet
import roman
import yaml
def load_owners(filename: str):
    """Read one owners CSV file into a DataFrame, auto-detecting its encoding.

    The file is first read as raw bytes so ``chardet`` can guess its text
    encoding, then parsed with pandas using the first column as the index.
    Malformed lines are skipped (with a warning where pandas supports it)
    and rows in which every column is missing are dropped.

    Args:
        filename: Path of the CSV file to load.

    Returns:
        The parsed owners table as a ``pandas.DataFrame``.
    """
    with open(filename, 'rb') as input_file:
        detected_encoding = chardet.detect(input_file.read())

    # chardet reports ``None`` when it cannot guess an encoding (for example
    # on an empty file); fall back to UTF-8 rather than crash on ``.lower()``.
    encoding = (detected_encoding['encoding'] or 'utf-8').lower()

    read_options = dict(
        encoding=encoding,
        index_col=0,
        dtype={
            'owner_ID': int,
            'owner_descr': str,
            'owner_date': str,
            'owner_type': str,
            'owner_gender': str,
            'owner_source': str,
        },
        na_values=[""],
    )
    try:
        # ``on_bad_lines`` superseded ``error_bad_lines`` in pandas 1.3, and
        # the old keyword was removed entirely in pandas 2.0.
        owners = pd.read_csv(filename, on_bad_lines='warn', **read_options)
    except TypeError:
        # pandas < 1.3 rejects the new keyword; use the legacy spelling.
        owners = pd.read_csv(filename, error_bad_lines=False, **read_options)

    # Discard rows that carry no data at all.
    return owners.dropna(how='all')
def extract_ms_id(filename):
    """Return the second underscore-separated field of the file's stem.

    The stem is the final path component without its last suffix, so
    ``data/input/owner_123.csv`` yields ``"123"``.

    Args:
        filename: Path (or bare name) of the file.

    Returns:
        The text between the first and second underscore of the stem, or
        everything after the first underscore when there is only one.

    Raises:
        IndexError: If the stem contains no underscore.
    """
    pieces = Path(filename).stem.split('_')
    return pieces[1]
def process_owners(filename: str):
    """Load one owners CSV, write a UTF-8 copy, and return its data and id.

    The copy is written under the same file name into an ``output``
    directory that sits next to the input file's own directory (it is
    created if missing).

    Args:
        filename: Path of the owners CSV file to process.

    Returns:
        A ``(owners, ms_id)`` tuple: the DataFrame produced by
        ``load_owners`` and the identifier produced by ``extract_ms_id``.
    """
    source = Path(filename)

    # The output directory is a sibling of the directory holding the input.
    output_dir = source.parent.parent / "output"
    output_dir.mkdir(exist_ok=True)

    owners = load_owners(filename)
    owners.to_csv(output_dir / source.name, encoding="utf-8")

    return owners, extract_ms_id(filename)
def main():
    """Process every ``data/input/owner_*.csv`` file and combine the results.

    Each matching file is passed to ``process_owners``. Files that raise are
    reported on stdout and skipped so one bad file does not abort the batch.
    The successfully loaded frames are concatenated, keyed by their
    identifier in an index level named ``MS_ID``, and written to
    ``data/output/all_owners.csv`` as UTF-8. When nothing could be loaded,
    a message is printed and no combined file is written.
    """
    # glob yields matches in arbitrary order; sort so the combined file has
    # a reproducible row order from run to run.
    files = sorted(glob('data/input/owner_*.csv'))
    ms_identifiers = []
    owners_frames = []
    for filename in files:
        print("Working on", filename)
        try:
            owners, ms_id = process_owners(filename)
        except Exception as e:
            # Deliberate best-effort: report the failure and carry on with
            # the remaining files.
            print("ERROR in {0}: {1}".format(filename, e))
        else:
            owners_frames.append(owners)
            ms_identifiers.append(ms_id)
            print("Done")

    # pd.concat raises ValueError on an empty sequence, which happens when
    # no input file matched or every file failed to load.
    if not owners_frames:
        print("No owner files could be processed; nothing to combine")
        return

    all_owners = pd.concat(owners_frames, keys=ms_identifiers, names=["MS_ID"])
    all_owners.to_csv('data/output/all_owners.csv', encoding="utf-8")
# Run the batch only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()