-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.py
140 lines (110 loc) · 5.09 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# coding: utf-8
__author__ = 'Paul Cunningham'
# Conventional module dunder; the original spelling lacked the trailing
# underscores, so `__copyright__` was never actually defined.
__copyright__ = 'Copyright 2024, Paul Cunningham'
# Misspelled legacy name, kept as an alias so existing references still work.
__copyright = __copyright__
import logging
import pathlib
import camelot
import petl
import re
# Module-wide logger. getLogger() without a name returns the root logger.
logger = logging.getLogger()
# Matches a run of one or two capital letters; it is applied with re.search
# to post code cells to obtain the post code area. Read the whole match
# (group 0): the capture group is repeated, so group 1 only holds the last
# letter matched.
POST_CODE_AREA_RE = r'([A-Z]){1,2}'
# Pages 1-6 and the first table area on page 7 are Zone 1
# Page 7 also contains Zone 2 and Zone 3
# Zone is in column 0
# Area is in Column 1
# Location is in Column 2
# Zone is in Column 3
# Post Codes Columns 4 - 12
class Extractor(object):
    """Extract a zone / post code table from a PDF and save it to a file.

    The PDF is read with camelot and the tables are reshaped with petl into
    rows of ``Zone``, ``Post Code``, ``Area`` and ``Location``.
    """

    def __init__(self, input_filename):
        """Remember the PDF to read; nothing is parsed until ``extract``.

        :param input_filename: path of the PDF handed to ``camelot.read_pdf``.
        """
        self.input_filename = input_filename
        # Accumulators for the section currently being processed; created by
        # init_temp_tables().
        self.unique_area_location_table = None
        self.unique_postcode_table = None

    def init_temp_tables(self):
        """Reset both accumulator tables to header-only petl tables."""
        self.unique_area_location_table = petl.empty().setheader(['Area', 'Location'])
        self.unique_postcode_table = petl.empty().setheader(['Post Code', 'Area'])

    @staticmethod
    def supported_file_extensions():
        """Return the lower-case output file suffixes ``save_to`` accepts."""
        return ['.json', '.csv', '.xlsx']

    def post_code_generator(self, row):
        """Yield a ``(post code, area)`` pair for each post code cell in *row*.

        The area is the first match of ``POST_CODE_AREA_RE`` in the cell.
        Empty cells are ignored. Non-empty cells without any match are skipped
        with a warning instead of aborting the extraction.

        :param row: iterable of string cells holding post codes.
        """
        for cell in row:
            if cell == '':
                continue
            area_match = re.search(POST_CODE_AREA_RE, cell)
            if area_match is None:
                # Stray text picked up from the PDF; there is no area to derive.
                logger.warning("Skipping cell without a post code area: %r", cell)
                continue
            yield cell, area_match.group(0)

    def extract_table(self, table, skip_row_count):
        """Append one camelot table to the two accumulator tables.

        :param table: camelot table; its ``df`` data frame is the input.
        :param skip_row_count: number of leading rows to drop (petl counts
            the header row too); the next row becomes the header and must
            contain the ``Zone``, ``Area`` and ``Location`` fields.
        """
        page_table = petl.fromdataframe(table.df, include_index=False)
        page_table = petl.skip(page_table, skip_row_count)
        page_table = petl.cutout(page_table, "Zone")

        # Rows with a non-empty Area carry the Area -> Location pairs.
        page_area_location = petl.selectne(page_table, "Area", '').cut("Area", "Location")
        self.unique_area_location_table = petl.stack(
            self.unique_area_location_table, page_area_location
        )

        # Everything except Area/Location is a post code cell; flatten those
        # cells into one (Post Code, Area) row per post code.
        page_post_codes = petl.cutout(page_table, "Area", "Location")
        page_postcode_rows = petl.rowmapmany(
            page_post_codes,
            self.post_code_generator,
            # Matches the two values post_code_generator yields per row.
            header=['Post Code', 'Area'],
        )
        self.unique_postcode_table = petl.stack(
            self.unique_postcode_table, page_postcode_rows
        )

    def convert_table(self, zone):
        """Build the output rows for the section currently accumulated.

        :param zone: value stored in the ``Zone`` field of every row.
        :return: petl table with fields ``Zone``, ``Post Code``, ``Area``,
            ``Location``. A ``KeyError`` is raised when the table is iterated
            if a post code's area has no entry in the area/location table.
        """
        area_location_lookup = petl.lookup(self.unique_area_location_table, 'Area', 'Location')
        # Add the Zone column first and the Location column last.
        output_table = petl.addfield(self.unique_postcode_table, "Zone", zone, 0)
        output_table = petl.addfield(output_table, "Location", None, 3)
        # Fill in Location via the lookup, using Area as key; the first
        # location recorded for an area wins.
        output_table = petl.convert(
            output_table,
            'Location',
            lambda v, row: area_location_lookup[row["Area"]][0],
            pass_row=True
        )
        return output_table

    def extract(self):
        """Read every zone section of the PDF and return one combined table.

        Pages 1-6 are read whole as Zone 1; page 7 is read three times with
        different table areas for Zones 1, 2 and 3.

        :return: petl table with fields ``Zone``, ``Post Code``, ``Area``,
            ``Location``.
        """
        zone_1_tables_page_1_6 = camelot.read_pdf(self.input_filename, flavor='stream', pages='1-6')
        zone_1_tables_page_7 = camelot.read_pdf(self.input_filename, flavor='stream', pages='7',
                                                table_areas=['40, 770, 537, 537'])
        zone_2_tables_page_7 = camelot.read_pdf(self.input_filename, flavor='stream', pages='7',
                                                table_areas=['40,510,537,320'])
        zone_3_tables_page_7 = camelot.read_pdf(self.input_filename, flavor='stream', pages='7',
                                                table_areas=['40,290,537,180'])

        # Zone 1, pages 1 - 6
        self.init_temp_tables()
        for table in zone_1_tables_page_1_6:
            self.extract_table(table, skip_row_count=3)

        # Fix up problematic areas ('KT', 'Kingston-Upon-Thames'),
        # ('SE', 'London - South East').
        for area, location in (('KT', 'Kingston-Upon-Thames'),
                               ('SE', 'London - South East')):
            self.unique_area_location_table = petl.convert(
                self.unique_area_location_table,
                "Location",
                # Bind the loop values as defaults to avoid late binding.
                lambda v, location=location: location,
                where=lambda r, area=area: r['Area'] == area
            )
        output_table = self.convert_table(1)

        # Page 7 holds the tail of Zone 1 followed by Zones 2 and 3; each
        # section has its own number of leading rows to skip.
        page_7_sections = (
            (zone_1_tables_page_7, 4, 1),
            (zone_2_tables_page_7, 2, 2),
            (zone_3_tables_page_7, 2, 3),
        )
        for tables, skip_row_count, zone in page_7_sections:
            self.init_temp_tables()
            for table in tables:
                self.extract_table(table, skip_row_count=skip_row_count)
            output_table = petl.stack(output_table, self.convert_table(zone))

        return output_table

    def save_to(self, output_filename):
        """Extract the PDF and write the result to *output_filename*.

        The format is chosen from the file suffix (case-insensitive). An
        unsupported suffix is logged as an error and nothing is extracted or
        written.

        :param output_filename: destination path ending in one of
            ``supported_file_extensions()``.
        """
        ext = pathlib.Path(output_filename).suffix.lower()

        if ext not in Extractor.supported_file_extensions():
            logger.error(
                f"Unsupported output file:{ext}, extensions supported are: {','.join(Extractor.supported_file_extensions())}")
            return

        output_table = self.extract()

        if ext == '.xlsx':
            petl.io.xlsx.toxlsx(output_table, output_filename)
        elif ext == '.csv':
            petl.tocsv(output_table, output_filename)
        elif ext == '.json':
            petl.tojson(output_table, output_filename)