# transcriptClean.py
import os
import re
import argparse
from collections import defaultdict # utility to create the error-log dictionary

import pandas as pd
import pymysql

from name_splitter import * # Code courtesy of Charles McCallum
from normalization_tools import *
class transcriptCleaner:
    def __init__(self, args):
        ## IMPORTING FILE ========================
        print("\nImporting file")
        if args.wd:
            data = pd.read_csv(os.path.join(args.wd, args.file), encoding = "ISO-8859-1", dtype = "object")
        else:
            data = pd.read_csv(args.file, encoding = "ISO-8859-1", dtype = "object")
        self.data = data.fillna("")
        self.errorLog = defaultdict(list)

        ## CREATING REFERENCE LISTS ========================
        print("\nCreating reference lists for some fields")
        conn = pymysql.connect(host = "gall.bnhm.berkeley.edu",
                               user = args.username,
                               passwd = args.password,
                               db = "essig")
        self.essig_collector = pd.read_sql("select name_full, name_short, collector from eme_people where collector = 1", con = conn)
        # Geographic reference lists are read from bundled CSVs rather than the
        # database tables (canadian_provinces, country, mexican_states, state, county)
        self.essig_canprov = pd.read_csv(os.path.join(os.getcwd(), "reference", "essig_canprov.csv"))
        self.essig_country = pd.read_csv(os.path.join(os.getcwd(), "reference", "essig_country.csv"), encoding = "latin-1")
        self.essig_mexstate = pd.read_csv(os.path.join(os.getcwd(), "reference", "essig_mexstate.csv"), encoding = "latin-1")
        self.essig_statecounty = pd.read_csv(os.path.join(os.getcwd(), "reference", "essig_statecounty.csv"))
        self.essig_holdinginst = pd.read_csv(os.path.join(os.getcwd(), "reference", "essig_inst.csv"))
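
    # Columns the pipeline reads from the input transcript (per the methods
    # below): filename, Collector, Begin Date Collected, End Date Collected,
    # Country, State/Province, County, Locality, subject_id, id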
    def normalizeCollector(self):
        print("\nSplitting collector name strings ...")
        # Create empty data frame
        columns = ["Collector_split", "Collector2_split", "Collector3_split", "Collector4_split", "Collector5_split"]
        index = range(len(self.data))
        split_collector = pd.DataFrame(index=index, columns=columns)

        # Split names
        namesplit = NameSplitter()
        split_collectors = [namesplit.split(collectors) for collectors in list(self.data['Collector'])]

        # Fill new data frame
        split_collector = fill_pd(split_collectors, split_collector)

        # NORMALIZE COLLECTOR NAMES ========================
        print("\nNormalizing collector names based on reference list")
        # Build hash table of collectors: normalized key -> canonical full name
        self.essig_collector = self.essig_collector.fillna("")
        self.essig_collector['name_short'] = [col.strip("=") for col in list(self.essig_collector['name_short'])] # Strip the "=" characters used in the essig database
        essig_ref = dict()
        for full_name, name_short in zip(self.essig_collector['name_full'], self.essig_collector['name_short']):
            if full_name != "":
                # Remove spaces and non-alphanumeric characters from reference names
                essig_ref[re.sub(r'[\W_]+', '', full_name).lower()] = full_name
            if name_short != "":
                for sec_name in name_short.split(", "):
                    essig_ref[re.sub(r'[\W_]+', '', sec_name).lower()] = full_name
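
        # Illustrative entry (hypothetical names): name_full "Paula H. Arnaud"
        # and a name_short variant "P.H. Arnaud" would be keyed as "paulaharnaud"
        # and "pharnaud", both mapping back to the full name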
        # To deal with non-ascii characters
        essig_ref_keys = list(essig_ref.keys())

        # Remove spaces and non-alphanumeric characters from collector names
        for col_name in columns:
            split_collector[col_name] = [re.sub(r'[\W_]+', '', col).lower() for col in split_collector[col_name]]

        # Check collector names against the reference list
        threshold = 0.9 # Specify similarity threshold
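        # refcheck (from normalization_tools) is used below as returning a pair:
        # [0] the best-matching reference key (or "" when nothing clears the
        # similarity threshold) and [1] per-row certainty scores; the lookups
        # into essig_ref further down rely on that contract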
        clean_collector1 = refcheck(split_collector['Collector_split'], essig_ref_keys, threshold = threshold)
        clean_collector2 = refcheck(split_collector['Collector2_split'], essig_ref_keys, threshold = threshold)
        clean_collector3 = refcheck(split_collector['Collector3_split'], essig_ref_keys, threshold = threshold)
        clean_collector4 = refcheck(split_collector['Collector4_split'], essig_ref_keys, threshold = threshold)
        clean_collector5 = refcheck(split_collector['Collector5_split'], essig_ref_keys, threshold = threshold)

        # Create new collector data frames
        index = range(len(self.data))
        clean_collector = pd.DataFrame(index = index,
                                       columns = ["Collector", "Collector2", "Collector3", "Collector4", "Collector5"])
        clean_collector_cert = pd.DataFrame(index = index,
                                            columns = ["Collector_cert", "Collector2_cert", "Collector3_cert", "Collector4_cert", "Collector5_cert"])

        # Output normalized collectors and certainty levels
        clean_collector["Collector"] = [essig_ref[key] if key != "" else "" for key in clean_collector1[0]]
        clean_collector["Collector2"] = [essig_ref[key] if key != "" else "" for key in clean_collector2[0]]
        clean_collector["Collector3"] = [essig_ref[key] if key != "" else "" for key in clean_collector3[0]]
        clean_collector["Collector4"] = [essig_ref[key] if key != "" else "" for key in clean_collector4[0]]
        clean_collector["Collector5"] = [essig_ref[key] if key != "" else "" for key in clean_collector5[0]]
        clean_collector_cert["Collector_cert"] = clean_collector1[1]
        clean_collector_cert["Collector2_cert"] = clean_collector2[1]
        clean_collector_cert["Collector3_cert"] = clean_collector3[1]
        clean_collector_cert["Collector4_cert"] = clean_collector4[1]
        clean_collector_cert["Collector5_cert"] = clean_collector5[1]

        # Export collectors
        self.clean_collector = clean_collector
        self.clean_collector_cert = clean_collector_cert
    def normalizeDates(self):
        index = range(len(self.data))
        # Create empty date dataframes
        split_begin_date = pd.DataFrame(index = index, columns = ["MonthCollected", "DayCollected", "YearCollected"])
        split_end_date = pd.DataFrame(index = index, columns = ["MonthCollected2", "DayCollected2", "YearCollected2"])

        # String-split dates (month/day/year) and populate dataframes
        split_begin_dates = [date.split("/") for date in self.data["Begin Date Collected"]]
        split_begin_date = fill_pd(x = split_begin_dates, pd = split_begin_date)
        split_end_dates = [date.split("/") for date in self.data["End Date Collected"]]
        split_end_date = fill_pd(x = split_end_dates, pd = split_end_date)

        # Deal with zeros (they should be coerced to an empty field)
        split_begin_date["MonthCollected"] = [date.replace('00', '') for date in split_begin_date["MonthCollected"]]
        split_begin_date["DayCollected"] = [date.replace('00', '') for date in split_begin_date["DayCollected"]]
        split_begin_date["YearCollected"] = [date.replace('0000', '') for date in split_begin_date["YearCollected"]]
        split_end_date["MonthCollected2"] = [date.replace('00', '') for date in split_end_date["MonthCollected2"]]
        split_end_date["DayCollected2"] = [date.replace('00', '') for date in split_end_date["DayCollected2"]]
        split_end_date["YearCollected2"] = [date.replace('0000', '') for date in split_end_date["YearCollected2"]]
        # Remove incomplete dates where a day but not a month was recovered
        for i in range(len(split_begin_date)):
            if split_begin_date["DayCollected"][i] != "" and split_begin_date["MonthCollected"][i] == "":
                split_begin_date.loc[i, "DayCollected"] = ""
                self.errorLog[self.data["bnhm_id"][i]].append("Incomplete collection begin date")
        for i in range(len(split_end_date)):
            if split_end_date["DayCollected2"][i] != "" and split_end_date["MonthCollected2"][i] == "":
                split_end_date.loc[i, "DayCollected2"] = ""
                self.errorLog[self.data["bnhm_id"][i]].append("Incomplete collection end date")
        # Blank out nonsensical dates (the 31st of a month with at most 30 days);
        # note that February 29/30 and leap years are not checked here
        for i in range(len(split_begin_date)):
            if split_begin_date["MonthCollected"][i] in ["02", "04", "06", "09", "11"] and split_begin_date["DayCollected"][i] == "31":
                split_begin_date.loc[i, "DayCollected"] = ""
                self.errorLog[self.data["bnhm_id"][i]].append("Nonsensical collection begin date")
        for i in range(len(split_end_date)):
            if split_end_date["MonthCollected2"][i] in ["02", "04", "06", "09", "11"] and split_end_date["DayCollected2"][i] == "31":
                split_end_date.loc[i, "DayCollected2"] = ""
                self.errorLog[self.data["bnhm_id"][i]].append("Nonsensical collection end date")
        self.begin_date = split_begin_date
        self.end_date = split_end_date
    def prepMetadata(self):
        index = range(len(self.data))
        # Create dictionary of holding institutions, keyed by three-letter code
        self.essig_holdinginst['Code'] = [abbrev[0:3] for abbrev in self.essig_holdinginst['Abbrev']]
        essig_holdinginst_dict = self.essig_holdinginst.set_index('Code')['Name'].to_dict()
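        # e.g. a code such as "EME" would map to its full institution name
        # (illustrative; actual codes and names come from essig_inst.csv)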
        # Create empty dataframes to hold cleaned-up entries
        taxa_metadata = pd.DataFrame(index = index,
                                     columns = ["bnhm_id", "Genus", "SpecificEpithet", "SubspecificEpithet"])
        other_metadata = pd.DataFrame(index = index,
                                      columns = ["Taxon_Certainty", "HoldingInstitution", "EnteredBy", "BasisOfRecord", "IndividualCount"])

        # Remove jpg names; the transcriptResolver step concatenates duplicate
        # filenames with "|", so take the first of any duplicates
        self.data["filename"] = [fname.split("|")[0] for fname in self.data["filename"]]
        taxon = [name.replace('.jpg', '') for name in list(self.data["filename"])]
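        # Illustrative filename anatomy (hypothetical record): "EMEC12345 Apis
        # mellifera_cf.jpg" -> bnhm_id "EMEC12345", Genus "Apis",
        # SpecificEpithet "mellifera", Taxon_Certainty "cf."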
        # Parse out taxon uncertainty (the "_"-suffix of the filename, if any)
        temp = []
        for name in taxon:
            m = re.search("_.*", name)
            if m is None:
                temp.append("")
            else:
                temp.append(m.group(0))
        other_metadata["Taxon_Certainty"] = temp

        # Remove underscores from taxon_certainty
        other_metadata["Taxon_Certainty"] = [taxon_cert.strip('_') for taxon_cert in other_metadata["Taxon_Certainty"]]

        # Standardize taxon certainty entries to the valid vocabulary
        # ("aff.", "cf.", "gen. nr.", "near", "nr.", "poss.", "sp. nr.", "?");
        # word-boundary patterns with an optional trailing dot keep already-
        # punctuated entries from being double-punctuated (e.g. "gen. nr..")
        for pattern, repl in [(r"\baff\b\.?", "aff."),
                              (r"\bcf\b\.?", "cf."),
                              (r"\bnr\b\.?", "nr."),
                              (r"\bposs\b\.?", "poss."),
                              (r"\bU\b", "?")]:
            other_metadata["Taxon_Certainty"] = [re.sub(pattern, repl, taxon_cert) for taxon_cert in other_metadata["Taxon_Certainty"]]
        # Split taxa names, but take care of possible hybrids
        taxon = [re.sub("_.*", " ", name).strip() for name in taxon]
        split_taxon = [name.split() for name in taxon]
        for x in range(len(split_taxon)):
            temp = split_taxon[x]
            if len(temp) > 4 and temp[3] in ("x", "X"):
                # Collapse "epithetA x epithetB" into a single epithet token
                temp[2] = " ".join([temp[2], temp[3], temp[4]])
                split_taxon[x] = temp[0:3]
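        # e.g. tokens ["EMEC12345", "Genus", "epithetA", "x", "epithetB"] collapse
        # to ["EMEC12345", "Genus", "epithetA x epithetB"] (illustrative tokens)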
        # Fill taxa metadata dataframe
        taxa_metadata = fill_pd(split_taxon, taxa_metadata)
        taxa_metadata['SpecificEpithet'] = ['sp.' if sp == 'sp' else sp for sp in taxa_metadata['SpecificEpithet']]

        # Remove filename version suffixes from bnhm_ids (e.g., turning "EMEC 12345.0" into "EMEC 12345")
        taxa_metadata['bnhm_id'] = [re.sub(r"\..*$", "", k) for k in taxa_metadata['bnhm_id']]

        # Deal with a subsequent change in institution code (CIS -> UCIS)
        taxa_metadata['bnhm_id'] = [re.sub(r'^CIS', 'UCIS', k) for k in taxa_metadata['bnhm_id']]

        # Match abbreviations with their respective holding institution
        other_metadata['HoldingInstitution'] = [bnhm_id[0:3] for bnhm_id in taxa_metadata["bnhm_id"]]
        other_metadata['HoldingInstitution'] = [essig_holdinginst_dict[k] for k in other_metadata['HoldingInstitution']]

        # Populate misc database fields
        other_metadata['IndividualCount'] = 1
        other_metadata['EnteredBy'] = 'Notes from Nature'
        other_metadata['BasisOfRecord'] = 'PreservedSpecimen'
        other_metadata['LifeStage'] = 'adult'
        other_metadata['PreparationType'] = 'pin'

        metadata = pd.concat([taxa_metadata, other_metadata], axis = 1)
        self.data['bnhm_id'] = taxa_metadata['bnhm_id']
        self.metadata = metadata
    def normalizeGeography(self):
        index = range(len(self.data))
        # Create an empty dataframe to hold cleaned-up entries
        split_location = pd.DataFrame(index=index, columns=["Country", "StateProvince", "County", "ContinentOcean"])

        # Ad-hoc spelling changes due to differences between the Zooniverse country
        # list and Essig's country name standards; longer variants (e.g. "Korea South")
        # are replaced before their prefixes ("Korea Sout") to avoid mangled output
        country_fixes = [('Afganistan', 'Afghanistan'),
                         ('United Arab Erimates', 'United Arab Emirates'),
                         ("Cote DIvoire", "Cote D'Ivoire"),
                         ("Curaco", "Curacao"),
                         ("Korea South", "Korea, Republic of"),
                         ("Korea Sout", "Korea, Republic of"),
                         ("Korea North", "Korea, Democratic People's Republic of"),
                         ("Nambia", "Namibia"),
                         ("Philipines", "Philippines"),
                         ("Uraguay", "Uruguay"),
                         ('Iran', 'Iran, Islamic Republic of'),
                         ('Great Britain', 'United Kingdom'),
                         ('Trinidad & Tobago', 'Trinidad and Tobago'),
                         ('Vietnam', 'Viet Nam'),
                         ('Antigua & Barbuda', 'Antigua and Barbuda')]
        for old, new in country_fixes:
            self.data['Country'] = [cty.replace(old, new) for cty in self.data['Country']]
        self.data['Country'] = [re.sub(r'^US$|^U\.S\.$|^U\.S\.A\.$|^USA$|United States of America|united states', 'United States', cty, flags = re.IGNORECASE) for cty in self.data['Country']]
        # Normalizing country name
        # Check whether each country is in the reference database; if there is
        # no exact match, assign "NA"
        print("\nChecking if country entries are valid ...")
        country_ref = list(self.essig_country['name'])
        country_index = [country_ref.index(cty)
                         if cty in country_ref else "NA"
                         for cty in self.data['Country']]

        # Populate continent/ocean based on country
        print("\nPopulating continent field ...")
        continent_ref = list(self.essig_country['continent'])
        split_location["Country"] = [country_ref[index]
                                     if index != "NA" else "NA"
                                     for index in country_index]
        split_location["ContinentOcean"] = [continent_ref[index]
                                            if index != "NA" else "NA"
                                            for index in country_index]
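        # country_index serves as a row-aligned pointer into essig_country: the
        # same positional index retrieves both the canonical country name and
        # its continent, keeping the two output columns consistent per record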
        # Normalize states/provinces
        print("\nChecking if state and province entries are valid ...")
        state_abb_ref = list(self.essig_statecounty["State.Abbrev"])
        state_name_ref = list(self.essig_statecounty["State"])
        state_index = [
            state_abb_ref.index(state) if state in state_abb_ref
            else state_name_ref.index(state) if state in state_name_ref
            else "NA"
            for state, cty in zip(self.data['State/Province'], split_location['Country']) if cty == "United States"]
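        # The filtered comprehension above and the boolean-mask assignment below
        # both visit the "United States" rows in original row order, so the two
        # pair up one-to-one (this relies on split_location sharing self.data's index)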
        split_location.loc[split_location['Country'] == "United States", 'StateProvince'] = [state_name_ref[index]
            if index != "NA" else "NA" for index in state_index]

        canada_ref = list(self.essig_canprov['name'])
        canada_index = [
            canada_ref.index(province)
            if province in canada_ref
            else "NA"
            for province, cty in zip(self.data['State/Province'], split_location['Country']) if cty == "Canada"]
        split_location.loc[split_location['Country'] == "Canada", 'StateProvince'] = [canada_ref[index]
            if index != "NA" else "NA" for index in canada_index]

        mexico_ref = list(self.essig_mexstate['name'])
        mexico_index = [
            mexico_ref.index(mexstate)
            if mexstate in mexico_ref
            else "NA"
            for mexstate, cty in zip(self.data['State/Province'], split_location['Country']) if cty == "Mexico"]
        split_location.loc[split_location['Country'] == "Mexico", 'StateProvince'] = [mexico_ref[index]
            if index != "NA" else "NA" for index in mexico_index]

        # Normalize county
        print("\nChecking if county fields (only for the U.S.) are valid ...")
        county_ref = list(self.essig_statecounty['County'])
        county_index = [
            # Match against the full national county list (not restricted per state)
            county_ref.index(county) if county in county_ref
            else "NA"
            for county, state, cty in zip(self.data['County'], split_location['StateProvince'], split_location['Country']) if cty == "United States"]
        split_location.loc[split_location['Country'] == "United States", 'County'] = [county_ref[index]
            if index != "NA" else "NA" for index in county_index]
        # Append 'County', 'Parish', or 'Borough' to U.S. counties
        split_location = split_location.fillna("")
        split_location["County"] = ["" if county == "NA" or county == ""
                                    else county + " Parish" if state == "Louisiana"
                                    else county + " Borough" if state == "Alaska"
                                    else county + " County"
                                    for county, state in zip(split_location["County"], split_location["StateProvince"])]
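        # e.g. ("Alameda", "California") -> "Alameda County" and
        # ("Orleans", "Louisiana") -> "Orleans Parish" (illustrative pairs)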
        self.geography = split_location
def main():
    ## PREAMBLE ========================
    print("\n\n\n")
    print("=" * 50)
    print("PREPARING TRANSCRIPTS FOR RESOLUTION!!")
    print("Written by Jun Ying Lim (junyinglim@gmail.com) \nfor the Essig Museum of Entomology at UC Berkeley")
    print("=" * 50)
    print("\n\n\n")

    ## CREATE TRANSCRIPTCLEANER INSTANCE ========================
    args = parser.parse_args()
    pipeline = transcriptCleaner(args)

    ## CLEANUP ========================
    ## Prepare metadata
    # Metadata has to be prepped first, as the bnhm_id will be used to log errors in dates
    print("\nPreparing metadata fields in the resolved transcriptions ...")
    pipeline.prepMetadata()
    metadata = pipeline.metadata
    ## Normalize collectors
    print("\nNormalizing collector names in the resolved transcriptions ...")
    pipeline.normalizeCollector()
    collector = pipeline.clean_collector

    ## Normalize dates
    print("\nNormalizing dates in the resolved transcriptions ...")
    pipeline.normalizeDates()
    begin_date = pipeline.begin_date
    end_date = pipeline.end_date

    ## Normalize geography fields
    print("\nNormalizing geography fields in the resolved transcriptions ...")
    pipeline.normalizeGeography()
    geography = pipeline.geography
    # Blank out whole-field "NA" placeholders (exact match, so names that merely
    # contain "NA" are left alone)
    for field in ["Country", "StateProvince", "County", "ContinentOcean"]:
        geography[field] = ["" if x == "NA" else x for x in geography[field]]
    ## MERGING RESULTS ========================
    print("\nCleaning complete; merging results")
    cleanList = [metadata, collector, begin_date, end_date, geography, pipeline.data[["subject_id", "id", "Locality"]]]
    allClean = pd.concat(cleanList, axis = 1)
    allClean.rename(columns={'subject_id':'TranscriptionSubjectIDs', "id":"TranscriptionIDs"}, inplace=True)
    allClean = allClean.fillna("") # remove all NAs

    # Identify duplicated bnhm_ids and log them
    duplicates = list(allClean["bnhm_id"][allClean.duplicated(subset = "bnhm_id", keep = False)])
    for x in duplicates:
        pipeline.errorLog[x].append("Potential duplicate")

    # Remove duplicate bnhm_id numbers (all occurrences are dropped)
    allClean = allClean.drop_duplicates('bnhm_id', keep = False)
    ## EXPORTING RESULTS ========================
    outdir = args.wd if args.wd else os.getcwd()
    print("\nExporting results to", outdir, "...")
    if args.output:
        outputfile = args.output
    else:
        outputfile = "clean_transcript"
    allClean.to_csv(os.path.join(outdir, outputfile + ".csv"), index = False)

    ## PRINT OUT ERRORS ========================
    if len(pipeline.errorLog) > 0:
        print("\nExporting potential errors \n")
        with open(os.path.join(outdir, 'error_transcript.csv'), 'w') as f:
            for i in pipeline.errorLog:
                for j in pipeline.errorLog[i]:
                    f.write('{0},{1}\n'.format(i, j))
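    # error_transcript.csv gets one "<bnhm_id>,<message>" row per logged issue,
    # e.g. "EMEC12345,Potential duplicate" (illustrative id)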
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="transcriptClean - Let's clean some transcripts!")
    parser.add_argument("-wd", help = "Working directory")
    parser.add_argument("-file", "-f", help = "File with transcriptions")
    parser.add_argument("-output", "-o", help = "Output file name")
    parser.add_argument("-username", help = "Username for access to the essig SQL database")
    parser.add_argument("-password", help = "Password for access to the essig SQL database")
    main()