-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathXLSX_replace_values_via_mapping.py
144 lines (107 loc) · 5.23 KB
/
XLSX_replace_values_via_mapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Script for replacing cell values in XLSX based on mapping
# get new words from several CSV files and replace old words in XLSX files
# written for the DigiKAR project by Monika Barget in November 2022
import xlsxwriter
import csv
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
# define files containing ontological mapping
event_ontology='C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\InputLists\\event_ontology.csv'
title_ontology='C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\InputLists\\title_ontology.csv'
function_ontology='C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\InputLists\\function_ontology.csv'
place_ontology='C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\InputLists\\place_ontology.csv'
# open ontology files
with open(event_ontology, encoding="utf-8", errors="ignore") as f_event:
data_e = pd.read_csv(f_event, sep=";")
events_old=data_e['event_old'].values
events_new=data_e['event_type'].values
func_new=data_e['pers_function'].values
#with open(title_ontology, encoding="utf-8", errors="ignore") as f_title: # MAPPING NOT COMPLETE
#data_t = pd.read_csv(f_title, sep=";")
#title_old=data_t['title_old'].values
#events_new=data_t['per_title'].values
#with open(function_ontology, encoding="utf-8", errors="ignore") as f_function: # MAPPING NOT COMPLETE
#data_f = pd.read_csv(f_function, sep=";")
#function_old=data_f['function_old'].values
#function_new=data_f['pers_function'].values
with open(place_ontology, encoding="utf-8", errors="ignore") as f_place:
data_p = pd.read_csv(f_place, sep=";")
places_old=data_p['place_old'].values
places_new=data_p['place_new'].values
# function to process data
def extract_information(filenames):
# read all excel files in directory as one data frame
frame_list=[]
for item in os.listdir(filenames):
file = os.path.join(filenames, item)
print(file)
df = pd.read_excel(file, sheet_name='FactoidList', index_col=None, dtype=str) # axis=1, sort=False
df = df.fillna("@") # replace empty fields for string
frame_list.append(df)
f = pd.concat(frame_list, axis=0, ignore_index=False, sort=False)
# replace words in EVENT column & check if corresponding function needs to be updated
for e_old in events_old:
print(type(e_old))
try:
e_new=data_e.loc[data_e['event_old'] == e_old, 'event_type'].values[0]
print(e_new)
f['event_name'] = f['event_name'].replace(e_old, e_new)
# check if event results in a specific function and add it if necessary
f_rel=data_e.loc[data_e['title_old'] == e_old, 'pers_function'].values[0]
if f_rel==True:
f['title_old'] = f['title_old'].replace(e_old, e_new)
else:
print("No function found.")
continue
except KeyError:
print("Key Error")
continue
# find "hidden" places rows and add values to PLACE column
for p in places_new:
print(p)
try:
p_add=f[f["place_new"].map(lambda place_new: p in place_new) & f["inst_name"].map(lambda inst_name: p in inst_name)]
print(p_add)
# Still raises ValueError: Columns must be same length as key
# Code will be fixed ASAP
f['place_name'] =(f['place_name'].map(str) + "/" + p_add)
except KeyError:
print("Key Error")
continue
print(f)
# write all results to new EXCEL file
workbook='C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\Results\\ProfEvents_FINAL\\Profs_mapped.xlsx'
writer = pd.ExcelWriter(workbook, engine='xlsxwriter') # create a Pandas Excel writer using XlsxWriter as the engine.
f.to_excel(writer, sheet_name='Mapped2') # Convert the dataframe to an XlsxWriter Excel object.
writer.save() # Close the Pandas Excel writer and output the Excel file.
# MAIN FUNCTION: iterate through all XLSX files in directoy
if __name__ == '__main__':
filenames = "C:\\Users\\mobarget\\Documents\\Seafile\\DigiKAR_DATEN\\Python\\Results\\ProfEvents_MAPPING"
extract_information(filenames)
print("Done.")
# ADDITIONAL OPTIONS to be added when all ontology lists are in place:
'''
# replace words in TITLE column
for t_old in title_old:
print(t_old)
try:
t_new=data.loc[data['title_old'] == t_old, 'pers_title'].values[0]
print(e_new)
f['title_old'] = f['title_old'].replace(t_old, t_new)
except KeyError:
print("Key Error")
continue
# replace words in FUNCTION column
for f_old in function_old:
print(f_old)
try:
f_new=data_f.loc[data_f['event_name'] == f_old, 'event_type'].values[0]
print(f_new)
f['event_name'] = f['event_name'].replace(f_old, f_new)
except KeyError:
print("Key Error")
continue
print(f)
'''