-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilities.py
244 lines (171 loc) · 8.84 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
"""
Author: Lorenzo Rinaldi, SESAM
"""
import pandas as pd
import bibtexparser
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
import os, fnmatch
import shutil
import copy
_PAPERNAME_METHOD = {'LR': {'items': ['Year','Journal','Author','Title'],
'minimum_requirements': ['Year','Author'],
'separator':'_'}}
_SPECIAL_CHARACTERS = [(' ','_'), ('<',''), ('>',''), (':','-'), ('"',''), ('/','-'),('\\','-'),('?',''),('*','')]
#%%
class functions():
"""
This class contains general functions useful in multiple occasions.
Notes for the contributors:
1) if you add a new function, please make sure you use the same syntax as the functions already implemented
and make sure to add a documentation
2) thank you for the support!!!
"""
def bibtex_parser(bib_path:str, xlsx_path:str, properties=['year', 'title', 'author', 'journal', 'doi'], doi_links=True):
"""
This function generates an .xlsx file from a list of documents contained in a .bib file
Args:
bib_path: string containing the path where to import the .bib file
xlsx_path: string containing the path where to export the .xlsx file
properties: list containing the properties to be parsed from the imported documents
doi_links: bool; if True the doi columns will contain clickable hyperlinks
Returns:
selection: pandas DataFrame containing the parsed information of about the imported documents contained in the .bib file
"""
with open(bib_path, encoding="utf8") as bibtex_file:
bib_database = bibtexparser.load(bibtex_file)
df = pd.DataFrame(bib_database.entries)
selection = df[properties]
if 'title' in selection.columns:
title_position = list(selection.columns).index('title')
for i in range(selection.shape[0]):
try:
selection.iloc[i,title_position] = selection.iloc[i,title_position].split('{')[1].split('}')[0]
except:
pass
if 'year' in selection.columns:
year_position = list(selection.columns).index('year')
for i in range(selection.shape[0]):
try:
selection.iloc[i,year_position] = int(selection.iloc[i,year_position])
except:
pass
if 'doi' in selection.columns and doi_links==True:
doi_position = list(selection.columns).index('doi')
for i in range(selection.shape[0]):
try:
selection.iloc[i,doi_position] = 'https://doi.org/'+selection.iloc[i,doi_position]
except:
pass
selection.to_excel(xlsx_path, index=False)
return selection
def get_files_names(path:str, file_extensions:list):
"""
This function generates a list containing names of files contained in a given path
Args:
path: directory to search
file_extension: list of the extensions to be selected
Returns:
filenames: dictionary whose keys are related to the extensions and the values are the strings containing the names of the files with the desired extension
"""
listOfFiles = os.listdir(path)
filenames = {}
for item in file_extensions:
extension = "*.{}".format(item)
filenames[item] = []
for entry in listOfFiles:
if fnmatch.fnmatch(entry, extension):
filenames[item] += [os.path.join(path,entry)]
return filenames
def get_pdf_metadata(file_paths):
"""
This function generates a dictionary containing pdf metadata
Args:
file_paths: list or dictionary containing paths of pdf files to be analysed
file_extension: list of the extensions to be selected
Returns:
filenames: dictionary whose keys are related to the extensions and the values are the strings containing the names of the files with the desired extension
"""
if isinstance(file_paths, dict):
pdf_files = file_paths['pdf']
elif isinstance(file_paths, list):
pdf_files = copy.deepcopy(file_paths)
pdf_metadata = {}
for file in pdf_files:
try:
fp = open(file, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
pdf_metadata[file] = {}
for i in doc.info[0].keys():
try:
pdf_metadata[file][i] = doc.info[0][i].decode("utf-8")
except:
pass
fp.close()
except:
print('WARNING: problems with file named "{}"'.format(file))
return pdf_metadata
def rename_articles(metadata:dict, method:str, check=False):
"""
This function renames pdf scientific articles according to pdf metadata
SUPPORT FOR ELSEVIER ARTICLES ONLY AT THE MOMENT
Args:
metadata: dictionary whose keys are the paths of the pdf files, the values are the metadata
method: string representing the renaming criterion, to be defined in the _PAPERNAME_METHOD dictionary in the upper part of this script
check: boolean to check the proceeding of the function step by step and show up when there are mistakes
"""
for file in list(metadata.keys()):
if check==True:
print(file)
print('\n')
"Replacing special characters from title, if available"
try:
title = copy.deepcopy(metadata[file]['Title'])
counter = 0
for i in title:
if i in [spchar[0] for spchar in _SPECIAL_CHARACTERS]:
metadata[file]['Title'] = metadata[file]['Title'][:counter]\
+ [spchar[1] for spchar in _SPECIAL_CHARACTERS][[spchar[0] for spchar in _SPECIAL_CHARACTERS].index(i)]\
+ metadata[file]['Title'][counter+1:]
counter += 1
except:
pass
"Extracting year from CreationDate parameter, if available"
try:
metadata[file]['Year'] = metadata[file]['CreationDate'].split(':')[1][:4]
except:
pass
"Renaming Elsevier articles"
if 'Creator' in list(metadata[file].keys()):
if metadata[file]['Creator'] == 'Elsevier':
try:
metadata[file]['DOI'] = metadata[file]['Subject'].split(' ')[-1]
except:
pass
try:
metadata[file]['Journal'] = metadata[file]['DOI'].split('.')[2]
except:
pass
try:
metadata[file]['Author'] = metadata[file]['Author'].split(' ')[-1]
except:
pass
new_name = []
for item in _PAPERNAME_METHOD[method]['items']:
try:
new_name += [metadata[file][item]]
except:
pass
new_name = _PAPERNAME_METHOD[method]['separator'].join(new_name)+'.pdf'
count_properties = []
for item in _PAPERNAME_METHOD[method]['items']:
if item in new_name:
count_properties += [item]
if set(_PAPERNAME_METHOD[method]['minimum_requirements']) <= set(count_properties):
len_max = 255
if len(new_name) >= len_max:
new_name = new_name[:(len_max-len('.pdf'))]
new_name += '.pdf'
new_name = os.path.join('\\'.join(file.split('\\')[:-1]),new_name)
os.rename(file, new_name)