-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_ingestion.py
executable file
·171 lines (131 loc) · 5.47 KB
/
data_ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""
Load a dataset in ArnetMiner v8 format into MongoDB.
Export the dataset stored in MongoDB back to ArnetMiner v8 format.
"""
import logging
from pymongo import MongoClient
import config
# Module-level MongoDB handles used by the ingestion and dump functions below.
# Host, port, database name and collection name all come from the config module.
client = MongoClient(host=config.__host, port=config.__port, connect=True)
papers_collection = client[config.__db_name][config.__collection_name]
def _finish_paper(paper, papers, storage_enabled, processed_papers):
    """Store one fully parsed paper and return the updated processed-paper count."""
    # derive the normalized venue only when the record carried a '#c' line
    if 'venue' in paper:
        paper['cleaned_venue'] = clean_venue(paper['venue'])
    if storage_enabled:
        # insert_one replaces Collection.insert, which was removed in PyMongo 4
        papers_collection.insert_one(paper)
    else:
        papers.append(paper)
    processed_papers += 1
    if processed_papers % 10000 == 0:
        print("Processed {0} papers".format(processed_papers))
        logging.debug(paper)
    return processed_papers


def ingest_dataset(dataset_path, storage_enabled=True):
    """
    Parse a dataset file in ArnetMiner v8 format and store or return its papers.

    Recognized line prefixes: '#*' title (starts a new paper), '#@' authors,
    '#t' year, '#c' venue, '#i' (i.e. '#index') id, '#%' citation, '#!' abstract.
    An abstract may continue on the following lines until a blank line or a
    line starting with '#'; continuation lines are joined with single spaces.

    Note: the first line of a paper description should be the title!!
    Field lines found before the first title are logged and ignored, as are
    non-blank lines that do not start with '#'. Lines with an unknown '#' code
    are skipped silently.

    :param dataset_path: the path of the dataset file to process
    :param storage_enabled: boolean, True to save parsed papers on db (mongo), False to return a collection
    :return: the list of parsed papers if storage_enabled is False, None otherwise
    :raises ValueError: if a '#t' line does not contain an integer year
    """
    processed_papers = 0
    papers = []
    paper = None
    # True while the lines following a '#!' line still belong to the abstract
    in_abstract = False

    with open(dataset_path) as dataset_file:
        for line in dataset_file:
            if not line.startswith('#'):
                text = line.strip()
                if in_abstract and text:
                    # abstract continuation: join with a space, skipping empty pieces
                    paper['abstract'] = ' '.join(filter(None, (paper['abstract'], text)))
                    continue
                # a blank line (or any other content) terminates the abstract
                in_abstract = False
                if text:
                    logging.warning("[WARNING] Ignored line. Content '{0}'".format(line))
                continue

            in_abstract = False
            code = line[0:2]
            value = line[2:].strip()

            if code == "#*":
                # a new title closes the previous paper, if any
                if paper is not None:
                    processed_papers = _finish_paper(paper, papers, storage_enabled, processed_papers)
                paper = {}
                paper['acm_citations'] = []
                paper['authors'] = []
                paper['title'] = value
            elif paper is None:
                # field line before any title: there is no paper to attach it to
                logging.warning("[WARNING] Ignored line. Content '{0}'".format(line))
            elif code == "#@":
                paper['authors'] = split_and_clean_authors(value)
            elif code == "#t":
                paper['year'] = int(value)
            elif code == "#c":
                paper['venue'] = value
            elif code == "#i":
                # the line is '#index<id>': value starts with 'ndex', drop those 4 characters
                paper['acm_id'] = value[4:]
            elif code == "#%":
                paper['acm_citations'].append(value)
            elif code == "#!":
                paper['abstract'] = value
                in_abstract = True

    # the last paper of the file has no following title line to trigger its storage
    if paper is not None:
        processed_papers = _finish_paper(paper, papers, storage_enabled, processed_papers)

    logging.info("[INFO] Inserted documents: {0}".format(processed_papers))
    if not storage_enabled:
        return papers
def split_and_clean_authors(authors_string):
    """
    Split a comma-separated author string into a list of cleaned author names.
    Every period is normalized to be followed by exactly one space (e.g.
    'J.Smith' and 'J. Smith' both become 'J. Smith'), names are stripped of
    surrounding whitespace, and empty entries (from an empty string or from
    stray commas) are dropped.
    :param authors_string: the raw author list, names separated by commas
    :return: the list of cleaned, non-empty author names
    """
    # collapse '. ' to '.' first so that re-expanding never yields double spaces
    normalized = authors_string.replace('. ', '.').replace('.', '. ')
    stripped_names = (part.strip() for part in normalized.split(','))
    return [name for name in stripped_names if name]
def clean_venue(venue):
    """
    Normalize a venue name: lowercase it, then remove any leading and trailing
    punctuation ('?', ':', '!', '.', ',', ';', '-') and spaces.
    :param venue: the raw venue string
    :return: the lowercased venue without surrounding punctuation/spaces
    """
    edge_characters = '?:!.,;- '
    lowered = venue.lower()
    return lowered.strip(edge_characters)
def to_v8_format(paper):
    """
    Serialize a paper dictionary as an ArnetMiner V8 record.
    Only the keys present in the dictionary are emitted, one line each and in
    this order: title ('#*'), authors ('#@', joined with ', '), year ('#t'),
    venue ('#c'), acm_id ('#index'), one '#%' line per citation, abstract ('#!').
    For more details on the format refer to: https://aminer.org/citation
    :param paper: the paper to convert
    :return: the string representation (empty string if no known key is present)
    """
    record_lines = []
    if 'title' in paper:
        record_lines.append('#*' + paper['title'])
    if 'authors' in paper:
        record_lines.append('#@' + ', '.join(paper['authors']))
    if 'year' in paper:
        record_lines.append('#t' + str(paper['year']))
    if 'venue' in paper:
        record_lines.append('#c' + paper['venue'])
    if 'acm_id' in paper:
        record_lines.append('#index' + str(paper['acm_id']))
    if 'acm_citations' in paper:
        record_lines.extend('#%' + str(citation) for citation in paper['acm_citations'])
    if 'abstract' in paper:
        record_lines.append('#!' + paper['abstract'])
    # every emitted line is newline-terminated, including the last one
    return ''.join(entry + '\n' for entry in record_lines)
def dump_dataset_to_file(output_filename):
    """
    Write every document of papers_collection to a file in ArnetMiner V8 format.
    Each record produced by to_v8_format is followed by one empty line, and a
    progress message is printed every 10000 written papers.
    For more details on the format refer to: https://aminer.org/citation
    :param output_filename: the path of the output file
    :return:
    """
    with open(output_filename, 'w') as out:
        cursor = papers_collection.find()
        # NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and
        # removed in 4.0 - confirm the installed version before upgrading.
        total_number = papers_collection.count()
        for processed, document in enumerate(cursor, 1):
            out.write(to_v8_format(document) + '\n')
            if processed % 10000 == 0:
                print('Processed {0} of {1}'.format(processed, total_number))
if __name__ == '__main__':
    # Script entry point: log records of level INFO and above go to 'latest_execution.log'.
    logging.basicConfig(filename='latest_execution.log', level=logging.INFO)
    # Ingest the ArnetMiner v8 dump whose path is config.__input_dataset_dump;
    # storage_enabled keeps its default (True), so papers are saved to MongoDB.
    ingest_dataset(config.__input_dataset_dump)
    # To export the collection back to the ArnetMiner v8 format, uncomment the
    # two lines below and set the destination path:
    # output_filename = 'path/to/filename'
    # dump_dataset_to_file(output_filename)