-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_text.py
56 lines (40 loc) · 1.48 KB
/
clean_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from pathlib import Path
from data.mongo_fields import Publications
from data.mongo_provider import MongoProvider
from text_processing import text_cleaner
# Module-wide handle to the publications collection. MongoProvider is given
# the per-user config file located at ~/.dsca/app.config.
_config_file = Path.home() / '.dsca' / 'app.config'
collection = MongoProvider(_config_file).get_publications_collection()
def get_text(doc):
    """Build the raw text of a publication from its title and abstract.

    Args:
        doc: Mapping for one publication document, keyed by the Mongo field
            names exposed by ``Publications``.

    Returns:
        ``title + ' ' + abstract`` when at least one of the two is non-empty,
        otherwise ``''``. A missing field and a field stored as ``None`` are
        both treated as empty.
    """
    # `or ''` also covers fields stored as an explicit null: dict.get only
    # falls back to its default when the key is absent, so a None value would
    # otherwise reach the concatenation below and raise TypeError.
    title = doc.get(Publications.TITLE.mongo) or ''
    abstract = doc.get(Publications.ABSTRACT.mongo) or ''
    if not (title or abstract):
        return ''
    # The separator is kept even when one side is empty, matching the
    # previous output format for string inputs.
    return title + ' ' + abstract
def pipeline(text):
    """Pass ``text`` through ``text_cleaner.clean_text`` and return its result."""
    return text_cleaner.clean_text(text)
def main():
    """Compute and store cleaned text for publications that do not have it yet.

    For every publication without a clean-text field, the title and abstract
    are combined via ``get_text``, cleaned with ``text_cleaner.clean_text``
    and, when the result is non-empty, written back to the same document.
    Progress is printed every 10 documents and once more at the end.

    Documents whose cleaned text comes out empty are left unchanged, so they
    still lack the field and will be visited again on a later run.
    """
    # One filter drives both the count and the cursor, so the reported total
    # matches the documents that are actually iterated.
    pending = {Publications.CLEAN_TEXT.mongo: {'$exists': False}}
    projection = {Publications.ABSTRACT.mongo: 1, Publications.TITLE.mongo: 1}

    size = collection.count_documents(pending)
    print('Number of docs:', size)

    status = 0  # stays 0 for the final report when the cursor is empty
    for status, doc in enumerate(collection.find(pending, projection), start=1):
        if status % 10 == 0:
            print('STATUS:', status)

        clean_text = text_cleaner.clean_text(get_text(doc))
        if not clean_text:
            continue

        # Address the document by `_id`: it is returned by default with an
        # inclusion projection, whereas the PMID field is not requested above
        # and filtering on its (absent) value could match a different document.
        collection.update_one(
            filter={'_id': doc['_id']},
            update={'$set': {Publications.CLEAN_TEXT.mongo: clean_text}},
        )

    print('STATUS:', status)
    print('Done.')
# Run the cleaning job only when this file is executed as a script.
if __name__ == '__main__':
    main()