Skip to content

Commit

Permalink
Minor improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
hardikp committed Apr 27, 2017
1 parent 3673861 commit 6afecba
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion data/textdata/textdata/spiders/qplum.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re

from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content
Expand Down Expand Up @@ -29,8 +30,11 @@ def parse(self, response):
# Replace the HTML-escaped apostrophe (e.g. &#39;) and curly quotes with a plain '
paragraph = paragraph.replace(''', "'")
paragraph = paragraph.replace('’', "'")
paragraph = paragraph.replace('“', "'")
paragraph = paragraph.replace('”', "'")
# Replace &nbsp; (non-breaking space) with a regular space
paragraph = paragraph.replace(' ', ' ')
paragraph = re.sub("&.....;", ' ', paragraph)
paragraph = re.sub("&....;", ' ', paragraph)

# Some more replacements to improve the default tokenization
for c in ['\n', '\r', '\t']:
Expand Down

0 comments on commit 6afecba

Please sign in to comment.