From d83de07a2ff7c7fac3a0803a9fc4ae0da73ad5e3 Mon Sep 17 00:00:00 2001
From: Hardik Patel
Date: Thu, 27 Apr 2017 13:33:18 -0400
Subject: [PATCH] Fixes for better tokenization of words

---
 .../textdata/textdata/spiders/investopedia.py | 85 +++++++++++++++++++
 data/textdata/textdata/spiders/qplum.py       |  5 +-
 data/textdata/textdata/spiders/wikipedia.py   |  2 +-
 3 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 data/textdata/textdata/spiders/investopedia.py

diff --git a/data/textdata/textdata/spiders/investopedia.py b/data/textdata/textdata/spiders/investopedia.py
new file mode 100644
index 0000000..abce754
--- /dev/null
+++ b/data/textdata/textdata/spiders/investopedia.py
@@ -0,0 +1,85 @@
+from string import ascii_lowercase
+
+import scrapy
+from scrapy.spiders import CrawlSpider
+from w3lib.html import remove_tags, remove_tags_with_content
+
+
+class InvestopediaSpider(CrawlSpider):
+    name = 'investopedia'
+    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']
+
+    def parse(self, response):
+        """
+        Parse the response page
+        """
+        url = response.url
+
+        # 'terms' has to be there in the URL to proceed further
+        if 'terms' not in url:
+            return
+
+        # if the url ends with '.asp', then that's a topic page
+        if url.endswith('.asp'):
+            return self._parse_topic_response(response)
+
+        # Otherwise, assume that this is a list page
+        return self._parse_topic_list(response)
+
+    def _parse_topic_response(self, response):
+        """
+        Parses various topics
+        e.g. www.investopedia.com/terms/o/oddlottheory.asp
+        """
+        # Get the title first
+        title = response.css('title::text').extract_first()
+
+        # Replace / with a space - creates issues with writing to file
+        title = title.replace('/', ' ')
+
+        # Get the first div with id Content
+        content = response.css('div#Content')[0]
+        content = content.css('div.content-box')
+
+        text = ''
+        for child in content.xpath('//p'):
+
+            # Get the text from this child <p> tag
+            paragraph = child.extract()
+
+            # Remove tags including <p> and <script>
+            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()
+
+            # Replace '&amp;' with '&'
+            paragraph = paragraph.replace('&amp;', '&')
+
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
+            # Some more replacements to improve the default tokenization
+            for c in '();.,[]"\'-:/%$+@':
+                paragraph = paragraph.replace(c, ' {} '.format(c))
+
+            # Add to the file
+            text += paragraph + '\n'
+
+        # Save the title and the text both
+        filename = 'investopedia_data.txt'
+        f = open(filename, 'a')
+        f.write(text)
+        f.close()
+
+    def _parse_topic_list(self, response):
+        """
+        Parse the page with the topics listed out
+        e.g. www.investopedia.com/terms/o/
+        """
+        list_element = response.css('ol.list')
+
+        # Iterate through the list of topics
+        for l in list_element.css('li'):
+            # Extract the URL
+            url = l.css('a::attr(href)').extract_first()
+
+            next_page = response.urljoin(url)
+            yield scrapy.Request(next_page, callback=self.parse)
diff --git a/data/textdata/textdata/spiders/qplum.py b/data/textdata/textdata/spiders/qplum.py
index ae49db8..8a789b1 100644
--- a/data/textdata/textdata/spiders/qplum.py
+++ b/data/textdata/textdata/spiders/qplum.py
@@ -36,10 +36,13 @@ def parse(self, response):
             paragraph = re.sub("&.....;", ' ', paragraph)
             paragraph = re.sub("&....;", ' ', paragraph)
 
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
             # Some more replacements to improve the default tokenization
             for c in ['\n', '\r', '\t']:
                 paragraph = paragraph.replace(c, ' ')
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
         filename = 'qplum_data.txt'
diff --git a/data/textdata/textdata/spiders/wikipedia.py b/data/textdata/textdata/spiders/wikipedia.py
index 2ed10c0..7f6963e 100644
--- a/data/textdata/textdata/spiders/wikipedia.py
+++ b/data/textdata/textdata/spiders/wikipedia.py
@@ -51,7 +51,7 @@ def _parse_topic_response(self, response):
             paragraph = paragraph.replace('U.S.', 'US')
 
             # Some more replacements to improve the default tokenization
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
             # Add to the file
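
Reviewer note (outside the diff; nothing below is applied by git am): the change shared by all three spiders is that '@' joins the set of characters padded with spaces, so email-like tokens and handles split apart under plain whitespace tokenization. Below is a minimal standalone sketch of that padding pass; the sample sentence is invented for illustration:

    # Sketch of the punctuation-padding pass the three spiders share.
    # The input string is made up for illustration only.
    paragraph = "Email help@example.com (U.S. only)."
    paragraph = paragraph.replace('U.S.', 'US')
    for c in '();.,[]"\'-:/%$+@':
        # Surround each punctuation character with spaces so that a plain
        # str.split() yields it as a separate token
        paragraph = paragraph.replace(c, ' {} '.format(c))
    print(paragraph.split())
    # ['Email', 'help', '@', 'example', '.', 'com', '(', 'US', 'only', ')', '.']

The new spider runs with the standard Scrapy invocation, e.g. scrapy crawl investopedia from the project root, appending its output to investopedia_data.txt in the working directory. One caveat worth flagging for review: content.xpath('//p') in _parse_topic_response is an absolute XPath, so it selects every <p> on the page rather than only those inside div.content-box; a relative './/p' would scope it to the selected div.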