Fixes for better tokenization of words
hardikp committed Apr 27, 2017
1 parent 6afecba commit d83de07
Showing 3 changed files with 90 additions and 2 deletions.
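The heart of the commit is the punctuation-padding loop that now appears in all three spiders: wrapping each punctuation character in spaces makes a plain whitespace split produce clean word-level tokens. A minimal sketch of the effect (the sample sentence is made up):

text = 'The P/E ratio (price-to-earnings) fell 5% in the U.S.'

# Normalize 'U.S.' first, as the spiders do, so its periods are not split out
text = text.replace('U.S.', 'US')

# Pad every punctuation character with spaces
for c in '();.,[]"\'-:/%$+@':
    text = text.replace(c, ' {} '.format(c))

print(text.split())
# ['The', 'P', '/', 'E', 'ratio', '(', 'price', '-', 'to', '-', 'earnings',
#  ')', 'fell', '5', '%', 'in', 'the', 'US']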
85 changes: 85 additions & 0 deletions data/textdata/textdata/spiders/investopedia.py
@@ -0,0 +1,85 @@
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
    name = 'investopedia'
    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        # The URL must contain 'terms' to proceed further
        if 'terms' not in url:
            return

        # If the URL ends with '.asp', then that's a topic page
        if url.endswith('.asp'):
            return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
        return self._parse_topic_list(response)

    def _parse_topic_response(self, response):
        """
        Parse a single topic page
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace '/' with a space to keep the title clean for writing out
        title = title.replace('/', ' ')

        # Get the first div with id Content, then its content boxes
        content = response.css('div#Content')[0]
        content = content.css('div.content-box')

        text = ''
        # Use a relative XPath so only <p> tags inside the content div are selected
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Drop <script> blocks entirely, then strip the remaining tags such as <p> and <a>
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US'
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Accumulate the cleaned paragraph
            text += paragraph + '\n'

        # Save both the title and the text
        filename = 'investopedia_data.txt'
        with open(filename, 'a') as f:
            f.write(title + '\n' + text)

    def _parse_topic_list(self, response):
        """
        Parse the page with the topics listed out
        e.g. www.investopedia.com/terms/o/
        """
        list_element = response.css('ol.list')

        # Iterate through the list of topics
        for item in list_element.css('li'):
            # Extract the topic URL
            url = item.css('a::attr(href)').extract_first()

            next_page = response.urljoin(url)
            yield scrapy.Request(next_page, callback=self.parse)
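To try the new spider out, running scrapy crawl investopedia from the project directory works; a minimal standalone sketch using Scrapy's CrawlerProcess is below (the import path and user agent are assumptions based on the repository layout):

from scrapy.crawler import CrawlerProcess

# Hypothetical import path, assuming the usual Scrapy layout under data/textdata/
from textdata.spiders.investopedia import InvestopediaSpider

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(InvestopediaSpider)
process.start()  # blocks until the crawl finishes; text accumulates in investopedia_data.txt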
5 changes: 4 additions & 1 deletion data/textdata/textdata/spiders/qplum.py
@@ -36,10 +36,13 @@ def parse(self, response):
             paragraph = re.sub("&.....;", ' ', paragraph)
             paragraph = re.sub("&....;", ' ', paragraph)
 
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
             # Some more replacements to improve the default tokenization
             for c in ['\n', '\r', '\t']:
                 paragraph = paragraph.replace(c, ' ')
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
         filename = 'qplum_data.txt'
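For context, the two re.sub calls at the top of this hunk sweep up HTML entities left over after tag stripping: each pattern matches '&', then exactly five or four characters, then ';'. A quick illustration on a made-up string:

import re

s = 'Risk &#8217; return &nbsp; matters'
s = re.sub('&.....;', ' ', s)  # five characters between & and ';', e.g. &#8217;
s = re.sub('&....;', ' ', s)   # four characters between & and ';', e.g. &nbsp;
print(s.split())
# ['Risk', 'return', 'matters']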
2 changes: 1 addition & 1 deletion data/textdata/textdata/spiders/wikipedia.py
@@ -51,7 +51,7 @@ def _parse_topic_response(self, response):
             paragraph = paragraph.replace('U.S.', 'US')
 
             # Some more replacements to improve the default tokenization
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
             # Add to the file
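The functional change here (and the matching line in qplum.py above) is just adding '@' to the padded set, so email-like strings also break into separate tokens; a made-up example:

s = 'Contact support@example.com for details'
for c in '();.,[]"\'-:/%$+@':
    s = s.replace(c, ' {} '.format(c))
print(s.split())
# ['Contact', 'support', '@', 'example', '.', 'com', 'for', 'details']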
