diff --git a/data/textdata/textdata/spiders/investopedia.py b/data/textdata/textdata/spiders/investopedia.py
new file mode 100644
index 0000000..abce754
--- /dev/null
+++ b/data/textdata/textdata/spiders/investopedia.py
@@ -0,0 +1,85 @@
+from string import ascii_lowercase
+
+import scrapy
+from scrapy.spiders import CrawlSpider
+from w3lib.html import remove_tags, remove_tags_with_content
+
+
+class InvestopediaSpider(CrawlSpider):
+    name = 'investopedia'
+    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']
+
+    def parse(self, response):
+        """
+        Parse the response page
+        """
+        url = response.url
+
+        # 'terms' has to be there in the URL to proceed further
+        if 'terms' not in url:
+            return
+
+        # If the URL ends with '.asp', then that's a topic page
+        if url.endswith('.asp'):
+            return self._parse_topic_response(response)
+
+        # Otherwise, assume that this is a list page
+        return self._parse_topic_list(response)
+
+    def _parse_topic_response(self, response):
+        """
+        Parses various topics
+        e.g. www.investopedia.com/terms/o/oddlottheory.asp
+        """
+        # Get the title first
+        title = response.css('title::text').extract_first()
+
+        # Replace / with a space - creates issues with writing to file
+        title = title.replace('/', ' ')
+
+        # Get the first div with id Content
+        content = response.css('div#Content')[0]
+        content = content.css('div.content-box')
+
+        text = ''
+        for child in content.xpath('.//p'):
+
+            # Get the text from this child <p> tag
+            paragraph = child.extract()
+
+            # Remove tags, including <script> tags along with their content
+            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()
+
+            # Replace '&amp;' with '&'
+            paragraph = paragraph.replace('&amp;', '&')
+
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
+            # Some more replacements to improve the default tokenization
+            for c in '();.,[]"\'-:/%$+@':
+                paragraph = paragraph.replace(c, ' {} '.format(c))
+
+            # Add to the file
+            text += paragraph + '\n'
+
+        # Append the extracted text to the data file (title is currently unused)
+        filename = 'investopedia_data.txt'
+        f = open(filename, 'a')
+        f.write(text)
+        f.close()
+
+    def _parse_topic_list(self, response):
+        """
+        Parse the page with the topics listed out
+        e.g. www.investopedia.com/terms/o/
+        """
+        list_element = response.css('ol.list')
+
+        # Iterate through the list of topics
+        for l in list_element.css('li'):
+            # Extract the URL
+            url = l.css('a::attr(href)').extract_first()
+
+            next_page = response.urljoin(url)
+            yield scrapy.Request(next_page, callback=self.parse)
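
Note: all three spiders in this patch share the same pre-tokenization step - pad selected punctuation with spaces so a plain whitespace tokenizer splits each character into its own token. A minimal standalone sketch of just that step (the function name pad_punctuation and the sample sentence are illustrative, not part of the patch):

def pad_punctuation(paragraph):
    # Normalize 'U.S.' first so the '.' padding below doesn't split it
    paragraph = paragraph.replace('U.S.', 'US')
    # Surround each listed character with spaces, e.g. '(' -> ' ( '
    for c in '();.,[]"\'-:/%$+@':
        paragraph = paragraph.replace(c, ' {} '.format(c))
    return paragraph

print(pad_punctuation('Odd-lot theory (www.investopedia.com) dates to the 1950s.'))
# -> 'Odd - lot theory  ( www . investopedia . com )  dates to the 1950s . '
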
diff --git a/data/textdata/textdata/spiders/qplum.py b/data/textdata/textdata/spiders/qplum.py
index ae49db8..8a789b1 100644
--- a/data/textdata/textdata/spiders/qplum.py
+++ b/data/textdata/textdata/spiders/qplum.py
@@ -36,10 +36,13 @@ def parse(self, response):
paragraph = re.sub("&.....;", ' ', paragraph)
paragraph = re.sub("&....;", ' ', paragraph)
+ # Replace 'U.S.' with 'US':
+ paragraph = paragraph.replace('U.S.', 'US')
+
# Some more replacements to improve the default tokenization
for c in ['\n', '\r', '\t']:
paragraph = paragraph.replace(c, ' ')
- for c in '();.,[]"\'-:/%$+':
+ for c in '();.,[]"\'-:/%$+@':
paragraph = paragraph.replace(c, ' {} '.format(c))
filename = 'qplum_data.txt'
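
Note: the two fixed-width regexes above blank out five- and four-letter HTML entities such as &mdash; and &quot;. A more general alternative, shown here only as a sketch and not used by this patch, is the standard library's html.unescape:

import html

# Decodes any named or numeric entity, regardless of length
print(html.unescape('Risk &amp; return &mdash; 50&#37; stocks'))
# -> 'Risk & return — 50% stocks'
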
diff --git a/data/textdata/textdata/spiders/wikipedia.py b/data/textdata/textdata/spiders/wikipedia.py
index 2ed10c0..7f6963e 100644
--- a/data/textdata/textdata/spiders/wikipedia.py
+++ b/data/textdata/textdata/spiders/wikipedia.py
@@ -51,7 +51,7 @@ def _parse_topic_response(self, response):
             paragraph = paragraph.replace('U.S.', 'US')

             # Some more replacements to improve the default tokenization
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))

             # Add to the file
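
Note: the new investopedia spider can be smoke-tested like the existing ones, either with scrapy crawl investopedia from the project root, or from a script via Scrapy's CrawlerProcess. A sketch, assuming the project's textdata package is importable so the module path below resolves:

from scrapy.crawler import CrawlerProcess

from textdata.spiders.investopedia import InvestopediaSpider

process = CrawlerProcess()
process.crawl(InvestopediaSpider)
process.start()  # blocks until the crawl finishes; text accumulates in investopedia_data.txt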