From d83de07a2ff7c7fac3a0803a9fc4ae0da73ad5e3 Mon Sep 17 00:00:00 2001
From: Hardik Patel
Date: Thu, 27 Apr 2017 13:33:18 -0400
Subject: [PATCH] Fixes for better tokenization of words

---
 .../textdata/textdata/spiders/investopedia.py | 85 +++++++++++++++++++
 data/textdata/textdata/spiders/qplum.py       |  5 +-
 data/textdata/textdata/spiders/wikipedia.py   |  2 +-
 3 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 data/textdata/textdata/spiders/investopedia.py

diff --git a/data/textdata/textdata/spiders/investopedia.py b/data/textdata/textdata/spiders/investopedia.py
new file mode 100644
index 0000000..abce754
--- /dev/null
+++ b/data/textdata/textdata/spiders/investopedia.py
@@ -0,0 +1,85 @@
+from string import ascii_lowercase
+
+import scrapy
+from scrapy.spiders import CrawlSpider
+from w3lib.html import remove_tags, remove_tags_with_content
+
+
+class InvestopediaSpider(CrawlSpider):
+    name = 'investopedia'
+    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']
+
+    def parse(self, response):
+        """
+        Parse the response page
+        """
+        url = response.url
+
+        # 'terms' has to be there in the URL to proceed further
+        if 'terms' not in url:
+            return
+
+        # if the url ends with '.asp', then that's a topic page
+        if url.endswith('.asp'):
+            return self._parse_topic_response(response)
+
+        # Otherwise, assume that this is a list page
+        return self._parse_topic_list(response)
+
+    def _parse_topic_response(self, response):
+        """
+        Parses various topics
+        e.g. www.investopedia.com/terms/o/oddlottheory.asp
+        """
+        # Get the title first
+        title = response.css('title::text').extract_first()
+
+        # Replace / with a space - creates issues with writing to file
+        title = title.replace('/', ' ')
+
+        # Get the first div with id Content
+        content = response.css('div#Content')[0]
+        content = content.css('div.content-box')
+
+        text = ''
+        for child in content.xpath('//p'):
+
+            # Get the text from this child <p> tag
+            paragraph = child.extract()
+
+            # Remove tags including <p> and <script>
+            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()
+
+            # Replace '&amp;' with '&'
+            paragraph = paragraph.replace('&amp;', '&')
+
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
+            # Some more replacements to improve the default tokenization
+            for c in '();.,[]"\'-:/%$+@':
+                paragraph = paragraph.replace(c, ' {} '.format(c))
+
+            # Add to the file
+            text += paragraph + '\n'
+
+        # Save the title and the text both
+        filename = 'investopedia_data.txt'
+        f = open(filename, 'a')
+        f.write(text)
+        f.close()
+
+    def _parse_topic_list(self, response):
+        """
+        Parse the page with the topics listed out
+        e.g. www.investopedia.com/terms/o/
+        """
+        list_element = response.css('ol.list')
+
+        # Iterate through the list of topics
+        for l in list_element.css('li'):
+            # Extract the URL
+            url = l.css('a::attr(href)').extract_first()
+
+            next_page = response.urljoin(url)
+            yield scrapy.Request(next_page, callback=self.parse)
diff --git a/data/textdata/textdata/spiders/qplum.py b/data/textdata/textdata/spiders/qplum.py
index ae49db8..8a789b1 100644
--- a/data/textdata/textdata/spiders/qplum.py
+++ b/data/textdata/textdata/spiders/qplum.py
@@ -36,10 +36,13 @@ def parse(self, response):
             paragraph = re.sub("&.....;", ' ', paragraph)
             paragraph = re.sub("&....;", ' ', paragraph)
 
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
             # Some more replacements to improve the default tokenization
             for c in ['\n', '\r', '\t']:
                 paragraph = paragraph.replace(c, ' ')
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
         filename = 'qplum_data.txt'
diff --git a/data/textdata/textdata/spiders/wikipedia.py b/data/textdata/textdata/spiders/wikipedia.py
index 2ed10c0..7f6963e 100644
--- a/data/textdata/textdata/spiders/wikipedia.py
+++ b/data/textdata/textdata/spiders/wikipedia.py
@@ -51,7 +51,7 @@ def _parse_topic_response(self, response):
             paragraph = paragraph.replace('U.S.', 'US')
 
             # Some more replacements to improve the default tokenization
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
             # Add to the file
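
Reviewer note (outside the diff; nothing below is applied by git am): the change shared by all three spiders is that '@' joins the set of characters padded with spaces, so email-like tokens and handles split apart under plain whitespace tokenization. Below is a minimal standalone sketch of that padding pass; the sample sentence is invented for illustration:

    # Sketch of the punctuation-padding pass the three spiders share.
    # The input string is made up for illustration only.
    paragraph = "Email help@example.com (U.S. only)."
    paragraph = paragraph.replace('U.S.', 'US')
    for c in '();.,[]"\'-:/%$+@':
        # Surround each punctuation character with spaces so that a plain
        # str.split() yields it as a separate token
        paragraph = paragraph.replace(c, ' {} '.format(c))
    print(paragraph.split())
    # ['Email', 'help', '@', 'example', '.', 'com', '(', 'US', 'only', ')', '.']

The new spider runs with the standard Scrapy invocation, e.g. scrapy crawl investopedia from the project root, appending its output to investopedia_data.txt in the working directory. One caveat worth flagging for review: content.xpath('//p') in _parse_topic_response is an absolute XPath, so it selects every <p> on the page rather than only those inside div.content-box; a relative './/p' would scope it to the selected div.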