Fixes for better tokenization of words
hardikp committed Apr 27, 2017
1 parent 6afecba commit d83de07
Showing 3 changed files with 90 additions and 2 deletions.
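The heart of the commit is the punctuation-padding loop that now appears in all three spiders: wrapping each punctuation character in spaces makes a plain whitespace split produce clean word-level tokens. A minimal sketch of the effect (the sample sentence is made up):

text = 'The P/E ratio (price-to-earnings) fell 5% in the U.S.'

# Normalize 'U.S.' first, as the spiders do, so its periods are not split out
text = text.replace('U.S.', 'US')

# Pad every punctuation character with spaces
for c in '();.,[]"\'-:/%$+@':
    text = text.replace(c, ' {} '.format(c))

print(text.split())
# ['The', 'P', '/', 'E', 'ratio', '(', 'price', '-', 'to', '-', 'earnings',
#  ')', 'fell', '5', '%', 'in', 'the', 'US']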
85 changes: 85 additions & 0 deletions data/textdata/textdata/spiders/investopedia.py
@@ -0,0 +1,85 @@
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
    name = 'investopedia'
    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        # The URL must contain 'terms' to proceed further
        if 'terms' not in url:
            return

        # If the URL ends with '.asp', then that's a topic page
        if url.endswith('.asp'):
            return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
        return self._parse_topic_list(response)

    def _parse_topic_response(self, response):
        """
        Parse a single topic page
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace '/' with a space to keep the title clean for writing out
        title = title.replace('/', ' ')

        # Get the first div with id Content, then its content boxes
        content = response.css('div#Content')[0]
        content = content.css('div.content-box')

        text = ''
        # Use a relative XPath so only <p> tags inside the content div are selected
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Drop <script> blocks entirely, then strip the remaining tags such as <p> and <a>
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US'
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Accumulate the cleaned paragraph
            text += paragraph + '\n'

        # Save both the title and the text
        filename = 'investopedia_data.txt'
        with open(filename, 'a') as f:
            f.write(title + '\n' + text)

    def _parse_topic_list(self, response):
        """
        Parse the page with the topics listed out
        e.g. www.investopedia.com/terms/o/
        """
        list_element = response.css('ol.list')

        # Iterate through the list of topics
        for item in list_element.css('li'):
            # Extract the topic URL
            url = item.css('a::attr(href)').extract_first()

            next_page = response.urljoin(url)
            yield scrapy.Request(next_page, callback=self.parse)
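To try the new spider out, running scrapy crawl investopedia from the project directory works; a minimal standalone sketch using Scrapy's CrawlerProcess is below (the import path and user agent are assumptions based on the repository layout):

from scrapy.crawler import CrawlerProcess

# Hypothetical import path, assuming the usual Scrapy layout under data/textdata/
from textdata.spiders.investopedia import InvestopediaSpider

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(InvestopediaSpider)
process.start()  # blocks until the crawl finishes; text accumulates in investopedia_data.txt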
5 changes: 4 additions & 1 deletion data/textdata/textdata/spiders/qplum.py
@@ -36,10 +36,13 @@ def parse(self, response):
             paragraph = re.sub("&.....;", ' ', paragraph)
             paragraph = re.sub("&....;", ' ', paragraph)
 
+            # Replace 'U.S.' with 'US':
+            paragraph = paragraph.replace('U.S.', 'US')
+
             # Some more replacements to improve the default tokenization
             for c in ['\n', '\r', '\t']:
                 paragraph = paragraph.replace(c, ' ')
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
         filename = 'qplum_data.txt'
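For context, the two re.sub calls at the top of this hunk sweep up HTML entities left over after tag stripping: each pattern matches '&', then exactly five or four characters, then ';'. A quick illustration on a made-up string:

import re

s = 'Risk &#8217; return &nbsp; matters'
s = re.sub('&.....;', ' ', s)  # five characters between & and ';', e.g. &#8217;
s = re.sub('&....;', ' ', s)   # four characters between & and ';', e.g. &nbsp;
print(s.split())
# ['Risk', 'return', 'matters']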
2 changes: 1 addition & 1 deletion data/textdata/textdata/spiders/wikipedia.py
@@ -51,7 +51,7 @@ def _parse_topic_response(self, response):
             paragraph = paragraph.replace('U.S.', 'US')
 
             # Some more replacements to improve the default tokenization
-            for c in '();.,[]"\'-:/%$+':
+            for c in '();.,[]"\'-:/%$+@':
                 paragraph = paragraph.replace(c, ' {} '.format(c))
 
             # Add to the file
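The functional change here (and the matching line in qplum.py above) is just adding '@' to the padded set, so email-like strings also break into separate tokens; a made-up example:

s = 'Contact support@example.com for details'
for c in '();.,[]"\'-:/%$+@':
    s = s.replace(c, ' {} '.format(c))
print(s.split())
# ['Contact', 'support', '@', 'example', '.', 'com', 'for', 'details']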
