Skip to content

Commit

Permalink
Merge pull request #57 from michaelmcmillan/wikipedia_component
Browse files Browse the repository at this point in the history
Wikipedia component
  • Loading branch information
michaelmcmillan committed Apr 28, 2015
2 parents 39d7eb5 + 9bbe20b commit 9ade343
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 24 deletions.
60 changes: 60 additions & 0 deletions parsers/wikipedia/wikiparser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
var cheerio = require('cheerio');

/**
 * Parses the rendered HTML of a Wikipedia article and exposes the
 * citations found in it.
 *
 * A citation is an <li> element whose id starts with "cite_note-".
 * The matching elements are collected once, at construction time.
 *
 * @constructor
 * @param {string} [html] - Article HTML; any falsy value is treated as ''.
 */
function WikiParser (html) {

    var articleHTML = html || '';
    var $ = cheerio.load(articleHTML);
    var liTags;

    /**
     * @returns {string} The HTML the parser was constructed with
     *                   ('' when none was supplied).
     */
    this.getArticleHTML = function () {
        return articleHTML;
    };

    /**
     * (Re)collects every <li> whose id starts with "cite_note-".
     */
    this.parseLiTags = function () {
        liTags = $('li[id^="cite_note-"]');
    };

    /**
     * @returns {Object} The cheerio selection of citation <li> elements.
     */
    this.getLiTags = function () {
        return liTags;
    };

    /**
     * @param {Object} liTag - A citation <li> element.
     * @returns {string|undefined} The href of the "external" link inside
     *                             the citation, if there is one.
     */
    this.extractURLFromLiTag = function (liTag) {
        return $(liTag).find('a.external').attr('href');
    };

    /**
     * @param {Object} liTag - A citation <li> element.
     * @returns {string} Text of the "reference-accessdate" span(s) inside
     *                   this citation; '' when there is none.
     */
    this.extractAccessedDateFromLiTag = function (liTag) {
        // Scope the lookup to the given <li>; a document-wide query would
        // return the access dates of every citation in the article.
        return $(liTag).find('span.reference-accessdate').text();
    };

    /**
     * @param {Object} liTag - A citation <li> element.
     * @returns {string} The trimmed reference text of this citation with
     *                   its access-date text removed.
     */
    this.extractTextFromLiTag = function (liTag) {
        // Scoped to the given <li> for the same reason as the access date.
        var text = $(liTag).find('span.reference-text').text().trim();
        var accessDate = this.extractAccessedDateFromLiTag(liTag);

        // With a string pattern, replace() removes the first occurrence
        // only and leaves the text unchanged when there is no match.
        return text.replace(accessDate, '');
    };

    /**
     * @param {number} index - Zero-based position of the citation.
     * @returns {{url: (string|undefined), text: string}} The citation.
     * @throws {Error} When there is no citation at that index.
     */
    this.getCitation = function (index) {
        var currentLiTag = liTags[index];

        // The message is Norwegian for "Found no li-tags with that index".
        if (currentLiTag == null)
            throw new Error('Fant ingen li-tags med den indexen');

        return {
            url: this.extractURLFromLiTag(currentLiTag),
            text: this.extractTextFromLiTag(currentLiTag)
        };
    };

    this.parseLiTags();
}

module.exports = WikiParser;
26 changes: 2 additions & 24 deletions parsers/wikipedia/wikipedia.js
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,13 @@ function Wikipedia () {
apiArguments + pageTitle;

reqOptions.url = apiURL;
console.log(apiURL);
request.get(reqOptions, function (err, data) {
if (err) throw err;

var text = data.body;
var refTags = self.parseRefTags(text);

refTags.each(function (index, refTag) {

var tagData = refTag.children[0].data;
if (tagData === undefined) return;

var matches = tagData.match(/{{(.*?)}}/);
if (matches === null) return;

var urlMatch = urlRegexp.match(matches[0])[0];
if (urlMatch !== undefined) {
urlMatch = self.stripPipe(urlMatch);
wikiReferences.websites.push(urlMatch);
}

var isbnMatch = matches[0].match(/\|isbn=([0-9|-]*).*?/im);
if (isbnMatch !== null)
wikiReferences.books.push(self.stripPipe(isbnMatch[1]));
});

callback(wikiReferences);
callback(undefined, text);
});
}

Expand Down Expand Up @@ -108,7 +89,4 @@ function Wikipedia () {
}
}

//var wikipedia = new Wikipedia();
//wikipedia.getReferences('http://no.wikipedia.org/wiki/Jens_Stoltenberg');

module.exports = Wikipedia;
70 changes: 70 additions & 0 deletions test/unit/wikipedia_tagparser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
var assert = require('assert');
var WikiParser = require('../../parsers/wikipedia/wikiparser.js');

// Unit tests for WikiParser: collecting citation <li> tags and extracting
// the url and text of a single citation.
describe('Wikipedia citeparser', function () {

    // Citation with an external link and no access date.
    var ssbCitationHtml = [
        '<li id="cite_note-9"><b><a href="#cite_ref-9">^</a></b>',
        '<span class="reference-text"><a rel="nofollow" class="external text" ',
        'href="http://www.ssb.no/emner/02/01/10/innvbef/">',
        'SSB: Rekordstor vekst i innvandrerbefolkningen</a></span></li>'
    ].join('');

    // Norwegian citation that carries a "reference-accessdate" span.
    var norwegianDatedCitationHtml = [
        '<li id="cite_note-19"><b><a href="#cite_ref-19">^</a></b>',
        '<span class="reference-text"><span class="citation web">Statistisk sentralbyrå ',
        '(9. april 2015). <a rel="nofollow" class="external text" ',
        'href="http://www.ssb.no/223508/tettsteder.folkemengde-og-areal-etter-kommune.1.januar-2014">',
        '«Tettsteder. Folkemengde og areal, etter kommune.»</a><span class="reference-accessdate">. ',
        'Besøkt 11. april 2015</span>.</span><span title="ctx_ver=" class="Z3988">',
        '<span style="display:none;">&nbsp;</span></span></span></li>'
    ].join('');

    // English citation that carries a "reference-accessdate" span.
    var englishDatedCitationHtml = [
        '<li id="cite_note-79"><span class="mw-cite-backlink">',
        '<b><a href="#cite_ref-79">^</a>',
        '</b></span> <span class="reference-text"><span class="citation web">',
        '<a rel="nofollow" class="external text" ',
        'href="http://www.bbc.co.uk/history/british/victorians/foundling_01.shtml">',
        '"The Foundling Hospital"</a>. BBC History. 17 February 2011',
        '<span class="reference-accessdate">. Retrieved <span class="nowrap">',
        '13 December</span> 2011</span>.</span><span title="ctx_ver=" class="Z3988">',
        '<span style="display:none;">&nbsp;</span></span></span></li>'
    ].join('');

    // Number of citation <li> tags the parser finds in the given markup.
    function countLiTags (markup) {
        return new WikiParser(markup).getLiTags().length;
    }

    // First citation the parser finds in the given markup.
    function firstCitation (markup) {
        return new WikiParser(markup).getCitation(0);
    }

    it('should take in wikipedia articles in html in the constructor', function () {
        var articleHtml = '<html>Wikipedia article</html>';
        var parser = new WikiParser(articleHtml);
        assert.equal(parser.getArticleHTML(), articleHtml);
    });

    it('should parse tags with an id that starts with "cite_note-"', function () {
        assert.equal(countLiTags('<html><li id="cite_note-1">This is a reference</li></html>'), 1);
    });

    it('should return a length of 0 when suppliying an undefined as html', function () {
        assert.equal(countLiTags(undefined), 0);
    });

    it('should not parse tags without id that starts with "cite_note-"', function () {
        assert.equal(countLiTags('<html><li id="citation-yall">This is a reference</li></html>'), 0);
    });

    it('should parse url from citation tag if it contains a link with "external" class', function () {
        assert.equal(
            firstCitation(ssbCitationHtml).url,
            'http://www.ssb.no/emner/02/01/10/innvbef/'
        );
    });

    it('should should return the reference text from the li tag if present', function () {
        assert.equal(
            firstCitation(ssbCitationHtml).text,
            'SSB: Rekordstor vekst i innvandrerbefolkningen'
        );
    });

    it('should not include a reference date text when extracting the text from a li tag', function () {
        assert.equal(
            firstCitation(norwegianDatedCitationHtml).text,
            'Statistisk sentralbyrå (9. april 2015). «Tettsteder. Folkemengde og areal, etter kommune.».'
        );
    });

    it('should not matter what language the content is in when extracting text', function () {
        var citation = firstCitation(englishDatedCitationHtml);
        assert.equal(citation.url, 'http://www.bbc.co.uk/history/british/victorians/foundling_01.shtml');
        assert.equal(citation.text, '"The Foundling Hospital". BBC History. 17 February 2011.');
    });
});

0 comments on commit 9ade343

Please sign in to comment.