Skip to content

Commit

Permalink
Merge pull request #94 from michaelmcmillan/studieweb.net
Browse files Browse the repository at this point in the history
Studieweb.net supported.
  • Loading branch information
michaelmcmillan committed Jul 15, 2015
2 parents 675c4f6 + abab45f commit 0ba392c
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 5 deletions.
7 changes: 7 additions & 0 deletions controllers/query/funnel.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ var SNL = require('../../parsers/snl/snl.js');
var NDLAController = require('./ndla.js');
var NDLA = require('../../parsers/ndla/ndla.js');

var StudiewebController = require('./studieweb.js');
var Studieweb = require('../../parsers/studieweb/studieweb.js');

var ReadabilityController = require('./readability.js');
var Readability = require('../../parsers/readability/readability.js');

Expand All @@ -30,6 +33,7 @@ function FunnelController (req, res, next) {
// External components we can query
var readability = new Readability();
var wikipedia = new Wikipedia();
var studieweb = new Studieweb();
var bibsys = new Bibsys();
var ndla = new NDLA();
var snl = new SNL();
Expand All @@ -41,6 +45,9 @@ function FunnelController (req, res, next) {
else if (ndla.isNDLAURL(req.query.q))
NDLAController(req, res, next);

else if (studieweb.isStudiewebURL(req.query.q))
StudiewebController(req, res, next);

else if (readability.isURL(req.query.q))
ReadabilityController(req, res, next);

Expand Down
46 changes: 46 additions & 0 deletions controllers/query/studieweb.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
var config = require('../../config.js');
var logger = require('../../log/logger.js');
var Studieweb = require('../../parsers/studieweb/studieweb.js');
var QueryFactory = require('../../database/factories/query.js');
var WebsiteFactory = require('../../database/factories/website.js');
var ListFactory = require('../../database/factories/list.js');
var ResultController = require('./result.js');

/**
 * Query controller for Studieweb.net URLs.
 *
 * Looks the query string up in the query cache first. On a cache hit the
 * cached websites are handed straight to ResultController. On a miss the
 * page is fetched and parsed through the Studieweb parser, the resulting
 * website is stored through WebsiteFactory, and the stored website is handed
 * to ResultController.
 *
 * Any error from the cache lookup, the search or the store is forwarded to
 * `next`.
 *
 * @param {Object}   req  - Request; the URL to look up is read from req.query.q
 * @param {Object}   res  - Response, passed through to ResultController
 * @param {Function} next - Error/continuation callback
 */
function StudiewebController (req, res, next) {
    var queryString = req.query.q;
    var studieweb = new Studieweb();

    QueryFactory.read(queryString, 'website', function (readErr, cachedWebsites) {
        if (readErr) return next(readErr);

        // If the cache returned websites there is no need to ask Studieweb
        if (cachedWebsites.length > 0) {
            logger.log('debug', 'Found Studieweb cache with %d website for "%s"',
                cachedWebsites.length, queryString);

            // Second argument is false for cached results and true for fresh
            // ones (presumably "needs caching" - confirm in result.js)
            return ResultController(cachedWebsites, false, req, res, next);
        }

        // Empty cache means we ask Studieweb
        logger.profile('Studieweb query');

        studieweb.search(queryString, function (searchErr, website) {
            // Close the profile before looking at the error. Assuming the
            // logger toggles start/stop per id (as winston's profile() does),
            // bailing out first would leave the timer open and put every
            // later start/stop pair out of phase.
            logger.profile('Studieweb query');

            if (searchErr) return next(searchErr);

            logger.log('debug', 'Studieweb returned result');

            // Store the website that Studieweb returned
            WebsiteFactory.create(website, function (createErr, createdWebsite) {
                if (createErr) return next(createErr);
                logger.log('debug', 'Created website in the database');

                // Hand the freshly created website over as a one-item result list
                ResultController([createdWebsite], true, req, res, next);
            });
        });
    });
}

module.exports = StudiewebController;
14 changes: 9 additions & 5 deletions models/website.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@ function Website (url, title) {
var publicationDate;

this.getHostname = function () {
var hostname = validator.parse(url).hostname;
return hostname.charAt(0).toUpperCase() +
hostname.slice(1)
.substring(0, hostname.lastIndexOf('.') - 1);
if (url !== undefined) {
var hostname = validator.parse(url).hostname;
return hostname.charAt(0).toUpperCase() +
hostname.slice(1)
.substring(0, hostname.lastIndexOf('.') - 1);
}
}

this.addAuthor = function (author) {
Expand Down Expand Up @@ -66,8 +68,10 @@ function Website (url, title) {
this.getTitle = function () {
if (title !== undefined)
return title;
else
else if (this.getHostname() !== undefined)
return this.getHostname();
else
return undefined;
}
}

Expand Down
68 changes: 68 additions & 0 deletions parsers/studieweb/studieweb.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
var config = require('../../config.js');
var URLParser = require('url');
var querystring = require('querystring');
var request = require('request');
var cheerio = require('cheerio');
var moment = require('moment');
var Website = require('../../models/website.js');
var Author = require('../../models/author.js');

/**
 * Parser for resources hosted on Studieweb.net.
 *
 * Exposes three methods on the instance:
 *  - isStudiewebURL(url): whether a query string points at studieweb.net
 *  - parse(html):         builds a Website from the HTML of a resource
 *  - search(url, done):   fetches a resource and hands the parsed Website
 *                         to done(err, website)
 */
function Studieweb () {

    var self = this;
    var domain = 'studieweb.net';
    var titleSuffix = ' - Studieweb.net';

    // True when `text` ends with `tail`
    function endsWith (text, tail) {
        return text.length >= tail.length &&
            text.slice(text.length - tail.length) === tail;
    }

    // Returns the url with "http://" in front of it unless it already
    // carries a scheme, so it can be both parsed and fetched
    function withProtocol (url) {
        var trimmed = url.trim();

        if (/^[a-z][a-z0-9+.\-]*:\/\//i.test(trimmed))
            return trimmed;

        // Protocol-relative urls ("//host/path") only lack the scheme
        if (trimmed.indexOf('//') === 0)
            return 'http:' + trimmed;

        return 'http://' + trimmed;
    }

    // Builds a fresh options object per request so that searches never
    // share (or mutate) a common one. The user agent has to live under
    // `headers`; as a top-level option the request library ignores it.
    function requestOptionsFor (url) {
        return {
            url: url,
            headers: { 'User-Agent': config.crawlers.useragent },
            followAllRedirects: true,
            maxRedirects: 2,
            encoding: null
        };
    }

    /**
     * Tells whether the url is hosted on studieweb.net (or a subdomain of
     * it). The protocol is optional. Only the hostname is inspected, so a
     * url that merely mentions "studieweb.net" in its path or query string
     * does not match.
     *
     * @param {String} url
     * @returns {Boolean} false for non-strings and unparsable urls
     */
    this.isStudiewebURL = function (url) {
        if (typeof url !== 'string')
            return false;

        var hostname;
        try {
            hostname = URLParser.parse(withProtocol(url)).hostname;
        } catch (parseError) {
            // An unparsable url can not be a Studieweb url
            return false;
        }

        if (!hostname)
            return false;

        hostname = hostname.toLowerCase();
        return hostname === domain || endsWith(hostname, '.' + domain);
    }

    /**
     * Builds a Website from the HTML of a Studieweb resource.
     *
     * The title is read from the og:title meta tag, capitalized and
     * stripped of the " - Studieweb.net" suffix. When the tag is missing
     * no title is set on the returned Website.
     *
     * @param {String|Buffer} nodeHTML - HTML handed to cheerio
     * @returns {Website}
     */
    this.parse = function (nodeHTML) {

        var node = new Website();
        var $ = cheerio.load(nodeHTML);

        // Extracts the title of the node from opengraph
        var title = $('meta[property="og:title"]').attr('content');

        if (title !== undefined) {

            // Capitalize the first char of title
            title = title.charAt(0).toUpperCase() + title.slice(1);

            // Removes the "- Studieweb.net" suffix from the title
            if (endsWith(title, titleSuffix))
                title = title.substring(0, title.length - titleSuffix.length);

            // Finally set the title
            node.setTitle(title);
        }

        // Finally return the constructed node (or website if you will)
        return node;
    }

    /**
     * Fetches the resource at url and parses it into a Website.
     *
     * @param {String}   url  - Studieweb url; "http://" is assumed when the
     *                          protocol is left out
     * @param {Function} done - Called with (err) on request failure or a
     *                          404/501 response, otherwise with
     *                          (undefined, website)
     */
    this.search = function (url, done) {
        // The request library needs a protocol, while isStudiewebURL
        // accepts urls without one
        var target = (typeof url === 'string') ? withProtocol(url) : url;

        request.get(requestOptionsFor(target), function (err, response, data) {
            if (err) return done(err);
            if ([404, 501].indexOf(response.statusCode) !== -1)
                return done(new Error('Siden finnes ikke på Studieweb.net'));

            var website = self.parse(data);

            // Set the url to be the one that was actually fetched
            website.setURL(target);

            done(undefined, website);
        });
    }
}

module.exports = Studieweb;
44 changes: 44 additions & 0 deletions test/unit/studieweb.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
var assert = require('assert');
var Studieweb = require('../../parsers/studieweb/studieweb.js');

// Unit tests for the Studieweb parser: URL recognition and title extraction.
// strictEqual is used throughout so that e.g. a null title can not pass for
// undefined and a truthy value can not pass for true.
describe('Studieweb', function () {

    it('should pass if url is from studieweb', function () {
        var studieweb = new Studieweb();
        assert.strictEqual(studieweb.isStudiewebURL('http://studieweb.net/hva-er-metodelaere/'), true);
    });

    it('should not pass if url is not from studieweb', function () {
        var studieweb = new Studieweb();
        assert.strictEqual(studieweb.isStudiewebURL('http://snl.no'), false);
    });

    // The title used to say "studieweb.no" while the url under test is .net
    it('should pass if url is from studieweb.net without protocol', function () {
        var studieweb = new Studieweb();
        assert.strictEqual(studieweb.isStudiewebURL('studieweb.net/sosiologi-og-sosialantropologi/'), true);
    });

    it('should extract title from the html of a resource', function () {
        var studieweb = new Studieweb();
        var node = studieweb.parse('<meta property="og:title" content="Sosiologi og sosialantropologi" />');
        assert.strictEqual(node.getTitle(), 'Sosiologi og sosialantropologi');
    });

    it('should format the title with capital letter', function () {
        var studieweb = new Studieweb();
        var node = studieweb.parse('<meta property="og:title" content="sosiologi og sosialantropologi" />');
        assert.strictEqual(node.getTitle(), 'Sosiologi og sosialantropologi');
    });

    it('should filter out the suffix "- Studieweb.net" from the title tag', function () {
        var studieweb = new Studieweb();
        var node = studieweb.parse('<meta property="og:title" content="Sosiologi og sosialantropologi - Studieweb.net" />');
        assert.strictEqual(node.getTitle(), 'Sosiologi og sosialantropologi');
    });

    it('should return undefined when calling .getTitle when no title is set', function () {
        var studieweb = new Studieweb();
        var node = studieweb.parse('<meta property="og:non-title" content="Sosiologi og sosialantropologi - Studieweb.net" />');
        assert.strictEqual(node.getTitle(), undefined);
    });
});
7 changes: 7 additions & 0 deletions test/unit/website.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,11 @@ describe('website', function () {
website.setPublicationDate(new Date(2015, 5, 1));
assert.equal(website.getHumanfriendlyPublicationDate(), '1. juni 2015');
});

// Regression guard: getTitle() must not throw for a website that was
// constructed with a url but no title.
it('should not crash when calling .getTitle on a website without title', function () {
    var readTitleOfUntitledWebsite = function () {
        var untitled = new Website('https://slettmeg.no/om-oss');
        untitled.getTitle();
    };

    assert.doesNotThrow(readTitleOfUntitledWebsite);
});
});

0 comments on commit 0ba392c

Please sign in to comment.