From b34740507f6b590b3b3100518c02335a196b987a Mon Sep 17 00:00:00 2001 From: Michael McMillan Date: Wed, 15 Jul 2015 10:36:32 +0200 Subject: [PATCH 1/3] fixed bug where exception would be thrown when calling .getTitle on website without title --- models/website.js | 14 +++++++++----- test/unit/website.js | 7 +++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/models/website.js b/models/website.js index ca19a62..b13e022 100644 --- a/models/website.js +++ b/models/website.js @@ -10,10 +10,12 @@ function Website (url, title) { var publicationDate; this.getHostname = function () { - var hostname = validator.parse(url).hostname; - return hostname.charAt(0).toUpperCase() + - hostname.slice(1) - .substring(0, hostname.lastIndexOf('.') - 1); + if (url !== undefined) { + var hostname = validator.parse(url).hostname; + return hostname.charAt(0).toUpperCase() + + hostname.slice(1) + .substring(0, hostname.lastIndexOf('.') - 1); + } } this.addAuthor = function (author) { @@ -66,8 +68,10 @@ function Website (url, title) { this.getTitle = function () { if (title !== undefined) return title; - else + else if (this.getHostname() !== undefined) return this.getHostname(); + else + return undefined; } } diff --git a/test/unit/website.js b/test/unit/website.js index ee958ee..b30405f 100644 --- a/test/unit/website.js +++ b/test/unit/website.js @@ -55,4 +55,11 @@ describe('website', function () { website.setPublicationDate(new Date(2015, 5, 1)); assert.equal(website.getHumanfriendlyPublicationDate(), '1. juni 2015'); }); + + it('should not crash when calling .getTitle on a website without title', function () { + assert.doesNotThrow(function () { + var website = new Website('https://slettmeg.no/om-oss'); + website.getTitle(); + }); + }); }); From f61e367a0f65568e7acd03ce36b655a44adaf8c8 Mon Sep 17 00:00:00 2001 From: Michael McMillan Date: Wed, 15 Jul 2015 10:37:02 +0200 Subject: [PATCH 2/3] added controller and funnel for studieweb.net --- controllers/query/funnel.js | 7 ++++++ controllers/query/studieweb.js | 46 ++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 controllers/query/studieweb.js diff --git a/controllers/query/funnel.js b/controllers/query/funnel.js index 1e3dbfd..1c79b7a 100644 --- a/controllers/query/funnel.js +++ b/controllers/query/funnel.js @@ -13,6 +13,9 @@ var SNL = require('../../parsers/snl/snl.js'); var NDLAController = require('./ndla.js'); var NDLA = require('../../parsers/ndla/ndla.js'); +var StudiewebController = require('./studieweb.js'); +var Studieweb = require('../../parsers/studieweb/studieweb.js'); + var ReadabilityController = require('./readability.js'); var Readability = require('../../parsers/readability/readability.js'); @@ -30,6 +33,7 @@ function FunnelController (req, res, next) { // External components we can query var readability = new Readability(); var wikipedia = new Wikipedia(); + var studieweb = new Studieweb(); var bibsys = new Bibsys(); var ndla = new NDLA(); var snl = new SNL(); @@ -41,6 +45,9 @@ function FunnelController (req, res, next) { else if (ndla.isNDLAURL(req.query.q)) NDLAController(req, res, next); + else if (studieweb.isStudiewebURL(req.query.q)) + StudiewebController(req, res, next); + else if (readability.isURL(req.query.q)) ReadabilityController(req, res, next); diff --git a/controllers/query/studieweb.js b/controllers/query/studieweb.js new file mode 100644 index 0000000..5e5d058 --- /dev/null +++ b/controllers/query/studieweb.js @@ -0,0 +1,46 @@ +var config = require('../../config.js'); +var logger = require('../../log/logger.js'); +var Studieweb = require('../../parsers/studieweb/studieweb.js'); +var QueryFactory = require('../../database/factories/query.js'); +var WebsiteFactory = require('../../database/factories/website.js'); +var ListFactory = require('../../database/factories/list.js'); +var ResultController = require('./result.js'); + +function StudiewebController (req, res, next) { + var queryString = req.query.q; + var studieweb = new Studieweb(); + + QueryFactory.read(queryString, 'website', function (err, cachedWebsites) { + if (err) return next(err); + + // If the cache returned website lets not ask Studieweb + if (cachedWebsites.length > 0) { + logger.log('debug', 'Found Studieweb cache with %d website for "%s"', + cachedWebsites.length, queryString); + + ResultController(cachedWebsites, false, req, res, next); + + // Empty cache means we ask Studieweb + } else { + logger.profile('Studieweb query'); + + studieweb.search(queryString, function (err, website) { + if (err) return next(err); + + logger.profile('Studieweb query'); + logger.log('debug', 'Studieweb returned result'); + + // Store all the websites + WebsiteFactory.create(website, function (err, createdWebsite) { + if (err) return next(err); + logger.log('debug', 'Created website in the database'); + + // Cache the results to the query string + ResultController([createdWebsite], true, req, res, next); + }); + }); + } + }); +} + +module.exports = StudiewebController; From abab45fc8ef1898f285e317fd9abab78d429f02d Mon Sep 17 00:00:00 2001 From: Michael McMillan Date: Wed, 15 Jul 2015 10:37:18 +0200 Subject: [PATCH 3/3] added external component for studieweb.net (parser) --- parsers/studieweb/studieweb.js | 68 ++++++++++++++++++++++++++++++++++ test/unit/studieweb.js | 44 ++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 parsers/studieweb/studieweb.js create mode 100644 test/unit/studieweb.js diff --git a/parsers/studieweb/studieweb.js b/parsers/studieweb/studieweb.js new file mode 100644 index 0000000..dfa6810 --- /dev/null +++ b/parsers/studieweb/studieweb.js @@ -0,0 +1,68 @@ +var config = require('../../config.js'); +var URLParser = require('url'); +var querystring = require('querystring'); +var request = require('request'); +var cheerio = require('cheerio'); +var moment = require('moment'); +var Website = require('../../models/website.js'); +var Author = require('../../models/author.js'); + +function Studieweb () { + + var self = this; + var options = { + 'User-Agent': config.crawlers.useragent, + followAllRedirects: true, + maxRedirects: 2, + encoding: null + } + + this.isStudiewebURL = function (url) { + return (url.indexOf('studieweb.net') !== -1); + } + + this.parse = function (nodeHTML) { + + var node = new Website(); + var $ = cheerio.load(nodeHTML); + + // Extracts the title of the node from opengraph and capitalizes it + var ogTitle = $('meta[property="og:title"]'); + var title = ogTitle.attr('content'); + + if (title !== undefined) { + + // Capitalize the first char of title + title = title.charAt(0).toUpperCase() + title.slice(1); + + // Removes the "- Studieweb.net" suffix from the title + var suffix = ' - Studieweb.net'; + if (title.indexOf(suffix, title.length - suffix.length) !== -1) + title = title.substring(0, title.length - suffix.length); + + // Finally set the title + node.setTitle(title); + } + + // Finally return the constructed node (or website if you will) + return node; + } + + this.search = function (url, done) { + options.url = url; + request.get(options, function (err, response, data) { + if (err) return done(err); + if ([404, 501].indexOf(response.statusCode) !== -1) + return done(new Error('Siden finnes ikke på Studieweb.net')); + + var website = self.parse(data); + + // Set the url to be the the provided url + website.setURL(url); + + done(undefined, website); + }); + } +} + +module.exports = Studieweb; diff --git a/test/unit/studieweb.js b/test/unit/studieweb.js new file mode 100644 index 0000000..d0faba9 --- /dev/null +++ b/test/unit/studieweb.js @@ -0,0 +1,44 @@ +var assert = require('assert'); +var Studieweb = require('../../parsers/studieweb/studieweb.js'); + +describe('Studieweb', function () { + + it('should pass if url is from studieweb', function () { + var studieweb = new Studieweb(); + assert.equal(studieweb.isStudiewebURL('http://studieweb.net/hva-er-metodelaere/'), true); + }); + + it('should not pass if url is not from studieweb', function () { + var studieweb = new Studieweb(); + assert.equal(studieweb.isStudiewebURL('http://snl.no'), false); + }); + + it('should pass if url is from studieweb.no without protocol', function () { + var studieweb = new Studieweb(); + assert.equal(studieweb.isStudiewebURL('studieweb.net/sosiologi-og-sosialantropologi/'), true); + }); + + it('should extract title from the html of a resource', function () { + var studieweb = new Studieweb(); + var node = studieweb.parse(''); + assert.equal(node.getTitle(), 'Sosiologi og sosialantropologi'); + }); + + it('should format the title with capital letter', function () { + var studieweb = new Studieweb(); + var node = studieweb.parse(''); + assert.equal(node.getTitle(), 'Sosiologi og sosialantropologi'); + }); + + it('should filter out the suffix "- Studieweb.net" from the title tag', function () { + var studieweb = new Studieweb(); + var node = studieweb.parse(''); + assert.equal(node.getTitle(), 'Sosiologi og sosialantropologi'); + }); + + it('should return undefined when calling .getTitle when no title is set', function () { + var studieweb = new Studieweb(); + var node = studieweb.parse(''); + assert.equal(node.getTitle(), undefined); + }); +});