Skip to content

Commit

Permalink
don't download non-HTML pages (#798)
Browse files Browse the repository at this point in the history
* don't download non-HTML pages

* fix timeouts and HTTP headers
  • Loading branch information
JaneJeon authored Aug 19, 2023
1 parent 19304d6 commit 5258166
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 16 deletions.
9 changes: 7 additions & 2 deletions __utils__/mock-http-requests.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ nock.enableNetConnect('127.0.0.1')

// Mock HTTP timeouts
;['https://timeout.com', 'www.timeout.com'].forEach(url => {
nock(url).get('/').delay(1000000).reply(200, '<html></html>').persist()
nock(url)
.get('/')
.delay(1_000_000)
.reply(200, '<html></html>', { 'Content-Type': 'text/html' })
.persist()
})

// For a couple of "stock" websites, prevent actually hitting them
Expand All @@ -31,7 +35,8 @@ nock.enableNetConnect('127.0.0.1')
<body>
<p>Hello!</p>
</body>
</html>`
</html>`,
{ 'Content-Type': 'text/html' }
)
.persist()
})
42 changes: 42 additions & 0 deletions lib/got.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
const got = require('got')

module.exports = got.default.extend({
handlers: [
(options, next) => {
const promiseOrStream = next(options)

// A destroy function that supports both promises and streams.
// For newer versions, we could use abortcontroller, but alas...
const destroy = message => {
if (options.isStream) {
promiseOrStream.destroy(message)
return
}

// Also note that got v11 is a fucking troll and won't actually pass on the cancellation reason.
promiseOrStream.cancel(message)
}

promiseOrStream.on('response', response => {
const contentType = response.headers['content-type']

// The goal is to not download *anything* if it's not HTML,
// not only because we can't get metadata from non-HTML responses,
// but also because non-HTML responses may cause us to download some gigantic payload.
if (contentType && contentType.startsWith('text/html')) {
options.context.requestLogger.info(
`Received an HTML page. Returning response as-is.`
)
return
}

options.context.requestLogger.info(
`Received a non-HTML response. Aborting early.`
)
destroy('Not an HTML response')
})

return promiseOrStream
}
]
})
26 changes: 13 additions & 13 deletions lib/scrape.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
const got = require('got')
const ms = require('ms')
const metascraper = require('metascraper')([
require('metascraper-author')(),
Expand All @@ -11,7 +10,8 @@ const metascraper = require('metascraper')([
require('metascraper-title')()
])
const httpError = require('http-errors')
const log = require('./logger')
const got = require('./got')
const logger = require('./logger')

// const nock = require('nock')
// nock.disableNetConnect()
Expand All @@ -25,25 +25,25 @@ const log = require('./logger')
const timeoutMs = ms(process.env.LINK_TIMEOUT)

module.exports = async url => {
log.info(`Scraping %s for metadata...`, url)
const requestLogger = logger.child({ url })
requestLogger.info(`Scraping %s for metadata...`, url)

try {
const promise = got(url, {
timeout: { request: timeoutMs }
const { body: html, url: finalUrl } = await got(url, {
// Got is fucking stupid and this is the only way we can actually get the fucking timeouts to work.
timeout: { socket: timeoutMs, request: timeoutMs },
context: { requestLogger }
})
// TODO: just rely on got's built-in timeout once got v12 comes out
setTimeout(() => {
// At the moment, got's timeout doesn't work for shit
promise.cancel()
}, timeoutMs)

const { body: html, url: finalUrl } = await promise
return metascraper({ html, url: finalUrl })
} catch (err) {
if (err.name === 'RequestError' && err.code === 'ENOTFOUND')
throw httpError(404, 'The address to shorten does not exist!')
if (err.name === 'CancelError')
if (err.name === 'TimeoutError')
throw httpError(504, 'Could not scrape link in time!')
// If we were able to reach an actual thing at the other end,
// but the request got canceled because it's not an HTML,
// we don't care about it as we cannot get any useful metadata from the response.
if (err.name === 'CancelError') return null
else throw err
}
}
3 changes: 2 additions & 1 deletion models/link.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ class Link extends hashId(BaseModel) {
await super.$beforeInsert(queryContext)

// update metadata by visiting the URL
this.meta = merge(await scrape(this.originalUrl), this.meta)
const scrapedMetadata = await scrape(this.originalUrl)
this.meta = merge({}, this.meta, scrapedMetadata)
}

static get virtualAttributes() {
Expand Down

1 comment on commit 5258166

@vercel
Copy link

@vercel vercel bot commented on 5258166 Aug 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

blink – ./

blink-janejeon.vercel.app
blink-git-master-janejeon.vercel.app
docs.blink.rest

Please sign in to comment.