Skip to content

Commit

Permalink
Split document fetching responsibilities out of `HtmlMetadataExtracto…
Browse files Browse the repository at this point in the history
…r` into a new class, `DocumentFetcher`
  • Loading branch information
chimbori committed Dec 26, 2024
1 parent ce0ee17 commit d0a8814
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 33 deletions.
13 changes: 9 additions & 4 deletions src/main/kotlin/com/chimbori/crux/Crux.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import com.chimbori.crux.api.Resource
import com.chimbori.crux.api.Rewriter
import com.chimbori.crux.common.CHROME_USER_AGENT
import com.chimbori.crux.plugins.AmpRedirector
import com.chimbori.crux.plugins.DocumentFetcher
import com.chimbori.crux.plugins.FacebookUrlRewriter
import com.chimbori.crux.plugins.FaviconExtractor
import com.chimbori.crux.plugins.GoogleUrlRewriter
Expand All @@ -23,18 +24,22 @@ import org.jsoup.nodes.Document
* choose from the set of available default plugins to create their own configuration.
*/
public fun createDefaultPlugins(okHttpClient: OkHttpClient): List<Plugin> = listOf(
// Rewriters
// Rewriters ----------------------------------------------------------------

// Static redirectors go first, to avoid getting stuck on CAPTCHAs.
GoogleUrlRewriter(),
FacebookUrlRewriter(),
// Remove any tracking parameters remaining.
TrackingParameterRemover(),

// Extractors
// Fetcher ------------------------------------------------------------------
// Fetches the Web page, so this must be the first [Extractor].
DocumentFetcher(okHttpClient),

// Parses many standard HTML metadata attributes. Fetches the Web page, so this must be the first [Extractor].
HtmlMetadataExtractor(okHttpClient),
// Extractors ---------------------------------------------------------------

// Parses many standard HTML metadata attributes.
HtmlMetadataExtractor(),
// Prefer canonical URLs over AMP URLs.
AmpRedirector(refetchContentFromCanonicalUrl = true, okHttpClient),
// Fetches and parses the Web Manifest. May replace existing favicon URL with one from the manifest.json.
Expand Down
39 changes: 39 additions & 0 deletions src/main/kotlin/com/chimbori/crux/plugins/DocumentFetcher.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.chimbori.crux.plugins

import com.chimbori.crux.api.Extractor
import com.chimbori.crux.api.Fields.CANONICAL_URL
import com.chimbori.crux.api.Resource
import com.chimbori.crux.common.fetchFromUrl
import com.chimbori.crux.common.isLikelyArticle
import com.chimbori.crux.extractors.extractCanonicalUrl
import okhttp3.HttpUrl
import okhttp3.OkHttpClient

/**
 * An [Extractor] that makes a parsed document available on the [Resource] it returns.
 *
 * - When the incoming [Resource] already carries a document, that document is reused and nothing is fetched.
 * - Otherwise, when the incoming [Resource] has a URL, the page is fetched via [fetchFromUrl] using the supplied
 *   [OkHttpClient].
 * - When neither is present, an empty [Resource] is used.
 *
 * The returned [Resource] also carries the canonical URL (see [extract]).
 */
public class DocumentFetcher(private val okHttpClient: OkHttpClient) : Extractor {
  /** Only handles URLs accepted by [isLikelyArticle], i.e. skips file extensions unlikely to be HTML pages. */
  public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle()

  /**
   * Returns a [Resource] containing the (possibly newly fetched) document, with both its `url` and the
   * [CANONICAL_URL] metadata entry set to the canonical URL declared by the document, resolved against the
   * resource URL. Falls back to the resource URL when no canonical URL can be resolved. Entries whose value is
   * `null` are dropped via `removeNullValues()`.
   */
  override suspend fun extract(request: Resource): Resource {
    val requestedUrl = request.url
    val source = when {
      // A document was supplied by the caller or an earlier plugin; do not fetch again.
      request.document != null -> request
      requestedUrl != null -> Resource.fetchFromUrl(requestedUrl, okHttpClient)
      else -> Resource()
    }

    val baseUrl = source.url
    val parsedDocument = source.document

    // Prefer the canonical URL declared in the document, resolved against the base URL.
    val declaredCanonicalUrl = parsedDocument
      ?.extractCanonicalUrl()
      ?.let { href -> baseUrl?.resolve(href) }
    val effectiveUrl = declaredCanonicalUrl ?: baseUrl

    return Resource(
      url = effectiveUrl,
      document = parsedDocument,
      metadata = mapOf(CANONICAL_URL to effectiveUrl)
    ).removeNullValues()
  }
}
46 changes: 18 additions & 28 deletions src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import com.chimbori.crux.api.Fields.THEME_COLOR_HEX
import com.chimbori.crux.api.Fields.TITLE
import com.chimbori.crux.api.Fields.VIDEO_URL
import com.chimbori.crux.api.Resource
import com.chimbori.crux.common.fetchFromUrl
import com.chimbori.crux.common.isLikelyArticle
import com.chimbori.crux.extractors.extractAmpUrl
import com.chimbori.crux.extractors.extractCanonicalUrl
Expand All @@ -32,49 +31,40 @@ import com.chimbori.crux.extractors.extractThemeColor
import com.chimbori.crux.extractors.extractTitle
import com.chimbori.crux.extractors.extractVideoUrl
import okhttp3.HttpUrl
import okhttp3.OkHttpClient

/**
* Extracts common well-defined metadata fields from an HTML DOM tree. Includes support for:
* - Twitter Cards Metadata: https://developer.twitter.com/en/docs/twitter-for-websites/cards/overview/markup
* - Open Graph Protocol: https://ogp.me/
* - AMP Spec: https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/
*/
public class HtmlMetadataExtractor(private val okHttpClient: OkHttpClient) : Extractor {
public class HtmlMetadataExtractor : Extractor {
/** Skip handling any file extensions that are unlikely to be HTML pages. */
public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle()

override suspend fun extract(request: Resource): Resource {
val resourceToUse = if (request.document != null) {
request
} else if (request.url != null) {
Resource.fetchFromUrl(request.url, okHttpClient)
} else {
Resource()
}

val canonicalUrl = resourceToUse.document?.extractCanonicalUrl()
?.let { resourceToUse.url?.resolve(it) }
?: resourceToUse.url
val canonicalUrl = request.document?.extractCanonicalUrl()
?.let { request.url?.resolve(it) }
?: request.url

return Resource(
url = canonicalUrl,
document = resourceToUse.document,
document = request.document,
metadata = mapOf(
CANONICAL_URL to canonicalUrl,
TITLE to resourceToUse.document?.extractTitle(),
DESCRIPTION to resourceToUse.document?.extractDescription(),
SITE_NAME to resourceToUse.document?.extractSiteName(),
THEME_COLOR_HEX to resourceToUse.document?.extractThemeColor(),
PUBLISHED_AT to resourceToUse.document?.extractPublishedAt(),
MODIFIED_AT to resourceToUse.document?.extractModifiedAt(),
KEYWORDS_CSV to resourceToUse.document?.extractKeywords()?.joinToString(separator = ","),
NEXT_PAGE_URL to resourceToUse.document?.extractPaginationUrl(resourceToUse.url, "next"),
PREVIOUS_PAGE_URL to resourceToUse.document?.extractPaginationUrl(resourceToUse.url, "prev"),
BANNER_IMAGE_URL to resourceToUse.document?.extractImageUrl(canonicalUrl),
FEED_URL to resourceToUse.document?.extractFeedUrl(canonicalUrl),
AMP_URL to resourceToUse.document?.extractAmpUrl(canonicalUrl),
VIDEO_URL to resourceToUse.document?.extractVideoUrl(canonicalUrl),
TITLE to request.document?.extractTitle(),
DESCRIPTION to request.document?.extractDescription(),
SITE_NAME to request.document?.extractSiteName(),
THEME_COLOR_HEX to request.document?.extractThemeColor(),
PUBLISHED_AT to request.document?.extractPublishedAt(),
MODIFIED_AT to request.document?.extractModifiedAt(),
KEYWORDS_CSV to request.document?.extractKeywords()?.joinToString(separator = ","),
NEXT_PAGE_URL to request.document?.extractPaginationUrl(request.url, "next"),
PREVIOUS_PAGE_URL to request.document?.extractPaginationUrl(request.url, "prev"),
BANNER_IMAGE_URL to request.document?.extractImageUrl(canonicalUrl),
FEED_URL to request.document?.extractFeedUrl(canonicalUrl),
AMP_URL to request.document?.extractAmpUrl(canonicalUrl),
VIDEO_URL to request.document?.extractVideoUrl(canonicalUrl),
)
).removeNullValues()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class HtmlMetadataExtractorTest {
@Before
fun setUp() {
mockWebServer = MockWebServer().apply { start() }
htmlMetadataExtractor = HtmlMetadataExtractor(loggingOkHttpClient)
htmlMetadataExtractor = HtmlMetadataExtractor()
}

@After
Expand Down

0 comments on commit d0a8814

Please sign in to comment.