diff --git a/src/main/kotlin/com/chimbori/crux/Crux.kt b/src/main/kotlin/com/chimbori/crux/Crux.kt index c271c8b..4c49994 100644 --- a/src/main/kotlin/com/chimbori/crux/Crux.kt +++ b/src/main/kotlin/com/chimbori/crux/Crux.kt @@ -6,6 +6,7 @@ import com.chimbori.crux.api.Resource import com.chimbori.crux.api.Rewriter import com.chimbori.crux.common.CHROME_USER_AGENT import com.chimbori.crux.plugins.AmpRedirector +import com.chimbori.crux.plugins.DocumentFetcher import com.chimbori.crux.plugins.FacebookUrlRewriter import com.chimbori.crux.plugins.FaviconExtractor import com.chimbori.crux.plugins.GoogleUrlRewriter @@ -23,7 +24,7 @@ import org.jsoup.nodes.Document * choose from the set of available default plugins to create their own configuration. */ public fun createDefaultPlugins(okHttpClient: OkHttpClient): List = listOf( - // Rewriters + // Rewriters ---------------------------------------------------------------- // Static redirectors go first, to avoid getting stuck into CAPTCHAs. GoogleUrlRewriter(), @@ -31,10 +32,14 @@ public fun createDefaultPlugins(okHttpClient: OkHttpClient): List = list // Remove any tracking parameters remaining. TrackingParameterRemover(), - // Extractors + // Fetcher ------------------------------------------------------------------ + // Fetches the Web page, so this must be the first [Extractor]. + DocumentFetcher(okHttpClient), - // Parses many standard HTML metadata attributes. Fetches the Web page, so this must be the first [Extractor]. - HtmlMetadataExtractor(okHttpClient), + // Extractors --------------------------------------------------------------- + + // Parses many standard HTML metadata attributes. + HtmlMetadataExtractor(), // Prefer canonical URLs over AMP URLs. AmpRedirector(refetchContentFromCanonicalUrl = true, okHttpClient), // Fetches and parses the Web Manifest. May replace existing favicon URL with one from the manifest.json. diff --git a/src/main/kotlin/com/chimbori/crux/plugins/DocumentFetcher.kt b/src/main/kotlin/com/chimbori/crux/plugins/DocumentFetcher.kt new file mode 100644 index 0000000..ce0f76a --- /dev/null +++ b/src/main/kotlin/com/chimbori/crux/plugins/DocumentFetcher.kt @@ -0,0 +1,39 @@ +package com.chimbori.crux.plugins + +import com.chimbori.crux.api.Extractor +import com.chimbori.crux.api.Fields.CANONICAL_URL +import com.chimbori.crux.api.Resource +import com.chimbori.crux.common.fetchFromUrl +import com.chimbori.crux.common.isLikelyArticle +import com.chimbori.crux.extractors.extractCanonicalUrl +import okhttp3.HttpUrl +import okhttp3.OkHttpClient + +/** + * Fetches an HTML document from a remote URL, if not already fetched. + * If a parsed JSoup Document is already available, this is a no-op. + */ +public class DocumentFetcher(private val okHttpClient: OkHttpClient) : Extractor { + /** Skip handling any file extensions that are unlikely to be HTML pages. */ + public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle() + + override suspend fun extract(request: Resource): Resource { + val resourceToUse = if (request.document != null) { + request + } else if (request.url != null) { + Resource.fetchFromUrl(request.url, okHttpClient) + } else { + Resource() + } + + val canonicalUrl = resourceToUse.document?.extractCanonicalUrl() + ?.let { resourceToUse.url?.resolve(it) } + ?: resourceToUse.url + + return Resource( + url = canonicalUrl, + document = resourceToUse.document, + metadata = mapOf(CANONICAL_URL to canonicalUrl) + ).removeNullValues() + } +} diff --git a/src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt b/src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt index 9bccea5..69c225a 100644 --- a/src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt +++ b/src/main/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractor.kt @@ -16,7 +16,6 @@ import com.chimbori.crux.api.Fields.THEME_COLOR_HEX import com.chimbori.crux.api.Fields.TITLE import com.chimbori.crux.api.Fields.VIDEO_URL import com.chimbori.crux.api.Resource -import com.chimbori.crux.common.fetchFromUrl import com.chimbori.crux.common.isLikelyArticle import com.chimbori.crux.extractors.extractAmpUrl import com.chimbori.crux.extractors.extractCanonicalUrl @@ -32,7 +31,6 @@ import com.chimbori.crux.extractors.extractThemeColor import com.chimbori.crux.extractors.extractTitle import com.chimbori.crux.extractors.extractVideoUrl import okhttp3.HttpUrl -import okhttp3.OkHttpClient /** * Extracts common well-defined metadata fields from an HTML DOM tree. Includes support for: @@ -40,41 +38,33 @@ import okhttp3.OkHttpClient * - Open Graph Protocol: https://ogp.me/ * - AMP Spec: https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/ */ -public class HtmlMetadataExtractor(private val okHttpClient: OkHttpClient) : Extractor { +public class HtmlMetadataExtractor : Extractor { /** Skip handling any file extensions that are unlikely to be HTML pages. */ public override fun canExtract(url: HttpUrl): Boolean = url.isLikelyArticle() override suspend fun extract(request: Resource): Resource { - val resourceToUse = if (request.document != null) { - request - } else if (request.url != null) { - Resource.fetchFromUrl(request.url, okHttpClient) - } else { - Resource() - } - - val canonicalUrl = resourceToUse.document?.extractCanonicalUrl() - ?.let { resourceToUse.url?.resolve(it) } - ?: resourceToUse.url + val canonicalUrl = request.document?.extractCanonicalUrl() + ?.let { request.url?.resolve(it) } + ?: request.url return Resource( url = canonicalUrl, - document = resourceToUse.document, + document = request.document, metadata = mapOf( CANONICAL_URL to canonicalUrl, - TITLE to resourceToUse.document?.extractTitle(), - DESCRIPTION to resourceToUse.document?.extractDescription(), - SITE_NAME to resourceToUse.document?.extractSiteName(), - THEME_COLOR_HEX to resourceToUse.document?.extractThemeColor(), - PUBLISHED_AT to resourceToUse.document?.extractPublishedAt(), - MODIFIED_AT to resourceToUse.document?.extractModifiedAt(), - KEYWORDS_CSV to resourceToUse.document?.extractKeywords()?.joinToString(separator = ","), - NEXT_PAGE_URL to resourceToUse.document?.extractPaginationUrl(resourceToUse.url, "next"), - PREVIOUS_PAGE_URL to resourceToUse.document?.extractPaginationUrl(resourceToUse.url, "prev"), - BANNER_IMAGE_URL to resourceToUse.document?.extractImageUrl(canonicalUrl), - FEED_URL to resourceToUse.document?.extractFeedUrl(canonicalUrl), - AMP_URL to resourceToUse.document?.extractAmpUrl(canonicalUrl), - VIDEO_URL to resourceToUse.document?.extractVideoUrl(canonicalUrl), + TITLE to request.document?.extractTitle(), + DESCRIPTION to request.document?.extractDescription(), + SITE_NAME to request.document?.extractSiteName(), + THEME_COLOR_HEX to request.document?.extractThemeColor(), + PUBLISHED_AT to request.document?.extractPublishedAt(), + MODIFIED_AT to request.document?.extractModifiedAt(), + KEYWORDS_CSV to request.document?.extractKeywords()?.joinToString(separator = ","), + NEXT_PAGE_URL to request.document?.extractPaginationUrl(request.url, "next"), + PREVIOUS_PAGE_URL to request.document?.extractPaginationUrl(request.url, "prev"), + BANNER_IMAGE_URL to request.document?.extractImageUrl(canonicalUrl), + FEED_URL to request.document?.extractFeedUrl(canonicalUrl), + AMP_URL to request.document?.extractAmpUrl(canonicalUrl), + VIDEO_URL to request.document?.extractVideoUrl(canonicalUrl), ) ).removeNullValues() } diff --git a/src/test/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractorTest.kt b/src/test/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractorTest.kt index cf00a89..af7af1e 100644 --- a/src/test/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractorTest.kt +++ b/src/test/kotlin/com/chimbori/crux/plugins/HtmlMetadataExtractorTest.kt @@ -28,7 +28,7 @@ class HtmlMetadataExtractorTest { @Before fun setUp() { mockWebServer = MockWebServer().apply { start() } - htmlMetadataExtractor = HtmlMetadataExtractor(loggingOkHttpClient) + htmlMetadataExtractor = HtmlMetadataExtractor() } @After