Skip to content

Commit

Permalink
feat(sync): add sync lol_html rewriter
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 2, 2024
1 parent f7b8b06 commit 7e1eea1
Show file tree
Hide file tree
Showing 15 changed files with 1,091 additions and 112 deletions.
71 changes: 70 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ edition = "2021"

[dependencies]
criterion = { version = "0.5", features = ["html_reports", "async_tokio"] }
fast_html2md = { path = "../fast_html2md", version = "0" }
fast_html2md = { path = "../fast_html2md", version = "0", features = ["tokio"] }
tokio = { version = "1", features = [ "full" ] }

[[bench]]
name = "parse"
Expand Down
14 changes: 13 additions & 1 deletion benches/parse.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use html2md::{parse_html, rewrite_html};
use html2md::{parse_html, rewrite_html, rewrite_html_streaming};
use std::fs::File;
use std::io::Read;

Expand All @@ -23,6 +23,12 @@ pub fn bench_speed(c: &mut Criterion) {
b.iter(|| black_box(rewrite_html(&html, false)))
});

group.bench_function(format!("Async real-world-1: {}", sample_title), |b| {
let rt = tokio::runtime::Runtime::new().unwrap();
b.to_async(rt)
.iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
});

let path = std::path::Path::new("../test-samples/wiki/en-wikipedia-org_wiki_Cat.html");

let mut html = String::new();
Expand All @@ -37,6 +43,12 @@ pub fn bench_speed(c: &mut Criterion) {
b.iter(|| black_box(rewrite_html(&html, false)))
});

group.bench_function(format!("Async Scraper wiki-cat: {}", sample_title), |b| {
let rt = tokio::runtime::Runtime::new().unwrap();
b.to_async(rt)
.iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
});

group.finish();
}

Expand Down
9 changes: 8 additions & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.39"
version = "0.0.40"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand All @@ -21,8 +21,15 @@ percent-encoding = "2"
auto_encoder = "0"
url = "2"
lol_html = "2"
tokio = { version = "1", features = ["sync"], optional = true }
tokio-stream = { version = "0.1", optional = true }

[dev-dependencies]
spectral = "0.6.0"
pretty_assertions = "0.7.2"
indoc = "1.0.3"
tokio = { version = "1", features = ["full"] }

[features]
default = []
tokio = ["dep:tokio", "dep:tokio-stream"]
96 changes: 96 additions & 0 deletions fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ pub fn rewrite_html(html: &str, commonmark: bool) -> String {
rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None).unwrap_or_default()
}

/// Async entry point of the library: converts HTML into Markdown.
///
/// Forwards to `rewriter::writer::convert_html_to_markdown_send` with no custom
/// tag set and no base URL. If the conversion does not yield a value, the
/// default (empty) `String` is returned instead of surfacing the failure.
/// Marked upstream as work in progress aimed at major performance gains.
///
/// # Arguments
/// * `html` - source HTML, borrowed as `&str`
/// * `commonmark` - forwarded to the converter to adjust output for CommonMark
#[cfg(feature = "tokio")]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None).await;

    converted.unwrap_or_default()
}

/// Custom variant of rewrite function.
///
/// You can also override standard tag handlers this way
Expand All @@ -40,6 +51,7 @@ pub fn rewrite_html(html: &str, commonmark: bool) -> String {
/// `custom` is custom tag handler producers for tags you want, can be empty
/// `commonmark` is for adjusting markdown output to commonmark
/// `url` is used to provide absolute url handling
#[cfg(feature = "tokio")]
pub fn rewrite_html_custom_with_url(
html: &str,
custom: &Option<HashSet<String>>,
Expand All @@ -49,6 +61,50 @@ pub fn rewrite_html_custom_with_url(
rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default()
}

/// Async custom variant of the rewrite function with a caller-chosen chunk size.
///
/// Forwards every argument to
/// `rewriter::writer::convert_html_to_markdown_send_with_size`. If the
/// conversion does not yield a value, the default (empty) `String` is returned.
///
/// # Arguments
/// * `html` - source HTML, borrowed as `&str`
/// * `custom` - custom tag handler producers for tags you want, can be empty
/// * `commonmark` - adjusts the Markdown output to CommonMark
/// * `url` - used to provide absolute URL handling
/// * `chunk_size` - the chunk size handed to the converter
#[cfg(feature = "tokio")]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
    chunk_size: usize,
) -> String {
    let converted = rewriter::writer::convert_html_to_markdown_send_with_size(
        html, &custom, commonmark, url, chunk_size,
    )
    .await;

    converted.unwrap_or_default()
}

/// Async custom variant of the rewrite function.
///
/// Forwards every argument to `rewriter::writer::convert_html_to_markdown_send`.
/// If the conversion does not yield a value, the default (empty) `String` is
/// returned.
///
/// # Arguments
/// * `html` - source HTML, borrowed as `&str`
/// * `custom` - custom tag handler producers for tags you want, can be empty
/// * `commonmark` - adjusts the Markdown output to CommonMark
/// * `url` - used to provide absolute URL handling
#[cfg(feature = "tokio")]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &custom, commonmark, url).await;

    converted.unwrap_or_default()
}

/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
Expand All @@ -62,3 +118,43 @@ pub fn clean_markdown(input: &str) -> String {
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
// Byte-input counterpart of `clean_markdown`: delegates entirely to the
// `sift_bytes` method on the input and returns its `String` result.
// NOTE(review): `sift_bytes` is defined outside this view; presumably it
// collapses redundant whitespace — confirm against the trait providing it.
input.sift_bytes()
}

/// Escape Markdown control characters in `input` so they render literally.
///
/// * Each of `<`, `>`, `*`, `\`, `_` and `~` is prefixed with a backslash.
/// * The literal sequence `&nbsp;` is discarded.
/// * Any other `&` is copied through as-is and the characters after it are
///   processed normally, so an HTML entity such as `&amp;` is emitted exactly
///   once and special characters that follow a stray `&` are still escaped.
///
/// Returns `input` unchanged (as an owned `String`) when
/// `MARKDOWN_MIDDLE_KEYCHARS_SET` reports no match.
pub fn replace_markdown_chars(input: &str) -> String {
    use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;

    // Fast path: the matcher found nothing to rewrite.
    // NOTE(review): the matcher is defined elsewhere; presumably it covers every
    // character handled below — confirm against its definition.
    if !MARKDOWN_MIDDLE_KEYCHARS_SET.is_match(input) {
        return input.to_string();
    }

    // The output is at least roughly as long as the input; reserve once up front.
    let mut output = String::with_capacity(input.len());
    let mut chars = input.chars();

    while let Some(ch) = chars.next() {
        match ch {
            '&' => {
                // Look ahead without consuming: only `&nbsp;` is dropped. Nothing
                // is buffered, so an entity can no longer be written twice and no
                // character skips the escaping arm below.
                if let Some(rest) = chars.as_str().strip_prefix("nbsp;") {
                    chars = rest.chars();
                } else {
                    output.push('&');
                }
            }
            '<' | '>' | '*' | '\\' | '_' | '~' => {
                output.push('\\');
                output.push(ch);
            }
            _ => output.push(ch),
        }
    }

    output
}
36 changes: 36 additions & 0 deletions fast_html2md/src/rewriter/anchors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,39 @@ pub(crate) fn rewrite_anchor_element(
}
Ok(())
}

/// Wrap an anchor element in Markdown link syntax (`Send` element variant).
///
/// When the element has an `href`, the value is percent-decoded (lossily, as
/// UTF-8). A decoded value starting with `/` is joined onto `url` when a base
/// is supplied and the join succeeds; otherwise the decoded value is used
/// as-is. `[` is inserted before the element and `](target)` after it, with
/// the target wrapped in `<...>` if it contains a space or an ASCII control
/// character. Elements without an `href` are left untouched.
///
/// `_commonmark` is currently unused. Always returns `Ok(())`.
pub(crate) fn rewrite_anchor_element_send(
    el: &mut lol_html::send::Element,
    _commonmark: bool,
    url: &Option<Url>,
) -> Result<(), std::io::Error> {
    // Nothing to link to: leave the element alone.
    let Some(href) = el.get_attribute("href") else {
        return Ok(());
    };

    let decoded: Cow<'_, str> = percent_decode_str(&href).decode_utf8_lossy();

    // Only root-relative paths are resolved, and only when a base URL exists.
    let target = match url {
        Some(base) if decoded.starts_with('/') => base
            .join(&decoded)
            .map(|joined| joined.to_string())
            .unwrap_or_else(|_| decoded.to_string()),
        _ => decoded.to_string(),
    };

    // Targets with spaces or control characters need the angle-bracket form.
    let needs_brackets = target.chars().any(|c| c == ' ' || c.is_ascii_control());
    let suffix = if needs_brackets {
        format!("](<{}>)", target)
    } else {
        format!("]({})", target)
    };

    el.before("[", Html);
    el.after(&suffix, Html);

    Ok(())
}
Loading

0 comments on commit 7e1eea1

Please sign in to comment.