diff --git a/Cargo.lock b/Cargo.lock index 8aaab3d..16e96bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -99,6 +99,7 @@ version = "0.0.0" dependencies = [ "criterion", "fast_html2md", + "tokio", ] [[package]] @@ -125,6 +126,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bytes" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" + [[package]] name = "cast" version = "0.3.0" @@ -374,7 +381,7 @@ dependencies = [ [[package]] name = "fast_html2md" -version = "0.0.39" +version = "0.0.40" dependencies = [ "auto_encoder", "html5ever", @@ -386,6 +393,8 @@ dependencies = [ "pretty_assertions", "regex", "spectral", + "tokio", + "tokio-stream", "url", ] @@ -855,6 +864,17 @@ dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.52.0", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -1505,6 +1525,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + [[package]] name = "siphasher" version = "0.3.11" @@ -1517,6 +1546,16 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "socket2" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "spectral" version = "0.6.0" @@ -1655,7 +1694,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" dependencies = [ "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "tokio-stream" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", ] [[package]] diff --git a/benches/Cargo.toml b/benches/Cargo.toml index dae08e3..7bf2bb9 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -6,7 +6,8 @@ edition = "2021" [dependencies] criterion = { version = "0.5", features = ["html_reports", "async_tokio"] } -fast_html2md = { path = "../fast_html2md", version = "0" } +fast_html2md = { path = "../fast_html2md", version = "0", features = ["tokio"] } +tokio = { version = "1", features = [ "full" ] } [[bench]] name = "parse" diff --git a/benches/parse.rs 
b/benches/parse.rs
index e034533..2aa2440 100644
--- a/benches/parse.rs
+++ b/benches/parse.rs
@@ -1,5 +1,5 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use html2md::{parse_html, rewrite_html};
+use html2md::{parse_html, rewrite_html, rewrite_html_streaming};
 use std::fs::File;
 use std::io::Read;
@@ -23,6 +23,12 @@ pub fn bench_speed(c: &mut Criterion) {
         b.iter(|| black_box(rewrite_html(&html, false)))
     });
 
+    group.bench_function(format!("Async real-world-1: {}", sample_title), |b| {
+        let rt = tokio::runtime::Runtime::new().unwrap();
+        b.to_async(rt)
+            .iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
+    });
+
     let path = std::path::Path::new("../test-samples/wiki/en-wikipedia-org_wiki_Cat.html");
 
     let mut html = String::new();
@@ -37,6 +43,12 @@ pub fn bench_speed(c: &mut Criterion) {
         b.iter(|| black_box(rewrite_html(&html, false)))
     });
 
+    group.bench_function(format!("Async Scraper wiki-cat: {}", sample_title), |b| {
+        let rt = tokio::runtime::Runtime::new().unwrap();
+        b.to_async(rt)
+            .iter(|| async { black_box(rewrite_html_streaming(&html, false).await) });
+    });
+
     group.finish();
 }
diff --git a/fast_html2md/Cargo.toml b/fast_html2md/Cargo.toml
index 10ae564..be27812 100644
--- a/fast_html2md/Cargo.toml
+++ b/fast_html2md/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fast_html2md"
-version = "0.0.39"
+version = "0.0.40"
 edition = "2021"
 description = "A fast html2md crate for rust"
 categories = ["development-tools", "parsing", "parser-implementations"]
@@ -21,8 +21,15 @@ percent-encoding = "2"
 auto_encoder = "0"
 url = "2"
 lol_html = "2"
+tokio = { version = "1", features = ["sync"], optional = true }
+tokio-stream = { version = "0.1", optional = true }
 
 [dev-dependencies]
 spectral = "0.6.0"
 pretty_assertions = "0.7.2"
 indoc = "1.0.3"
+tokio = { version = "1", features = ["full"] }
+
+[features]
+default = []
+tokio = ["dep:tokio", "dep:tokio-stream"]
diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs
index ed6812b..85d239a 100644
--- a/fast_html2md/src/lib.rs
+++ b/fast_html2md/src/lib.rs
@@ -32,6 +32,17 @@ pub fn rewrite_html(html: &str, commonmark: bool) -> String {
     rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None).unwrap_or_default()
 }
 
+/// Main async streaming function of this library. Rewrites incoming HTML, converts it into Markdown,
+/// and returns the converted string. Work in progress aimed at major performance increases.
+/// # Arguments
+/// `html` is source HTML as `String`
+#[cfg(feature = "tokio")]
+pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
+    rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None)
+        .await
+        .unwrap_or_default()
+}
+
 /// Custom variant of rewrite function.
 ///
 /// You can also override standard tag handlers this way
 /// # Arguments
 /// `html` is source HTML as `String`
 /// `custom` is custom tag handler producers for tags you want, can be empty
 /// `commonmark` is for adjusting markdown output to commonmark
 /// `url` is used to provide absolute url handling
+#[cfg(feature = "tokio")]
 pub fn rewrite_html_custom_with_url(
     html: &str,
     custom: &Option<std::collections::HashSet<String>>,
     commonmark: bool,
@@ -49,6 +61,50 @@
     rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default()
 }
 
+/// Custom variant of rewrite function.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+/// `url` is used to provide absolute url handling
+/// `chunk_size` is the chunk size to use.
+#[cfg(feature = "tokio")]
+pub async fn rewrite_html_custom_with_url_and_chunk(
+    html: &str,
+    custom: &Option<std::collections::HashSet<String>>,
+    commonmark: bool,
+    url: &Option<Url>,
+    chunk_size: usize,
+) -> String {
+    rewriter::writer::convert_html_to_markdown_send_with_size(
+        html, &custom, commonmark, url, chunk_size,
+    )
+    .await
+    .unwrap_or_default()
+}
+
+/// Custom streaming async variant of rewrite function.
+///
+/// You can also override standard tag handlers this way
+/// # Arguments
+/// `html` is source HTML as `String`
+/// `custom` is custom tag handler producers for tags you want, can be empty
+/// `commonmark` is for adjusting markdown output to commonmark
+/// `url` is used to provide absolute url handling
+#[cfg(feature = "tokio")]
+pub async fn rewrite_html_custom_with_url_streaming(
+    html: &str,
+    custom: &Option<std::collections::HashSet<String>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> String {
+    rewriter::writer::convert_html_to_markdown_send(html, &custom, commonmark, url)
+        .await
+        .unwrap_or_default()
+}
+
 /// Called after all processing has been finished
 ///
 /// Clears excessive punctuation that would be trimmed by renderer anyway
@@ -62,3 +118,43 @@ pub fn clean_markdown(input: &str) -> String {
 pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
     input.sift_bytes()
 }
+
+/// Replace the markdown chars cleanly.
+pub fn replace_markdown_chars(input: &str) -> String {
+    use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;
+
+    if !MARKDOWN_MIDDLE_KEYCHARS_SET.is_match(input) {
+        return input.to_string();
+    }
+
+    let mut output = String::new();
+    let mut chars = input.chars().peekable();
+
+    while let Some(ch) = chars.next() {
+        if ch == '&' {
+            let mut entity = String::new();
+            entity.push(ch);
+            while let Some(&next_ch) = chars.peek() {
+                entity.push(next_ch);
+                chars.next();
+                if entity == "&nbsp;" {
+                    entity.clear(); // discard &nbsp;
+                    break;
+                } else if next_ch == ';' || entity.len() > 6 {
+                    output.push_str(&entity);
+                    break;
+                }
+            }
+            if !entity.is_empty() {
+                output.push_str(&entity);
+            }
+        } else if "<>*\\_~".contains(ch) {
+            output.push('\\');
+            output.push(ch);
+        } else {
+            output.push(ch);
+        }
+    }
+
+    output
+}
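For orientation, a minimal usage sketch of the new feature-gated entry point (assumes the crate is built with `--features tokio`; the HTML literal is illustrative):

```
use html2md::rewrite_html_streaming;

#[tokio::main]
async fn main() {
    let html = "<h1>Title</h1><p>Hello <em>world</em>.</p>";
    // Runs the Send-capable rewriter and awaits the converted Markdown.
    let md = rewrite_html_streaming(html, false).await;
    assert!(!md.is_empty());
    println!("{md}");
}
```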
diff --git a/fast_html2md/src/rewriter/anchors.rs b/fast_html2md/src/rewriter/anchors.rs
index 7a48198..f5a804e 100644
--- a/fast_html2md/src/rewriter/anchors.rs
+++ b/fast_html2md/src/rewriter/anchors.rs
@@ -38,3 +38,39 @@ pub(crate) fn rewrite_anchor_element(
     }
     Ok(())
 }
+
+/// Rewrite the anchor.
+pub(crate) fn rewrite_anchor_element_send(
+    el: &mut lol_html::send::Element,
+    _commonmark: bool,
+    url: &Option<Url>,
+) -> Result<(), std::io::Error> {
+    if let Some(href) = el.get_attribute("href") {
+        let decoded_url: Cow<'_, str> = percent_decode_str(&href).decode_utf8_lossy();
+
+        let resolved_url = if decoded_url.starts_with('/') {
+            match &url {
+                Some(url) => {
+                    if let Ok(u) = url.join(&decoded_url) {
+                        u.to_string()
+                    } else {
+                        decoded_url.to_string()
+                    }
+                }
+                None => decoded_url.to_string(),
+            }
+        } else {
+            decoded_url.to_string()
+        };
+
+        let markdown_url = if resolved_url.contains(|c: char| c.is_ascii_control() || c == ' ') {
+            Cow::Owned(format!("<{}>", resolved_url))
+        } else {
+            Cow::Borrowed(&resolved_url)
+        };
+
+        el.before("[", Html);
+        el.after(&format!("]({})", markdown_url), Html);
+    }
+    Ok(())
+}
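A small sketch of the `url::Url::join` behavior the send anchor handler relies on for site-relative hrefs (values are illustrative):

```
use url::Url;

fn main() {
    let base = Url::parse("https://example.com/docs/page").unwrap();
    // A leading "/" resolves against the host root, which is what the
    // handler does before emitting the Markdown link target.
    assert_eq!(
        base.join("/wiki/Cat").unwrap().as_str(),
        "https://example.com/wiki/Cat"
    );
}
```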
diff --git a/fast_html2md/src/rewriter/handle.rs b/fast_html2md/src/rewriter/handle.rs
index 09f6484..d91b123 100644
--- a/fast_html2md/src/rewriter/handle.rs
+++ b/fast_html2md/src/rewriter/handle.rs
@@ -1,29 +1,20 @@
-use super::anchors::rewrite_anchor_element;
-use super::iframes::handle_iframe;
-use super::images::rewrite_image_element;
-use super::lists::handle_list_or_item;
-use super::quotes::rewrite_blockquote_element;
-use super::styles::rewrite_style_element;
+use super::anchors::{rewrite_anchor_element, rewrite_anchor_element_send};
+use super::iframes::{handle_iframe, handle_iframe_send};
+use super::images::{rewrite_image_element, rewrite_image_element_send};
+use super::lists::{handle_list_or_item, handle_list_or_item_send};
+use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_element_send};
+use super::styles::{rewrite_style_element, rewrite_style_element_send};
+use super::{
+    insert_newline_after, insert_newline_after_send, insert_newline_before,
+    insert_newline_before_send,
+};
 use lol_html::html_content::ContentType::{Html, Text};
 use lol_html::html_content::Element;
-use lol_html::{doc_comments, doctype, text};
-use lol_html::{element, RewriteStrSettings};
 use std::cell::RefCell;
 use std::rc::Rc;
+use std::sync::{Arc, RwLock};
 use url::Url;
 
-/// Insert a new line after
-#[inline]
-pub fn insert_newline_after(element: &mut Element) {
-    element.after("\n", Text);
-}
-
-/// Insert a new line before
-#[inline]
-pub fn insert_newline_before(element: &mut Element) {
-    element.before("\n", Text);
-}
-
 /// Handle the lol_html tag.
 #[inline]
 pub fn handle_tag(
@@ -104,18 +95,17 @@ pub fn handle_tag(
             insert_newline_after(element);
         }
         "th" => {
+            // add the first table row start
+            if *inside_table.borrow() {
+                element.before("|", Html);
+                *inside_table.borrow_mut() = false;
+            }
             if commonmark {
                 element.before("** ", Html);
                 element.after("** |", Html);
             } else {
                 element.after("|", Html);
             }
-
-            // add the first table row start
-            if *inside_table.borrow() {
-                element.before("|", Html);
-                *inside_table.borrow_mut() = false;
-            }
         }
         "td" => {
             element.after("|", Html);
@@ -149,3 +139,134 @@
 
     Ok(())
 }
+
+/// Handle the lol_html tag.
+#[inline]
+pub fn handle_tag_send(
+    element: &mut lol_html::send::Element,
+    commonmark: bool,
+    url: &Option<Url>,
+    list_type: Arc<RwLock<Option<String>>>,
+    order_counter: Arc<RwLock<usize>>,
+    quote_depth: Arc<RwLock<usize>>,
+    inside_table: Arc<RwLock<bool>>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    let element_name = element.tag_name();
+
+    let remove_attrs =
+        commonmark && (element_name.as_str() == "sub" || element_name.as_str() == "sup");
+
+    // For commonmark sub/sup keep the tag but strip its attributes; otherwise drop the tag and keep its content.
+    if remove_attrs {
+        let attrs = element
+            .attributes()
+            .iter()
+            .map(|f| f.name())
+            .collect::<Vec<_>>();
+
+        for attr in attrs.iter() {
+            element.remove_attribute(&attr);
+        }
+    } else {
+        element.remove_and_keep_content();
+    }
+
+    // Add the markdown equivalents before the element.
+    match element_name.as_str() {
+        "h1" => {
+            element.before("# ", Text);
+            insert_newline_after_send(element);
+        }
+        "h2" => {
+            element.before("## ", Text);
+            insert_newline_after_send(element);
+        }
+        "h3" => {
+            element.before("### ", Text);
+            insert_newline_after_send(element);
+        }
+        "h4" => {
+            element.before("#### ", Text);
+            insert_newline_after_send(element);
+        }
+        "h5" => {
+            element.before("##### ", Text);
+            insert_newline_after_send(element);
+        }
+        "h6" => {
+            element.before("###### ", Text);
+            insert_newline_after_send(element);
+        }
+        "p" => {
+            insert_newline_before_send(element);
+            insert_newline_after_send(element);
+        }
+        "hr" => {
+            insert_newline_before_send(element);
+            element.append("---", Text);
+            insert_newline_after_send(element);
+        }
+        "br" => insert_newline_after_send(element),
+        "a" => {
+            let _ = rewrite_anchor_element_send(element, commonmark, url);
+        }
+        "img" => {
+            let _ = rewrite_image_element_send(element, commonmark, &url);
+        }
+        "table" => {
+            if let Ok(mut d) = inside_table.write() {
+                *d = true
+            }
+        }
+        "tr" => {
+            insert_newline_after_send(element);
+        }
+        "th" => {
+            if let Ok(inside) = inside_table.read() {
+                if *inside {
+                    drop(inside);
+                    element.before("|", Html);
+                    if let Ok(mut d) = inside_table.write() {
+                        *d = false
+                    }
+                }
+            }
+            if commonmark {
+                element.before("** ", Html);
+                element.after("** |", Html);
+            } else {
+                element.after("|", Html);
+            }
+        }
+        "td" => {
+            element.after("|", Html);
+        }
+        "iframe" => {
+            let _ = handle_iframe_send(element);
+        }
+        "b" | "i" | "s" | "strong" | "em" | "del" => {
+            let _ = rewrite_style_element_send(element);
+        }
+        "ol" | "ul" | "menu" | "li" => {
+            let _ = handle_list_or_item_send(element, list_type.clone(), order_counter.clone());
+        }
+        "q" | "cite" | "blockquote" => {
+            let _ = rewrite_blockquote_element_send(element, quote_depth.clone());
+        }
+        "div" | "section" | "header" | "footer" => {
+            insert_newline_before_send(element);
+            insert_newline_after_send(element);
+        }
+        "pre" => {
+            element.before("\n```\n", Html);
+            element.after("\n```\n", Html);
+        }
+        "code" | "samp" => {
+            element.before("`", Html);
+            element.after("`", Html);
+        }
+        _ => (),
+    }
+
+    Ok(())
+}
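The `_send` handlers swap the sync state cells (`Rc<RefCell<…>>`) for `Arc<RwLock<…>>` so the closures can satisfy the `Send` bounds of `lol_html::send`. A minimal sketch of that shared-state pattern in isolation:

```
use std::sync::{Arc, RwLock};

fn main() {
    // Same shape as the inside_table flag used by handle_tag_send.
    let inside_table = Arc::new(RwLock::new(false));

    let flag = inside_table.clone();
    let worker = std::thread::spawn(move || {
        if let Ok(mut v) = flag.write() {
            *v = true; // a Send-capable handler may mutate from another thread
        }
    });

    worker.join().unwrap();
    assert!(*inside_table.read().unwrap());
}
```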
diff --git a/fast_html2md/src/rewriter/iframes.rs b/fast_html2md/src/rewriter/iframes.rs
index 36beaab..efc0a31 100644
--- a/fast_html2md/src/rewriter/iframes.rs
+++ b/fast_html2md/src/rewriter/iframes.rs
@@ -38,3 +38,40 @@
 
     Ok(())
 }
+
+/// Handle the conversion of iframes.
+pub(crate) fn handle_iframe_send(
+    element: &mut lol_html::send::Element,
+) -> Result<(), Box<dyn std::error::Error>> {
+    if let Some(src) = element.get_attribute("src") {
+        if let Some(capture) = YOUTUBE_PATTERN.captures(&src) {
+            let media_id = capture.get(1).map_or("", |m| m.as_str());
+            element.replace(
+                &format!("[![Embedded YouTube video](https://img.youtube.com/vi/{}/0.jpg)](https://www.youtube.com/watch?v={})", media_id, media_id),
+                Text
+            );
+            return Ok(());
+        }
+
+        if let Some(capture) = INSTAGRAM_PATTERN.captures(&src) {
+            let media_id = capture.get(1).map_or("", |m| m.as_str());
+            element.replace(
+                &format!("[![Embedded Instagram post](https://www.instagram.com/p/{}/media/?size=m)](https://www.instagram.com/p/{}/embed/)", media_id, media_id),
+                Text
+            );
+            return Ok(());
+        }
+
+        if let Some(capture) = VK_PATTERN.captures(&src) {
+            let owner_id = capture.get(1).map_or("", |m| m.as_str());
+            let video_id = capture.get(2).map_or("", |m| m.as_str());
+            element.replace(
+                &format!("[![Embedded VK video](https://st.vk.com/images/icons/video_empty_2x.png)](https://vk.com/video{oid}_{vid})", oid = owner_id, vid = video_id),
+                Text,
+            );
+            return Ok(());
+        }
+    }
+
+    Ok(())
+}
diff --git a/fast_html2md/src/rewriter/images.rs b/fast_html2md/src/rewriter/images.rs
index ffd181d..1eaab85 100644
--- a/fast_html2md/src/rewriter/images.rs
+++ b/fast_html2md/src/rewriter/images.rs
@@ -71,3 +71,71 @@
 
     Ok(())
 }
+
+/// Rewrite the image.
+pub(crate) fn rewrite_image_element_send(
+    el: &mut lol_html::send::Element,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> Result<(), std::io::Error> {
+    let src = el.get_attribute("src").unwrap_or_default();
+    let alt = el.get_attribute("alt").unwrap_or_default();
+    let title = el.get_attribute("title").unwrap_or_else(|| "".to_string());
+
+    let height = el.get_attribute("height");
+    let width = el.get_attribute("width");
+    let align = el.get_attribute("align");
+
+    if commonmark && (height.is_some() || width.is_some() || align.is_some()) {
+        let mut img_tag = format!("<img src=\"{}\" alt=\"{}\"", src, alt);
+
+        if !title.is_empty() {
+            img_tag.push_str(&format!(" title=\"{}\"", title));
+        }
+        if let Some(ref height) = height {
+            img_tag.push_str(&format!(" height=\"{}\"", height));
+        }
+        if let Some(ref width) = width {
+            img_tag.push_str(&format!(" width=\"{}\"", width));
+        }
+        if let Some(ref align) = align {
+            img_tag.push_str(&format!(" align=\"{}\"", align));
+        }
+        img_tag.push_str(" />");
+
+        el.set_inner_content(&img_tag, lol_html::html_content::ContentType::Html);
+    } else {
+        let mut img_url = if src.contains(' ') {
+            utf8_percent_encode(&src, FRAGMENT).to_string()
+        } else {
+            src.clone()
+        };
+
+        if img_url.starts_with('/') {
+            if let Some(ref u) = url {
+                if let Ok(n) = u.join(&img_url) {
+                    img_url = n.to_string();
+                }
+            }
+        }
+
+        el.replace(
+            &format!(
+                "![{}]({}{})",
+                alt,
+                img_url,
+                if !title.is_empty() {
+                    format!(" \"{}\"", title)
+                } else {
+                    "".to_string()
+                }
+            ),
+            lol_html::html_content::ContentType::Html,
+        );
+    }
+
+    Ok(())
+}
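The space-containing `src` branch above uses the `percent-encoding` crate; a standalone sketch of that step (the exact `FRAGMENT` set defined in `images.rs` may differ from this assumed one):

```
use percent_encoding::{utf8_percent_encode, AsciiSet, CONTROLS};

// Illustrative set: encode spaces and a few characters unsafe in Markdown URLs.
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

fn main() {
    let src = "/img/cat photo.png";
    assert_eq!(
        utf8_percent_encode(src, FRAGMENT).to_string(),
        "/img/cat%20photo.png"
    );
}
```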
diff --git a/fast_html2md/src/rewriter/lists.rs b/fast_html2md/src/rewriter/lists.rs
index ec62fde..da12f18 100644
--- a/fast_html2md/src/rewriter/lists.rs
+++ b/fast_html2md/src/rewriter/lists.rs
@@ -3,6 +3,8 @@
 use lol_html::html_content::ContentType;
 use lol_html::html_content::Element;
 use std::cell::RefCell;
 use std::rc::Rc;
+use std::sync::Arc;
+use std::sync::RwLock;
 
 // Function to handle list elements and items
 #[inline]
@@ -33,3 +35,52 @@
 
     Ok(())
 }
+
+// Function to handle list elements and items
+#[inline]
+pub(crate) fn handle_list_or_item_send(
+    element: &mut lol_html::send::Element,
+    list_type: Arc<RwLock<Option<String>>>,
+    order_counter: Arc<RwLock<usize>>,
+) -> Result<(), Box<dyn std::error::Error>> {
+    match element.tag_name().as_str() {
+        "ul" | "menu" => {
+            if let Ok(mut list_type) = list_type.write() {
+                *list_type = Some("ul".to_string());
+            }
+            if let Ok(mut order_counter) = order_counter.write() {
+                order_counter.reset();
+            }
+        }
+        "ol" => {
+            if let Ok(mut list_type) = list_type.write() {
+                *list_type = Some("ol".to_string());
+            }
+            if let Ok(mut order_counter) = order_counter.write() {
+                order_counter.reset();
+            }
+        }
+        "li" => {
+            let ordered: bool = if let Ok(list_type) = list_type.read() {
+                list_type.as_deref().eq(&Some("ol"))
+            } else {
+                false
+            };
+
+            if ordered {
+                let order = if let Ok(mut order_counter) = order_counter.write() {
+                    order_counter.increment()
+                } else {
+                    0
+                };
+
+                element.before(&format!("\n{}. ", order), ContentType::Text);
+            } else {
+                element.before("\n* ", ContentType::Text);
+            }
+        }
+        _ => (),
+    }
+
+    Ok(())
+}
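Expected effect of the list handlers through the public sync API, as a hedged sketch (exact whitespace in the output may differ):

```
fn main() {
    // Ordered items should come out numbered, unordered items bulleted.
    let md = html2md::rewrite_html("<ol><li>first</li><li>second</li></ol>", false);
    assert!(md.contains("1. first"));
    assert!(md.contains("2. second"));

    let md = html2md::rewrite_html("<ul><li>point</li></ul>", false);
    assert!(md.contains("* point"));
}
```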
diff --git a/fast_html2md/src/rewriter/mod.rs b/fast_html2md/src/rewriter/mod.rs
index 1a15ca5..e73ccfe 100644
--- a/fast_html2md/src/rewriter/mod.rs
+++ b/fast_html2md/src/rewriter/mod.rs
@@ -6,5 +6,28 @@
 pub(crate) mod images;
 pub(crate) mod lists;
 pub(crate) mod quotes;
 pub(crate) mod styles;
-
 pub mod writer;
+
+/// Insert a new line after
+#[inline]
+pub(crate) fn insert_newline_after(element: &mut lol_html::html_content::Element) {
+    element.after("\n", lol_html::html_content::ContentType::Text);
+}
+
+/// Insert a new line before
+#[inline]
+pub(crate) fn insert_newline_before(element: &mut lol_html::html_content::Element) {
+    element.before("\n", lol_html::html_content::ContentType::Text);
+}
+
+/// Insert a new line after
+#[inline]
+pub(crate) fn insert_newline_after_send(element: &mut lol_html::send::Element) {
+    element.after("\n", lol_html::html_content::ContentType::Text);
+}
+
+/// Insert a new line before
+#[inline]
+pub(crate) fn insert_newline_before_send(element: &mut lol_html::send::Element) {
+    element.before("\n", lol_html::html_content::ContentType::Text);
+}
diff --git a/fast_html2md/src/rewriter/quotes.rs b/fast_html2md/src/rewriter/quotes.rs
index 20f52bf..4f7a4b8 100644
--- a/fast_html2md/src/rewriter/quotes.rs
+++ b/fast_html2md/src/rewriter/quotes.rs
@@ -1,6 +1,7 @@
 use crate::rewriter::counter::Counter;
 use lol_html::html_content::{ContentType, Element, TextChunk};
 use std::error::Error;
+use std::sync::{Arc, RwLock};
 use std::{cell::RefCell, rc::Rc};
 
 // Function to handle <blockquote> elements
@@ -23,6 +24,30 @@ pub(crate) fn rewrite_blockquote_element(
     Ok(())
 }
 
+// Function to handle <blockquote> elements (send)
+pub(crate) fn rewrite_blockquote_element_send(
+    el: &mut lol_html::send::Element,
+    quote_depth: Arc<RwLock<usize>>,
+) -> Result<(), Box<dyn Error>> {
+    if let Ok(mut quote_depth) = quote_depth.write() {
+        quote_depth.increment();
+    }
+
+    if let Some(end_tag_handlers) = el.end_tag_handlers() {
+        end_tag_handlers.push(Box::new({
+            let quote_depth = quote_depth.clone();
+            move |_end| {
+                if let Ok(mut quote_depth) = quote_depth.write() {
+                    quote_depth.decrement();
+                }
+                Ok(())
+            }
+        }));
+    }
+
+    Ok(())
+}
+
 // Function to handle text within <blockquote> elements
 pub(crate) fn rewrite_blockquote_text(
     text_chunk: &mut TextChunk<'_>,
@@ -56,3 +81,40 @@
 
     Ok(())
 }
+
+// Function to handle text within <blockquote> elements (send)
+pub(crate) fn rewrite_blockquote_text_send(
+    text_chunk: &mut TextChunk<'_>,
+    quote_depth: Arc<RwLock<usize>>,
+) -> Result<(), Box<dyn Error>> {
+    let depth = match quote_depth.read() {
+        Ok(d) => *d,
+        _ => 0,
+    };
+    let quote_prefix = "> ".repeat(depth);
+    let lines: Vec<&str> = text_chunk.as_str().lines().collect();
+    let total_lines = lines.len();
+
+    let last = text_chunk.last_in_text_node();
+
+    let modified_text = lines
+        .iter()
+        .enumerate()
+        .map(|(i, line)| {
+            if i >= 1 && i == total_lines - 1 {
+                format!("{}", line)
+            } else {
+                format!("{}{}", quote_prefix, line)
+            }
+        })
+        .collect::<Vec<String>>()
+        .join("");
+
+    text_chunk.replace(&modified_text, ContentType::Html);
+
+    if last {
+        text_chunk.after("\n", ContentType::Text);
+    }
+
+    Ok(())
+}
diff --git a/fast_html2md/src/rewriter/styles.rs b/fast_html2md/src/rewriter/styles.rs
index e2c3b63..a6d00ea 100644
--- a/fast_html2md/src/rewriter/styles.rs
+++ b/fast_html2md/src/rewriter/styles.rs
@@ -17,3 +17,23 @@ pub(crate) fn rewrite_style_element(el: &mut Element) -> Result<(), std::io::Error> {
 
     Ok(())
 }
+
+/// Rewrite the initial elements that need extra styles.
+pub(crate) fn rewrite_style_element_send(
+    el: &mut lol_html::send::Element,
+) -> Result<(), std::io::Error> {
+    let tag_name = el.tag_name();
+
+    let mark = match tag_name.as_str() {
+        "b" | "strong" => "**",
+        "i" | "em" => "*",
+        "s" | "del" => "~~",
+        "u" | "ins" => "__",
+        _ => return Ok(()), // Return early if tag is not one of the specified
+    };
+
+    el.before(mark, Text);
+    el.after(mark, Text);
+
+    Ok(())
+}
diff --git a/fast_html2md/src/rewriter/writer.rs b/fast_html2md/src/rewriter/writer.rs
index bf21986..4fe0158 100644
--- a/fast_html2md/src/rewriter/writer.rs
+++ b/fast_html2md/src/rewriter/writer.rs
@@ -1,66 +1,16 @@
 use super::handle::handle_tag;
 use super::quotes::rewrite_blockquote_text;
 use crate::clean_markdown_bytes;
-use lol_html::html_content::ContentType::Text;
-use lol_html::html_content::Element;
+use crate::rewriter::handle::handle_tag_send;
+use crate::rewriter::quotes::rewrite_blockquote_text_send;
 use lol_html::{doc_comments, doctype, text};
 use lol_html::{element, RewriteStrSettings};
 use std::cell::RefCell;
 use std::rc::Rc;
+use std::sync::RwLock;
+use std::sync::{Arc, Mutex};
 use url::Url;
 
-/// Insert a new line after
-#[inline]
-pub fn insert_newline_after(element: &mut Element) {
-    element.after("\n", Text);
-}
-
-/// Insert a new line before
-#[inline]
-pub fn insert_newline_before(element: &mut Element) {
-    element.before("\n", Text);
-}
-
-/// Replace the markdown chars cleanly.
-fn replace_markdown_chars(input: &str) -> String {
-    use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;
-
-    if !MARKDOWN_MIDDLE_KEYCHARS_SET.is_match(input) {
-        return input.to_string();
-    }
-
-    let mut output = String::new();
-    let mut chars = input.chars().peekable();
-
-    while let Some(ch) = chars.next() {
-        if ch == '&' {
-            let mut entity = String::new();
-            entity.push(ch);
-            while let Some(&next_ch) = chars.peek() {
-                entity.push(next_ch);
-                chars.next();
-                if entity == "&nbsp;" {
-                    entity.clear(); // discard &nbsp;
-                    break;
-                } else if next_ch == ';' || entity.len() > 6 {
-                    output.push_str(&entity);
-                    break;
-                }
-            }
-            if !entity.is_empty() {
-                output.push_str(&entity);
-            }
-        } else if "<>*\\_~".contains(ch) {
-            output.push('\\');
-            output.push(ch);
-        } else {
-            output.push(ch);
-        }
-    }
-
-    output
-}
-
 /// Get the HTML rewriter settings to convert to markdown.
 pub fn get_rewriter_settings(
     commonmark: bool,
@@ -87,7 +37,7 @@
     element_content_handlers.push(text!(
         "*:not(script):not(head):not(style):not(svg)",
         move |el| {
-            *el.as_mut_str() = replace_markdown_chars(el.as_str().trim().into());
+            *el.as_mut_str() = crate::replace_markdown_chars(el.as_str().trim().into());
             Ok(())
         }
     ));
@@ -138,6 +88,101 @@
     }
 }
 
+/// Get the HTML rewriter settings to convert to markdown (Send).
+pub fn get_rewriter_settings_send(
+    commonmark: bool,
+    custom: &Option<std::collections::HashSet<String>>,
+    url: Option<Url>,
+) -> lol_html::send::Settings<'static, 'static> {
+    let list_type = Arc::new(RwLock::new(None::<String>));
+    let order_counter = Arc::new(RwLock::new(0));
+    let quote_depth = Arc::new(RwLock::new(0));
+    let inside_table = Arc::new(RwLock::new(false));
+
+    let quote_depth1 = quote_depth.clone();
+
+    let mut element_content_handlers = Vec::with_capacity(
+        4 + custom
+            .as_ref()
+            .map_or(0, |c| if c.is_empty() { 0 } else { 1 }),
+    );
+
+    element_content_handlers.push(text!("blockquote, q, cite", move |el| {
+        let _ = rewrite_blockquote_text_send(el, quote_depth.clone());
+        Ok(())
+    }));
+
+    element_content_handlers.push(text!(
+        "*:not(script):not(head):not(style):not(svg)",
+        move |el| {
+            *el.as_mut_str() = crate::replace_markdown_chars(el.as_str().trim().into());
+            Ok(())
+        }
+    ));
+
+    element_content_handlers.push(element!("head, nav, script, noscript, style", |el| {
+        el.remove();
+        Ok(())
+    }));
+
+    element_content_handlers.push(element!("*", move |el| {
+        let _ = handle_tag_send(
+            el,
+            commonmark,
+            &url,
+            list_type.clone(),
+            order_counter.clone(),
+            quote_depth1.clone(),
+            inside_table.clone(),
+        );
+        Ok(())
+    }));
+
+    if let Some(ignore) = custom {
+        let ignore_handler = element!(
+            ignore.iter().cloned().collect::<Vec<String>>().join(","),
+            |el| {
+                el.remove();
+                Ok(())
+            }
+        );
+
+        element_content_handlers.push(ignore_handler);
+    }
+
+    lol_html::send::Settings {
+        document_content_handlers: vec![
+            doc_comments!(|c| {
+                c.remove();
+                Ok(())
+            }),
+            doctype!(|c| {
+                c.remove();
+                Ok(())
+            }),
+        ],
+        element_content_handlers,
+        ..lol_html::send::Settings::new_send()
+    }
+}
+
+/// Shortcut to rewrite string and encode correctly
+pub(crate) fn rewrite_str<'h, 's, H: lol_html::HandlerTypes>(
+    html: &str,
+    settings: impl Into<lol_html::Settings<'h, 's, H>>,
+) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
+    let mut output = vec![];
+
+    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |c: &[u8]| {
+        output.extend_from_slice(c);
+    });
+
+    rewriter.write(html.as_bytes())?;
+    rewriter.end()?;
+
+    Ok(output)
+}
+
 /// Convert to markdown streaming re-writer
 pub(crate) fn convert_html_to_markdown(
     html: &str,
@@ -153,19 +198,61 @@
     }
 }
 
-/// Shortcut to rewrite string and encode correctly
-pub fn rewrite_str<'h, 's, H: lol_html::HandlerTypes>(
+/// Convert to markdown streaming re-writer with chunk size.
+#[cfg(feature = "tokio")]
+pub async fn convert_html_to_markdown_send_with_size(
     html: &str,
-    settings: impl Into<lol_html::Settings<'h, 's, H>>,
-) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
-    let mut output = vec![];
+    custom: &Option<std::collections::HashSet<String>>,
+    commonmark: bool,
+    url: &Option<Url>,
+    chunk_size: usize,
+) -> Result<String, Box<dyn std::error::Error>> {
+    use tokio_stream::StreamExt;
+    let settings = get_rewriter_settings_send(commonmark, custom, url.clone());
+    let (txx, mut rxx) = tokio::sync::mpsc::unbounded_channel();
 
-    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |c: &[u8]| {
-        output.extend_from_slice(c);
+    let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), |c: &[u8]| {
+        let _ = txx.send(c.to_vec());
     });
 
-    rewriter.write(html.as_bytes())?;
-    rewriter.end()?;
+    let html_bytes = html.as_bytes();
+    let chunks = html_bytes.chunks(chunk_size);
 
-    Ok(output)
+    let mut stream = tokio_stream::iter(chunks).map(Ok::<&[u8], ()>);
+
+    let mut wrote_error = false;
+
+    while let Some(chunk) = stream.next().await {
+        if let Ok(chunk) = chunk {
+            if rewriter.write(chunk).is_err() {
+                wrote_error = true;
+                break;
+            }
+        }
+    }
+
+    if !wrote_error {
+        let _ = rewriter.end();
+    } else {
+        // Consume the rewriter so its sink no longer borrows the sender.
+        drop(rewriter);
+    }
+
+    drop(txx);
+
+    let mut rewritten_bytes: Vec<u8> = Vec::new();
+
+    while let Some(c) = rxx.recv().await {
+        rewritten_bytes.extend_from_slice(&c);
+    }
+
+    Ok(clean_markdown_bytes(&rewritten_bytes))
+}
+
+/// Convert to markdown streaming re-writer
+#[cfg(feature = "tokio")]
+pub async fn convert_html_to_markdown_send(
+    html: &str,
+    custom: &Option<std::collections::HashSet<String>>,
+    commonmark: bool,
+    url: &Option<Url>,
+) -> Result<String, Box<dyn std::error::Error>> {
+    convert_html_to_markdown_send_with_size(html, custom, commonmark, url, 8192).await
+}
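A hedged usage sketch of the chunked conversion exposed over this writer (file path, base URL, and chunk size are illustrative; assumes the `tokio` feature):

```
use html2md::rewrite_html_custom_with_url_and_chunk;
use url::Url;

#[tokio::main]
async fn main() {
    let html = std::fs::read_to_string("page.html").expect("readable file");
    let base = Url::parse("https://example.com").ok();

    // Feed the rewriter in 16 KiB chunks; relative links resolve against `base`.
    let md = rewrite_html_custom_with_url_and_chunk(&html, &None, false, &base, 16 * 1024).await;
    println!("{md}");
}
```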
diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs
index 75d31c2..ab7e9d0 100644
--- a/fast_html2md/tests/integration.rs
+++ b/fast_html2md/tests/integration.rs
@@ -2,7 +2,7 @@
 extern crate spectral;
 
 // use html2md::ignore::IgnoreTagFactory;
 // use html2md::{parse_html, parse_html_custom, parse_html_custom_with_url};
-use html2md::{parse_html, rewrite_html};
+use html2md::{parse_html, rewrite_html, rewrite_html_streaming};
 use indoc::indoc;
 use spectral::prelude::*;
 use std::collections::HashMap;
@@ -62,6 +62,86 @@ fn test_real_world_wiki() -> Result<(), Box<dyn std::error::Error>> {
     Ok(())
 }
 
+#[test]
+fn test_real_world_wiki_rewriter() -> Result<(), Box<dyn std::error::Error>> {
+    use std::error::Error;
+    use std::fs::{self, File};
+    use std::io::{self, Read};
+    use std::path::Path;
+
+    let paths = fs::read_dir("../test-samples/wiki")?;
+
+    fn run_parse(path: &Path) -> Result<(), Box<dyn Error>> {
+        let mut html = String::new();
+        let mut html_file = File::open(path)?;
+        html_file.read_to_string(&mut html)?;
+
+        let result = rewrite_html(&html, false);
+
+        if result.is_empty() {
+            Err(Box::new(io::Error::new(
+                io::ErrorKind::Other,
+                "Result is empty",
+            )))
+        } else {
+            Ok(())
+        }
+    }
+
+    for entry in paths {
+        let path = entry?.path();
+
+        if path.is_file() {
+            match run_parse(&path) {
+                Ok(_) => assert!(true),
+                Err(_e) => assert!(false),
+            }
+        }
+    }
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn test_real_world_wiki_async() -> Result<(), Box<dyn std::error::Error>> {
+    use std::error::Error;
+    use std::fs::{self, File};
+    use std::io::{self, Read};
+    use std::path::Path;
+
+    let paths = fs::read_dir("../test-samples/wiki")?;
+
+    async fn run_parse(path: &Path) -> Result<(), Box<dyn Error>> {
+        let mut html = String::new();
+        let mut html_file = File::open(path)?;
+        html_file.read_to_string(&mut html)?;
+
+        let result = rewrite_html_streaming(&html, false).await;
+
+        if result.is_empty() {
+            Err(Box::new(io::Error::new(
io::ErrorKind::Other, + "Result is empty", + ))) + } else { + Ok(()) + } + } + + for entry in paths { + let path = entry?.path(); + + if path.is_file() { + match run_parse(&path).await { + Ok(_) => assert!(true), + Err(_e) => assert!(false), + } + } + } + + Ok(()) +} + #[test] #[ignore] fn test_real_world_ja() { @@ -74,21 +154,6 @@ fn test_real_world_ja() { assert!(!result.is_empty()); } -#[test] -#[ignore] -fn test_real_spider() { - let mut html = String::new(); - let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap(); - html_file - .read_to_string(&mut html) - .expect("File must be readable"); - let result = rewrite_html(&html, false); - assert!( - result - == r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits)\n# The Web Crawler for AI Agents and LLMs\nSpider offers the finest data collecting solution. Engineered for speed and scalability, it\nallows you to elevate your AI projects.\n[Get Started](https://spider.cloud/credits/new)View Preview\n* Basic\n* Streaming\nExample request\nPython\nJSONL\nCopy\n```\n`import requests, os, json\nheaders = {\n''Authorization '': f ''Bearer {os.getenv(""SPIDER\\_API\\_KEY "")}'',\n''Content-Type '': ''application/jsonl '',\n}\njson\\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""}\nresponse = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\\_data, stream=True)\nwith response as r:\nr.raise\\_for\\_status()\nfor chunk in r.iter\\_lines(\nchunk\\_size=None, decode\\_unicode=True\n):\ndata = json.loads(chunk)\nprint(data)`\n```\n[Free Trial](https://spider.cloud/credits/new?free-trial=1)\nExample Response\n## Built with the need for**Speed**\nExperience the power of**Spider**, built fully in**Rust**for\nnext-generation scalability.\n### 2.4secs\nTo crawl over 20,000 pages\n### 500-1000x\nFaster than alternatives\n### 500x\nCheaper than traditional scraping services\nBenchmarks displaying performance between Spider API request modes.\nSpider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024\n[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md)\n### Seamless Integrations\nSeamlessly integrate Spider with a wide range of platforms, ensuring data curation\nperfectly aligned with your requirements. Compatible with all major AI tools.\n[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider)\n### Concurrent Streaming\nSave time and money without having to worry about bandwidth concerns by effectively\nstreaming all the results concurrently. The latency cost that is saved becomes drastic as\nyou crawl more websites.\n### Warp Speed\nPowered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme\nworkloads. 
We ensure continuous maintenance and improvement for top-tier performance.\n## Kickstart Your Data Collecting Projects Today\nJumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping.\n### Performance Tuned\nSpider is written in Rust and runs in full concurrency to achieve crawling thousands of\npages in secs.\n### Multiple response formats\nGet clean and formatted markdown, HTML, or text content for fine-tuning or training AI\nmodels.\n### Caching\nFurther boost speed by caching repeated web page crawls to minimize expenses while\nbuilding.\n### Smart Mode\nSpider dynamically switches to Headless Chrome when it needs to quick.\nBeta\n### Scrape with AI\nDo custom browser scripting and data extraction using the latest AI models with no cost\nstep caching.\n### The crawler for LLMs\nDon't let crawling and scraping be the highest latency in your LLM & AI agent stack.\n### Scrape with no headaches\n* Auto Proxy rotations\n* Agent headers\n* Anti-bot detections\n* Headless chrome\n* Markdown responses\n### The Fastest Web Crawler\n* Powered by[spider-rs](https://github.com/spider-rs/spider)\n* 100,000 pages/seconds\n* Unlimited concurrency\n* Simple API\n* 50,000 RPM\n### Do more with AI\n* Browser scripting\n* Advanced extraction\n* Data pipelines\n* Ideal for LLMs and AI Agents\n* Accurate labeling\n## Achieve more with these new API features\nOur API is set to stream so you can act in realtime.\n![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp)\n### Search\nGet access to search engine results from anywhere and easily crawl and transform pages to\nLLM-ready markdown.\n[Explore SearchRight Arrow](https://spider.cloud/docs/api#search)\n![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp)\n### Transform\nConvert raw HTML into markdown easily by using this API. Transform thousands of html pages\nin seconds.\n[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform)\n## Join the community\nBacked by a network of early advocates, contributors, and supporters.\n[GitHub discussions\nChat Icon\n](https://github.com/orgs/spider-rs/discussions)[Discord\nChat Icon\n](https://discord.spider.cloud)\n[\n![iammerrick's avatar](/img/external/iammerrick_twitter.webp)\n@iammerrick\nRust based crawler Spider is next level for crawling &scraping sites. So fast.\nTheir cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider\n](https://twitter.com/iammerrick/status/1787873425446572462)\n[\n![WilliamEspegren's avatar](/img/external/william_twitter.webp)\n@WilliamEspegren\nWeb crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor\nName a reason for me to use anything else?\ngithub.com/spider-rs/spid…\n](https://twitter.com/WilliamEspegren/status/1789419820821184764)\n[\n![gasa's avatar](/img/external/gaza_twitter.webp)\n@gasa\n@gasathenaper\nis the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. 
Spider does it in an instant\n](https://x.com/gasathenaper/status/1810612492596383948)\n[\n![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp)\n@Ashpreet Bedi\n@ashpreetbedi\nis THE best crawler out there, give it a try\n](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw)\n[\n![Troyusrex's avatar](/img/external/troy_twitter.webp)\n@Troyusrex\nI found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week.\n](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326)\n[\n![Dify.AI's avatar](/img/external/difyai.webp)\n@Dify.AI\n🕷\u{fe0f}Spider @spider\\_rust\ncan be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context.\n](https://x.com/dify_ai/status/1818226971056243089)\n## FAQ\nFrequently asked questions about Spider.\n### What is Spider?\nSpider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown.\n### Why is my website not crawling?\nYour crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue.\n### Can you crawl all pages?\nYes, Spider accurately crawls all necessary content without needing a sitemap.\n### What formats can Spider convert web data into?\nSpider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses.\n### Is Spider suitable for large scraping projects?\nAbsolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management.\n### How can I try Spider?\nPurchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities.\n### Does it respect robots.txt?\nYes, compliance with robots.txt is default, but you can disable this if necessary.\n### Unable to get dynamic content?\nIf you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\\_intercept` to allow third-party or external scripts to run.\n### Why is my crawl going slow?\nIf you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. 
The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds.\n### Do you offer a Free Trial?\nYes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1).\n## Comprehensive Data Curation for Everyone\nTrusted by leading tech businesses worldwide to deliver accurate and insightful data solutions.\nOuter Labs\n[Zapier LogoZapier](https://zapier.com/apps/spider/integrations)\nElementus Logo\nSuper AI Logo\nLayerX Logo\nSwiss Re\nWrite Sonic Logo\nAlioth Logo\n### Next generation data for AI, scale to millions\n[Start now](https://spider.cloud/credits/new)\n### Company\n* [About](https://spider.cloud/about)\n* [Privacy](https://spider.cloud/privacy)\n* [Terms](https://spider.cloud/eula)\n* [FAQ](https://spider.cloud/faq)\n### Resources\n* [API](https://spider.cloud/docs/api)\n* [Docs](https://spider.cloud/docs/overview)\n* [Guides](https://spider.cloud/guides)\n* [Spider.rs Docs](https://docs.rs/spider/latest/spider/)\n### Services\n* [Pricing](https://spider.cloud/credits/new)\n* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping)\n[All systems normal.](https://spidercloud.statuspage.io/)\n[\nGithub LogoGitHub\n](https://github.com/spider-rs/spider)[\nDiscord LogoDiscord\n](https://discord.spider.cloud)[\nTwitter LogoTwitter\n](https://twitter.com/spider_rust)"# - ); -} - #[test] #[ignore] fn test_cheatsheet() { @@ -223,3 +288,227 @@ fn test_html_from_text_rewrite() { assert!(!result.is_empty()); } + +const SPIDER_RESULT_MD: &str = r#"To help you get started with Spider, we’ll give you $200 in credits when you spend $100.[Terms apply](https://spider.cloud/promotion-spider-credits) +# The Web Crawler for AI Agents and LLMs +Spider offers the finest data collecting solution. Engineered for speed and scalability, it +allows you to elevate your AI projects. +[Get Started](https://spider.cloud/credits/new)View Preview +* Basic +* Streaming +Example request +Python +JSONL +Copy +``` +`import requests, os, json +headers = { +''Authorization '': f ''Bearer {os.getenv(""SPIDER\_API\_KEY "")}'', +''Content-Type '': ''application/jsonl '', +} +json\_data = {""limit "":50,""metadata "":True,""url "":""https://spider.cloud ""} +response = requests.post(''https://api.spider.cloud/crawl '', headers=headers, json=json\_data, stream=True) +with response as r: +r.raise\_for\_status() +for chunk in r.iter\_lines( +chunk\_size=None, decode\_unicode=True +): +data = json.loads(chunk) +print(data)` +``` +[Free Trial](https://spider.cloud/credits/new?free-trial=1) +Example Response +## Built with the need for**Speed** +Experience the power of**Spider**, built fully in**Rust**for +next-generation scalability. +### 2.4secs +To crawl over 20,000 pages +### 500-1000x +Faster than alternatives +### 500x +Cheaper than traditional scraping services +Benchmarks displaying performance between Spider API request modes. +Spider API Request Modes ·Benchmarked tailwindcss.com ·06/16/2024 +[See framework benchmarks](https://github.com/spider-rs/spider/blob/main/benches/BENCHMARKS.md) +### Seamless Integrations +Seamlessly integrate Spider with a wide range of platforms, ensuring data curation +perfectly aligned with your requirements. Compatible with all major AI tools. 
+[LangChain integration](https://python.langchain.com/docs/integrations/document_loaders/spider)[LlamaIndex integrationLlama Index Logo](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-spider-reader)[CrewAI integrationCrewAI Logo](https://docs.crewai.com/tools/SpiderTool/)[FlowWiseAI integrationFlowiseAI LogoFlowiseAI](https://docs.flowiseai.com/integrations/langchain/document-loaders/spider-web-scraper-crawler)[Composio integrationComposio Logo](https://docs.composio.dev/introduction/foundations/components/list_local_tools#spider-crawler)[PhiData integrationPhiData Logo](https://docs.phidata.com/tools/spider) +### Concurrent Streaming +Save time and money without having to worry about bandwidth concerns by effectively +streaming all the results concurrently. The latency cost that is saved becomes drastic as +you crawl more websites. +### Warp Speed +Powered by the cutting-edge[Spider](https://github.com/spider-rs/spider)open-source project, our robust Rust engine scales effortlessly to handle extreme +workloads. We ensure continuous maintenance and improvement for top-tier performance. +## Kickstart Your Data Collecting Projects Today +Jumpstart web crawling with full elastic scaling concurrency, optimal formats, and AI scraping. +### Performance Tuned +Spider is written in Rust and runs in full concurrency to achieve crawling thousands of +pages in secs. +### Multiple response formats +Get clean and formatted markdown, HTML, or text content for fine-tuning or training AI +models. +### Caching +Further boost speed by caching repeated web page crawls to minimize expenses while +building. +### Smart Mode +Spider dynamically switches to Headless Chrome when it needs to quick. +Beta +### Scrape with AI +Do custom browser scripting and data extraction using the latest AI models with no cost +step caching. +### The crawler for LLMs +Don't let crawling and scraping be the highest latency in your LLM & AI agent stack. +### Scrape with no headaches +* Auto Proxy rotations +* Agent headers +* Anti-bot detections +* Headless chrome +* Markdown responses +### The Fastest Web Crawler +* Powered by[spider-rs](https://github.com/spider-rs/spider) +* 100,000 pages/seconds +* Unlimited concurrency +* Simple API +* 50,000 RPM +### Do more with AI +* Browser scripting +* Advanced extraction +* Data pipelines +* Ideal for LLMs and AI Agents +* Accurate labeling +## Achieve more with these new API features +Our API is set to stream so you can act in realtime. +![A user interface with a search bar containing the text "Latest sports news," a green "Submit" button, and two icon buttons to display searching and extracting with the service.](/img/search_feature.webp) +### Search +Get access to search engine results from anywhere and easily crawl and transform pages to +LLM-ready markdown. +[Explore SearchRight Arrow](https://spider.cloud/docs/api#search) +![A user interface segment showing three icons representing different stages of data transformation.](/img/transform_feature_example.webp) +### Transform +Convert raw HTML into markdown easily by using this API. Transform thousands of html pages +in seconds. +[Explore TransformRight Arrow](https://spider.cloud/docs/api#transform) +## Join the community +Backed by a network of early advocates, contributors, and supporters. 
+[GitHub discussions +Chat Icon +](https://github.com/orgs/spider-rs/discussions)[Discord +Chat Icon +](https://discord.spider.cloud) +[ +![iammerrick's avatar](/img/external/iammerrick_twitter.webp) +@iammerrick +Rust based crawler Spider is next level for crawling &scraping sites. So fast. +Their cloud offering is also so easy to use. Good stuff. https://github.com/spider-rs/spider +](https://twitter.com/iammerrick/status/1787873425446572462) +[ +![WilliamEspegren's avatar](/img/external/william_twitter.webp) +@WilliamEspegren +Web crawler built in rust, currently the nr1 performance in the world with crazy resource management Aaaaaaand they have a cloud offer, that’s wayyyy cheaper than any competitor +Name a reason for me to use anything else? +github.com/spider-rs/spid… +](https://twitter.com/WilliamEspegren/status/1789419820821184764) +[ +![gasa's avatar](/img/external/gaza_twitter.webp) +@gasa +@gasathenaper +is the best crawling tool i have used. I had a complicated project where i needed to paste url and get the website whole website data. Spider does it in an instant +](https://x.com/gasathenaper/status/1810612492596383948) +[ +![Ashpreet Bedi's avatar](/img/external/ashpreet_bedi.webp) +@Ashpreet Bedi +@ashpreetbedi +is THE best crawler out there, give it a try +](https://x.com/ashpreetbedi/status/1815512219003572315?s=46&t=37F5QP_8oKqOsNpHSo6VVw) +[ +![Troyusrex's avatar](/img/external/troy_twitter.webp) +@Troyusrex +I found a new tool, Spider-rs, which scrapes significantly faster and handles more scenarios than the basic scraper I built did. Our use of Spider-rs and AWS infrastructure reduced the scraping time from four months to under a week. +](https://medium.com/@troyusrex/inside-my-virtual-college-advisor-a-deep-dive-into-rag-ai-and-agent-technology-84731b2928f7#1326) +[ +![Dify.AI's avatar](/img/external/difyai.webp) +@Dify.AI +🕷️Spider @spider\_rust +can be used as a built-in tool in #Dify Workflow or as an LLM-callable tool in Agent. It allows fast and affordable web scraping and crawling when your AI applications need real-time web data for context. +](https://x.com/dify_ai/status/1818226971056243089) +## FAQ +Frequently asked questions about Spider. +### What is Spider? +Spider is a leading web crawling tool designed for speed and cost-effectiveness, supporting various data formats including LLM-ready markdown. +### Why is my website not crawling? +Your crawl may fail if it requires JavaScript rendering. Try setting your request to 'chrome 'to solve this issue. +### Can you crawl all pages? +Yes, Spider accurately crawls all necessary content without needing a sitemap. +### What formats can Spider convert web data into? +Spider outputs HTML, raw, text, and various markdown formats. It supports`JSON`,`JSONL`,`CSV`, and`XML`for API responses. +### Is Spider suitable for large scraping projects? +Absolutely, Spider is ideal for large-scale data collection and offers a cost-effective dashboard for data management. +### How can I try Spider? +Purchase credits for our cloud system or test the Open Source Spider engine to explore its capabilities. +### Does it respect robots.txt? +Yes, compliance with robots.txt is default, but you can disable this if necessary. +### Unable to get dynamic content? +If you are having trouble getting dynamic pages, try setting the request parameter to ""chrome ""or ""smart.""You may also need to set `disable\_intercept` to allow third-party or external scripts to run. +### Why is my crawl going slow? 
+If you are experiencing a slow crawl, it is most likely due to the robots.txt file for the website. The robots.txt file may have a crawl delay set, and we respect the delay up to 60 seconds. +### Do you offer a Free Trial? +Yes, you can try out the service before being charged for free at[checkout](https://spider.cloud/credits/new?free-trial=1). +## Comprehensive Data Curation for Everyone +Trusted by leading tech businesses worldwide to deliver accurate and insightful data solutions. +Outer Labs +[Zapier LogoZapier](https://zapier.com/apps/spider/integrations) +Elementus Logo +Super AI Logo +LayerX Logo +Swiss Re +Write Sonic Logo +Alioth Logo +### Next generation data for AI, scale to millions +[Start now](https://spider.cloud/credits/new) +### Company +* [About](https://spider.cloud/about) +* [Privacy](https://spider.cloud/privacy) +* [Terms](https://spider.cloud/eula) +* [FAQ](https://spider.cloud/faq) +### Resources +* [API](https://spider.cloud/docs/api) +* [Docs](https://spider.cloud/docs/overview) +* [Guides](https://spider.cloud/guides) +* [Spider.rs Docs](https://docs.rs/spider/latest/spider/) +### Services +* [Pricing](https://spider.cloud/credits/new) +* [Web Crawling and Scraping](https://spider.cloud/web-crawling-and-scraping) +[All systems normal.](https://spidercloud.statuspage.io/) +[ +Github LogoGitHub +](https://github.com/spider-rs/spider)[ +Discord LogoDiscord +](https://discord.spider.cloud)[ +Twitter LogoTwitter +](https://twitter.com/spider_rust)"#; + +#[test] +#[ignore] +fn test_real_spider() { + let mut html = String::new(); + let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap(); + html_file + .read_to_string(&mut html) + .expect("File must be readable"); + let result = rewrite_html(&html, false); + assert!(result == SPIDER_RESULT_MD); +} + +#[tokio::test] +#[ignore] +async fn test_real_spider_async() { + let mut html = String::new(); + let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap(); + html_file + .read_to_string(&mut html) + .expect("File must be readable"); + let result = rewrite_html_streaming(&html, false).await; + assert!(result == SPIDER_RESULT_MD); +}