use super::handle::handle_tag;
use super::quotes::rewrite_blockquote_text;
use crate::clean_markdown_bytes;
use crate::rewriter::{handle::handle_tag_send, quotes::rewrite_blockquote_text_send};
use lol_html::{doc_comments, doctype, element, html_content::EndTag, text, RewriteStrSettings};
use std::cell::Cell;
use std::rc::Rc;
use std::sync::{
atomic::{AtomicU8, AtomicUsize, Ordering},
Arc,
};
use url::Url;
/// Comma-separated CSS selector list naming cookie/consent banner containers
/// (for example `#onetrust-banner-sdk` and `#didomi-host`).
///
/// Elements matching this selector are removed by the rewriter settings built
/// in this file. Only compiled when the `ignore_cookies` feature is enabled.
///
/// A plain `const` is sufficient here: the value is a string literal, so no
/// lazy initialisation is required.
#[cfg(feature = "ignore_cookies")]
const COOKIE_BANNER_SELECTOR: &str =
    "body > #onetrust-banner-sdk,#didomi-host,#qc-cmp2-container,#cookie-banner,#__rptl-cookiebanner";
/// Boxed, one-shot end-tag callback that is `Send`.
///
/// This is the type given to the closures pushed onto `end_tag_handlers()`
/// inside `get_rewriter_settings_send`. The callback receives the end tag
/// mutably and returns `Ok(())` or a boxed `Send + Sync` error.
type EndHandler = Box<
dyn for<'b> FnOnce(
&mut EndTag<'b>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>>
+ Send
+ 'static,
>;
/// Boxed, one-shot end-tag callback without a `Send` bound.
///
/// This is the type given to the closures pushed onto `end_tag_handlers()`
/// inside `get_rewriter_settings`, whose closures capture `Rc` state. Apart
/// from the missing `Send` bound the signature is the same as `EndHandler`.
type LocalEndHandler = Box<
dyn for<'b> FnOnce(
&mut EndTag<'b>,
) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>>
+ 'static,
>;
/// Returns `true` when every byte of `s` is ASCII whitespace: space, line
/// feed, carriage return, horizontal tab or form feed.
///
/// The empty string yields `true`. Vertical tab and non-ASCII whitespace
/// (for example a no-break space) do not count as whitespace here.
#[inline]
fn is_ascii_ws_only(s: &str) -> bool {
    // `u8::is_ascii_whitespace` accepts exactly SPACE, TAB, LF, FF and CR.
    s.bytes().all(|byte| byte.is_ascii_whitespace())
}
/// Guesses how many bytes of output buffer to reserve for `html`.
///
/// Returns `0` for empty input; otherwise half of the input length in bytes,
/// but never less than 50.
fn estimate_markdown(html: &str) -> usize {
    match html.len() {
        0 => 0,
        byte_len => (byte_len / 2).max(50),
    }
}
// Bit masks for the shared `AtomicU8` state used by `get_rewriter_settings_send`.
/// Set while a `table` element is open (cleared by that table's end-tag handler).
const F_IN_TABLE: u8 = 1 << 0;
/// Mirrors the `list_item_start` value reported by `handle_tag_send`; cleared
/// by the text handler once it sees a text chunk that is not whitespace-only.
const F_LI_START: u8 = 1 << 1;
/// Turns on every bit of `mask` in `flags`, leaving the other bits untouched.
///
/// Uses a relaxed atomic OR; the previous value is not needed and is dropped.
#[inline]
fn flag_set(flags: &AtomicU8, mask: u8) {
    flags.fetch_or(mask, Ordering::Relaxed);
}
/// Turns off every bit of `mask` in `flags`, leaving the other bits untouched.
///
/// Uses a relaxed atomic AND with the complement of `mask`; the previous value
/// is not needed and is dropped.
#[inline]
fn flag_clear(flags: &AtomicU8, mask: u8) {
    let keep = !mask;
    flags.fetch_and(keep, Ordering::Relaxed);
}
pub fn get_rewriter_settings(
commonmark: bool,
custom: &Option<std::collections::HashSet<String>>,
url: Option<Url>,
) -> RewriteStrSettings<'static, 'static> {
let mut list_type: Option<&'static str> = None;
let mut order_counter = 0usize;
let quote_depth = Rc::new(AtomicUsize::new(0));
let quote_depth1 = quote_depth.clone();
let repaired_head = Rc::new(std::sync::OnceLock::new());
let list_item_start_flag = Rc::new(Cell::new(false));
let in_table_flag = Rc::new(Cell::new(false));
let mut in_table = false;
let mut table_row_start = false;
let mut list_item_start = false;
let mut element_content_handlers = Vec::with_capacity(
4 + custom
.as_ref()
.map_or(0, |c| if c.is_empty() { 0 } else { 1 })
+ {
#[cfg(feature = "ignore_cookies")]
{
1
}
#[cfg(not(feature = "ignore_cookies"))]
{
0
}
},
);
#[cfg(feature = "ignore_cookies")]
{
element_content_handlers.push(lol_html::element!(COOKIE_BANNER_SELECTOR, |el| {
el.remove();
Ok(())
}));
}
element_content_handlers.push(text!("blockquote, q, cite", move |el| {
let _ = rewrite_blockquote_text(el, "e_depth1);
Ok(())
}));
let list_item_start_flag_text = list_item_start_flag.clone();
let in_table_flag_text = in_table_flag.clone();
element_content_handlers.push(text!(
"*:not(script):not(head):not(style):not(svg)",
move |el| {
let s = el.as_str();
if in_table_flag_text.get() && is_ascii_ws_only(s) {
*el.as_mut_str() = String::new();
return Ok(());
}
if list_item_start_flag_text.get() {
if is_ascii_ws_only(s) {
*el.as_mut_str() = String::new();
return Ok(());
}
list_item_start_flag_text.set(false);
}
if let Some(escaped) = crate::replace_markdown_chars_opt(s) {
*el.as_mut_str() = escaped;
}
Ok(())
}
));
element_content_handlers.push(element!(
"head, nav, footer, script, noscript, style",
move |el| {
let repaired_head_element: bool = repaired_head.get().is_some();
let head_element = el.tag_name() == "head";
if head_element && !repaired_head_element {
if let Some(hvec) = el.end_tag_handlers() {
let repaired_head = repaired_head.clone();
let h1: LocalEndHandler =
Box::new(move |end: &mut lol_html::html_content::EndTag<'_>| {
let repaired_element = repaired_head.get().is_some();
if end.name() == "html" && !repaired_element {
let _ = repaired_head.set(true);
end.after("</head>", lol_html::html_content::ContentType::Html);
} else {
end.remove();
}
Ok(())
});
hvec.push(h1);
}
} else {
el.remove();
}
Ok(())
}
));
let list_item_start_flag_el = list_item_start_flag.clone();
let in_table_flag_el = in_table_flag.clone();
element_content_handlers.push(element!("*", move |el| {
if el.tag_name().as_str() == "table" {
in_table_flag_el.set(true);
if let Some(hvec) = el.end_tag_handlers() {
let in_table_flag_end = in_table_flag_el.clone();
let h: LocalEndHandler =
Box::new(move |_end: &mut lol_html::html_content::EndTag<'_>| {
in_table_flag_end.set(false);
Ok(())
});
hvec.push(h);
}
}
in_table = in_table_flag_el.get();
list_item_start = list_item_start_flag_el.get();
let _ = handle_tag(
el,
commonmark,
&url,
&mut list_type,
&mut order_counter,
quote_depth.clone(),
&mut in_table,
&mut table_row_start,
&mut list_item_start,
);
list_item_start_flag_el.set(list_item_start);
Ok(())
}));
if let Some(ignore) = custom {
if !ignore.is_empty() {
let ignore_handler = element!(
ignore.iter().cloned().collect::<Vec<String>>().join(","),
|el| {
el.remove();
Ok(())
}
);
element_content_handlers.push(ignore_handler);
}
}
RewriteStrSettings {
document_content_handlers: vec![
doc_comments!(|c| {
c.remove();
Ok(())
}),
doctype!(|c| {
c.remove();
Ok(())
}),
],
element_content_handlers,
..RewriteStrSettings::default()
}
}
pub fn get_rewriter_settings_send(
commonmark: bool,
custom: &Option<std::collections::HashSet<String>>,
url: Option<Url>,
) -> lol_html::send::Settings<'static, 'static> {
let mut list_type: Option<&'static str> = None;
let mut order_counter = 0usize;
let quote_depth = Arc::new(AtomicUsize::new(0));
let quote_depth1 = quote_depth.clone();
let repaired_head = Arc::new(std::sync::OnceLock::new());
let flags = Arc::new(AtomicU8::new(0));
let mut in_table = false;
let mut table_row_start = false;
let mut list_item_start = false;
let mut element_content_handlers = Vec::with_capacity(
4 + custom
.as_ref()
.map_or(0, |c| if c.is_empty() { 0 } else { 1 })
+ {
#[cfg(feature = "ignore_cookies")]
{
1
}
#[cfg(not(feature = "ignore_cookies"))]
{
0
}
},
);
#[cfg(feature = "ignore_cookies")]
{
element_content_handlers.push(lol_html::element!(COOKIE_BANNER_SELECTOR, |el| {
el.remove();
Ok(())
}));
}
element_content_handlers.push(text!("blockquote, q, cite", move |el| {
let _ = rewrite_blockquote_text_send(el, "e_depth);
Ok(())
}));
let flags_text = flags.clone();
element_content_handlers.push(text!(
"*:not(script):not(head):not(style):not(svg)",
move |el| {
let f = flags_text.load(Ordering::Relaxed);
let in_table_now = (f & F_IN_TABLE) != 0;
let li_start_now = (f & F_LI_START) != 0;
let s = el.as_str();
if in_table_now && is_ascii_ws_only(s) {
*el.as_mut_str() = String::new();
return Ok(());
}
if li_start_now {
if is_ascii_ws_only(s) {
*el.as_mut_str() = String::new();
return Ok(());
}
flag_clear(&*flags_text, F_LI_START);
}
if let Some(escaped) = crate::replace_markdown_chars_opt(s) {
*el.as_mut_str() = escaped;
}
Ok(())
}
));
element_content_handlers.push(element!(
"head, nav, footer, script, noscript, style",
move |el| {
let repaired_head_element: bool = repaired_head.get().is_some();
let head_element = el.tag_name() == "head";
if head_element && !repaired_head_element {
if let Some(hvec) = el.end_tag_handlers() {
let repaired_head = repaired_head.clone();
let h1: EndHandler =
Box::new(move |end: &mut lol_html::html_content::EndTag<'_>| {
let repaired_element = repaired_head.get().is_some();
if end.name() == "html" && !repaired_element {
let _ = repaired_head.set(true);
end.after("</head>", lol_html::html_content::ContentType::Html);
} else {
end.remove();
}
Ok(())
});
hvec.push(h1);
}
} else {
el.remove();
}
Ok(())
}
));
let flags_el = flags.clone();
element_content_handlers.push(element!("*", move |el| {
if el.tag_name().as_str() == "table" {
flag_set(&*flags_el, F_IN_TABLE);
if let Some(hvec) = el.end_tag_handlers() {
let flags_end = flags_el.clone();
let h: EndHandler =
Box::new(move |_end: &mut lol_html::html_content::EndTag<'_>| {
flag_clear(&*flags_end, F_IN_TABLE);
Ok(())
});
hvec.push(h);
}
}
let f = flags_el.load(Ordering::Relaxed);
in_table = (f & F_IN_TABLE) != 0;
list_item_start = (f & F_LI_START) != 0;
let _ = handle_tag_send(
el,
commonmark,
&url,
&mut list_type,
&mut order_counter,
quote_depth1.clone(),
&mut in_table,
&mut table_row_start,
&mut list_item_start,
);
if list_item_start {
flag_set(&*flags_el, F_LI_START);
} else {
flag_clear(&*flags_el, F_LI_START);
}
Ok(())
}));
if let Some(ignore) = custom {
if !ignore.is_empty() {
let ignore_handler = element!(
ignore.iter().cloned().collect::<Vec<String>>().join(","),
|el| {
el.remove();
Ok(())
}
);
element_content_handlers.push(ignore_handler);
}
}
lol_html::send::Settings {
document_content_handlers: vec![
doc_comments!(|c| {
c.remove();
Ok(())
}),
doctype!(|c| {
c.remove();
Ok(())
}),
],
element_content_handlers,
..lol_html::send::Settings::new_send()
}
}
/// Runs `html` through an `lol_html::HtmlRewriter` configured with `settings`
/// and returns the rewritten bytes.
///
/// The whole input is written in a single call. The output buffer is
/// pre-sized with `estimate_markdown`.
///
/// # Errors
/// Returns the `RewritingError` reported by either `write` or `end`.
pub(crate) fn rewrite_str<'h, 's, H: lol_html::HandlerTypes>(
    html: &str,
    settings: impl Into<lol_html::Settings<'h, 's, H>>,
) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
    let mut rewritten: Vec<u8> = Vec::with_capacity(estimate_markdown(html));
    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |piece: &[u8]| {
        rewritten.extend_from_slice(piece)
    });
    let input = html.as_bytes();
    rewriter.write(input)?;
    // `end` consumes the rewriter, which releases the borrow on `rewritten`.
    rewriter.end()?;
    Ok(rewritten)
}
/// Converts `html` to markdown using the non-`Send` rewriter settings.
///
/// The settings come from `get_rewriter_settings(commonmark, custom, url)`;
/// the rewritten bytes are then passed through `clean_markdown_bytes`.
///
/// # Errors
/// Returns the rewriting error from `rewrite_str`, boxed.
pub(crate) fn convert_html_to_markdown(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> Result<String, Box<dyn std::error::Error>> {
    let rewritten = rewrite_str(html, get_rewriter_settings(commonmark, custom, url.clone()))?;
    Ok(clean_markdown_bytes(&rewritten))
}
/// Converts `html` to markdown with the `Send` rewriter, feeding the input to
/// the rewriter in pieces of at most `chunk_size` bytes.
///
/// This is best effort: if a `write` fails, the remaining input is skipped and
/// `end` is not called; an error from `end` is ignored as well. Whatever the
/// rewriter emitted up to that point is cleaned with `clean_markdown_bytes`
/// and returned, so the function currently always returns `Ok`.
///
/// # Parameters
/// * `html` – the document to convert.
/// * `custom` – optional extra selectors to drop, see `get_rewriter_settings_send`.
/// * `commonmark` – forwarded to `get_rewriter_settings_send`.
/// * `url` – cloned and forwarded to `get_rewriter_settings_send`.
/// * `chunk_size` – maximum number of bytes per `write`. A value of `0` is
///   treated as `1`, because `slice::chunks` panics on a zero chunk size.
#[cfg(feature = "stream")]
pub async fn convert_html_to_markdown_send_with_size(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
    chunk_size: usize,
) -> Result<String, Box<dyn std::error::Error>> {
    let settings = get_rewriter_settings_send(commonmark, custom, url.clone());
    let mut rewrited_bytes: Vec<u8> = Vec::with_capacity(estimate_markdown(html));
    let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), |c: &[u8]| {
        rewrited_bytes.extend_from_slice(c);
    });

    // Guard against `chunks(0)`, which panics even for an empty slice.
    let chunk_len = chunk_size.max(1);

    // `all` stops at the first failed write, matching a `break` on error.
    let wrote_everything = html
        .as_bytes()
        .chunks(chunk_len)
        .all(|chunk| rewriter.write(chunk).is_ok());

    if wrote_everything {
        // Deliberately ignored: partial output is still returned.
        let _ = rewriter.end();
    }
    Ok(clean_markdown_bytes(&rewrited_bytes))
}
/// Converts `html` to markdown with the `Send` rewriter, using a chunk size
/// of 8192 bytes.
///
/// This simply forwards to `convert_html_to_markdown_send_with_size` and
/// returns its result.
#[cfg(feature = "stream")]
pub async fn convert_html_to_markdown_send(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> Result<String, Box<dyn std::error::Error>> {
    // Number of input bytes handed to the rewriter per write.
    const DEFAULT_CHUNK_SIZE: usize = 8192;

    let conversion =
        convert_html_to_markdown_send_with_size(html, custom, commonmark, url, DEFAULT_CHUNK_SIZE);
    conversion.await
}
/// Error returned by `convert_html_stream_to_markdown`.
///
/// `E` is the error type yielded by the input stream.
#[cfg(feature = "stream")]
#[derive(Debug)]
pub enum StreamConvertError<E> {
/// The input stream yielded an error instead of a chunk.
Stream(E),
/// The `lol_html` rewriter failed while writing a chunk or finishing the document.
Rewrite(lol_html::errors::RewritingError),
}
/// Human-readable rendering: `stream error: …` or `rewrite error: …`,
/// followed by the wrapped error's own `Display` output.
#[cfg(feature = "stream")]
impl<E: std::fmt::Display> std::fmt::Display for StreamConvertError<E> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            StreamConvertError::Stream(e) => f.write_fmt(format_args!("stream error: {e}")),
            StreamConvertError::Rewrite(e) => f.write_fmt(format_args!("rewrite error: {e}")),
        }
    }
}
/// Exposes the wrapped error as the `source` of a `StreamConvertError`.
#[cfg(feature = "stream")]
impl<E: std::error::Error + 'static> std::error::Error for StreamConvertError<E> {
    /// Always returns `Some`: both variants wrap an underlying error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        let cause: &(dyn std::error::Error + 'static) = match self {
            Self::Stream(inner) => inner,
            Self::Rewrite(inner) => inner,
        };
        Some(cause)
    }
}
/// Converts a stream of HTML byte chunks to markdown.
///
/// Each chunk yielded by `stream` is written to a `Send` rewriter configured
/// by `get_rewriter_settings_send(commonmark, custom, url)`. After the stream
/// ends the rewriter is finished and its output is passed through
/// `clean_markdown_bytes`.
///
/// # Errors
/// * `StreamConvertError::Stream` – the stream yielded an `Err`.
/// * `StreamConvertError::Rewrite` – the rewriter's `write` or `end` failed.
///
/// Processing stops at the first error; no partial output is returned.
#[cfg(feature = "stream")]
pub async fn convert_html_stream_to_markdown<S, B, E>(
    stream: S,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> Result<String, StreamConvertError<E>>
where
    S: futures_util::Stream<Item = Result<B, E>> + Unpin,
    B: AsRef<[u8]>,
{
    use futures_util::StreamExt;

    let settings = get_rewriter_settings_send(commonmark, custom, url.clone());
    let mut markdown_bytes: Vec<u8> = Vec::with_capacity(4096);
    let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), |piece: &[u8]| {
        markdown_bytes.extend_from_slice(piece)
    });

    futures_util::pin_mut!(stream);
    while let Some(next) = stream.next().await {
        let bytes = next.map_err(StreamConvertError::Stream)?;
        rewriter
            .write(bytes.as_ref())
            .map_err(StreamConvertError::Rewrite)?;
    }
    rewriter.end().map_err(StreamConvertError::Rewrite)?;

    Ok(clean_markdown_bytes(&markdown_bytes))
}