use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
pub mod extended;
#[cfg(feature = "scraper")]
pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
#[cfg(feature = "rewriter")]
pub mod rewriter;
#[cfg(feature = "scraper")]
pub mod scraper;
#[cfg(feature = "scraper")]
pub use scraper::{
ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
parse_html_extended,
};
#[cfg(feature = "scraper")]
lazy_static::lazy_static! {
/// Regex matching any single character from the set `<`, `>`, `*`, `\`, `_`, `~`.
///
/// This is the same character set that `needs_escape` tests byte-wise. The
/// pattern is a fixed literal, so the `expect` can only fire if the literal
/// itself is edited into something invalid.
pub(crate) static ref MARKDOWN_MIDDLE_KEYCHARS: regex::Regex =
regex::Regex::new(r"[<>*\\_~]").expect("valid regex pattern");
}
/// Converts an HTML string to markdown using the rewriter backend.
///
/// Forwards to `rewriter::writer::convert_html_to_markdown` with no custom set
/// and no URL (both passed as `&None`); `commonmark` is forwarded unchanged.
///
/// If the conversion does not produce a value, the failure is discarded and an
/// empty `String` is returned instead.
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    let converted = rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None);
    converted.unwrap_or_default()
}
/// Async variant of HTML-to-markdown conversion using the rewriter backend.
///
/// Forwards to `rewriter::writer::convert_html_to_markdown_send` with no custom
/// set and no URL (both passed as `&None`); `commonmark` is forwarded unchanged.
///
/// If the conversion does not produce a value, the failure is discarded and an
/// empty `String` is returned instead.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    // Await inside the `let` so the `&None` temporaries outlive the future.
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None).await;
    converted.unwrap_or_default()
}
/// Converts an HTML string to markdown, forwarding a caller-supplied custom set
/// and URL to the rewriter backend.
///
/// `custom`, `commonmark` and `url` are passed through unchanged to
/// `rewriter::writer::convert_html_to_markdown`. If the conversion does not
/// produce a value, the failure is discarded and an empty `String` is returned.
///
/// NOTE(review): this synchronous wrapper is gated on the `stream` feature as
/// well, although it calls the same writer function as `rewrite_html`, which
/// only requires `rewriter` — confirm the extra gate is intentional.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; no extra borrow is needed.
    let converted = rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url);
    converted.unwrap_or_default()
}
/// Async HTML-to-markdown conversion with a caller-supplied custom set, URL and
/// chunk size.
///
/// All arguments are passed through unchanged to
/// `rewriter::writer::convert_html_to_markdown_send_with_size`. If the
/// conversion does not produce a value, the failure is discarded and an empty
/// `String` is returned.
///
/// NOTE(review): `chunk_size` presumably controls how much input is fed to the
/// rewriter per step — confirm against the writer module.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference; no extra borrow is needed.
    let converted = rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await;
    converted.unwrap_or_default()
}
/// Async HTML-to-markdown conversion with a caller-supplied custom set and URL.
///
/// All arguments are passed through unchanged to
/// `rewriter::writer::convert_html_to_markdown_send`. If the conversion does
/// not produce a value, the failure is discarded and an empty `String` is
/// returned.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; no extra borrow is needed.
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url).await;
    converted.unwrap_or_default()
}
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub use rewriter::writer::StreamConvertError;
/// Converts a stream of HTML byte chunks to markdown using the rewriter backend.
///
/// `stream` yields `Result<B, E>` items where each `B` can be viewed as a byte
/// slice. The call is forwarded to
/// `rewriter::writer::convert_html_stream_to_markdown` with no custom set and
/// no URL (both passed as `&None`); `commonmark` is forwarded unchanged.
///
/// # Errors
///
/// Returns whatever `StreamConvertError<E>` the writer reports; unlike the
/// `String`-returning wrappers in this module, failures are not swallowed.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_stream<S, B, E>(
stream: S,
commonmark: bool,
) -> Result<String, StreamConvertError<E>>
where
S: futures_util::Stream<Item = Result<B, E>> + Unpin,
B: AsRef<[u8]>,
{
rewriter::writer::convert_html_stream_to_markdown(stream, &None, commonmark, &None).await
}
/// Converts a stream of HTML byte chunks to markdown, forwarding a
/// caller-supplied custom set and URL to the rewriter backend.
///
/// `stream` yields `Result<B, E>` items where each `B` can be viewed as a byte
/// slice. `custom`, `commonmark` and `url` are passed through unchanged to
/// `rewriter::writer::convert_html_stream_to_markdown`.
///
/// # Errors
///
/// Returns whatever `StreamConvertError<E>` the writer reports; failures are
/// not swallowed.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_stream_custom_with_url<S, B, E>(
stream: S,
custom: &Option<std::collections::HashSet<String>>,
commonmark: bool,
url: &Option<url::Url>,
) -> Result<String, StreamConvertError<E>>
where
S: futures_util::Stream<Item = Result<B, E>> + Unpin,
B: AsRef<[u8]>,
{
rewriter::writer::convert_html_stream_to_markdown(stream, custom, commonmark, url).await
}
/// Cleans markdown text by delegating to the `sift_preserve_newlines` method
/// (presumably provided by the imported `WhitespaceSifter` trait).
///
/// NOTE(review): judging by the method name, this presumably collapses
/// redundant whitespace while keeping line breaks — confirm against
/// `extended::sifter`.
pub fn clean_markdown(input: &str) -> String {
input.sift_preserve_newlines()
}
/// Byte-input counterpart of `clean_markdown`: delegates to the
/// `sift_bytes_preserve_newlines` method (presumably provided by the imported
/// `WhitespaceSifterBytes` trait) and returns the result as a `String`.
///
/// NOTE(review): how non-UTF-8 input is handled is decided by the sifter
/// implementation and is not visible here — confirm against `extended::sifter`.
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
input.sift_bytes_preserve_newlines()
}
/// Returns `true` when `b` is one of the ASCII bytes that gets a backslash
/// prefix in markdown output: `<`, `>`, `*`, `\`, `_` or `~`.
#[inline]
const fn needs_escape(b: u8) -> bool {
    let is_angle = b == b'<' || b == b'>';
    let is_emphasis = b == b'*' || b == b'_' || b == b'~';
    is_angle || is_emphasis || b == b'\\'
}
/// Returns `true` when `b` needs any special handling while escaping: either
/// it is `&` (a possible HTML entity start) or it is one of the bytes accepted
/// by `needs_escape`.
#[inline]
const fn is_special_byte(b: u8) -> bool {
    b == b'&' || needs_escape(b)
}
#[inline]
pub fn contains_markdown_chars(input: &str) -> bool {
input.as_bytes().iter().any(|&b| is_special_byte(b))
}
/// Tries to decode an HTML entity at the start of `bytes`.
///
/// `bytes` must begin with `&` (checked in debug builds only). The terminating
/// `;` is searched for within the first 12 bytes; if none is found there, the
/// input is not treated as an entity.
///
/// Returns the replacement text together with the number of input bytes the
/// entity occupies (`&`, the name, and `;`), or `None` when the entity is
/// unterminated or not recognised. `&lt;` / `&gt;` decode to their
/// backslash-escaped forms, and `&nbsp;` decodes to the empty string.
#[inline]
fn decode_html_entity(bytes: &[u8]) -> Option<(&'static str, usize)> {
    debug_assert_eq!(bytes[0], b'&');

    let window_end = bytes.len().min(12);
    // Offset of `;` relative to the byte after `&`, i.e. the entity name length.
    let name_len = bytes[1..window_end].iter().position(|&b| b == b';')?;
    let name = &bytes[1..1 + name_len];
    // Total span: `&` + name + `;`.
    let total = name_len + 2;

    let replacement = match name {
        b"amp" => "&",
        b"lt" => "\\<",
        b"gt" => "\\>",
        b"quot" => "\"",
        b"apos" => "'",
        b"nbsp" => "",
        _ if name.starts_with(b"#") => return decode_numeric_entity(name, total),
        _ => return None,
    };
    Some((replacement, total))
}
/// Decodes the body of a numeric character reference.
///
/// `entity` is the text between `&` and `;`, starting with `#` (for example
/// `#38` or `#x26`); a second byte of `x` or `X` selects hexadecimal, otherwise
/// the digits are read as decimal. `consumed` is returned unchanged alongside
/// the replacement.
///
/// Only a fixed table of code points is supported; any other value, an empty
/// digit run, an invalid digit, or a `u32` overflow yields `None`. U+003C and
/// U+003E decode to their backslash-escaped forms and U+00A0 decodes to the
/// empty string.
#[inline]
fn decode_numeric_entity(entity: &[u8], consumed: usize) -> Option<(&'static str, usize)> {
    let is_hex = matches!(entity.get(1).copied(), Some(b'x' | b'X'));
    let radix: u32 = if is_hex { 16 } else { 10 };
    let digits = if is_hex { &entity[2..] } else { &entity[1..] };
    if digits.is_empty() {
        return None;
    }

    let code_point = digits.iter().try_fold(0u32, |acc, &byte| {
        // `to_digit` rejects anything that is not a digit in the given radix.
        let digit = char::from(byte).to_digit(radix)?;
        acc.checked_mul(radix)?.checked_add(digit)
    })?;

    let replacement = match code_point {
        0x26 => "&",
        0x3C => "\\<",
        0x3E => "\\>",
        0x22 => "\"",
        0x27 => "'",
        0xA0 => "",
        0x2014 => "\u{2014}",
        0x2013 => "\u{2013}",
        0x2018 => "\u{2018}",
        0x2019 => "\u{2019}",
        0x201C => "\u{201c}",
        0x201D => "\u{201d}",
        _ => return None,
    };
    Some((replacement, consumed))
}
/// Escapes markdown-significant characters and decodes recognised HTML
/// entities, allocating only when a change is actually needed.
///
/// Returns `None` when `input` contains none of `<`, `>`, `*`, `\`, `_`, `~`
/// or `&`, so the caller can keep using the original string. Otherwise returns
/// a new string in which:
///
/// * each of `<`, `>`, `*`, `\`, `_`, `~` is prefixed with a backslash;
/// * each `&` that starts an entity known to `decode_html_entity` is replaced
///   by that entity's decoded text;
/// * any other `&` is kept as-is.
#[inline]
pub fn replace_markdown_chars_opt(input: &str) -> Option<String> {
    let bytes = input.as_bytes();
    // Nothing special anywhere: signal "unchanged" without allocating.
    let start = bytes.iter().position(|&b| is_special_byte(b))?;

    let mut escaped = String::with_capacity(input.len() + input.len() / 8);
    escaped.push_str(&input[..start]);

    let mut cursor = start;
    while let Some(&byte) = bytes.get(cursor) {
        if byte == b'&' {
            match decode_html_entity(&bytes[cursor..]) {
                Some((replacement, consumed)) => {
                    escaped.push_str(replacement);
                    cursor += consumed;
                }
                None => {
                    escaped.push('&');
                    cursor += 1;
                }
            }
        } else if needs_escape(byte) {
            escaped.push('\\');
            escaped.push(char::from(byte));
            cursor += 1;
        } else {
            // Copy the whole run of ordinary bytes in one go. Every special
            // byte is ASCII, so the run always ends on a char boundary.
            let run_end = bytes[cursor + 1..]
                .iter()
                .position(|&b| is_special_byte(b))
                .map_or(bytes.len(), |offset| cursor + 1 + offset);
            escaped.push_str(&input[cursor..run_end]);
            cursor = run_end;
        }
    }

    Some(escaped)
}
/// Escapes markdown-significant characters and decodes recognised HTML
/// entities in `input`, always returning an owned `String`.
///
/// When `replace_markdown_chars_opt` reports that nothing needs changing, the
/// input is copied verbatim.
#[inline]
pub fn replace_markdown_chars(input: &str) -> String {
    match replace_markdown_chars_opt(input) {
        Some(escaped) => escaped,
        None => input.to_string(),
    }
}