1use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
2
3pub mod extended;
5
6#[cfg(feature = "scraper")]
7pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
8
9#[cfg(feature = "rewriter")]
10pub mod rewriter;
11#[cfg(feature = "scraper")]
12pub mod scraper;
13#[cfg(feature = "scraper")]
14pub use scraper::{
15 ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
16 parse_html_extended,
17};
18
// Lazily-initialised regex, only compiled into the crate when the `scraper`
// feature is enabled.
#[cfg(feature = "scraper")]
lazy_static::lazy_static! {
    // Matches a single character out of `<`, `>`, `*`, `\`, `_`, `~` — the same
    // byte set that `needs_escape` tests for. The pattern is a constant, so the
    // `expect` can only fire if the literal itself is edited into something
    // invalid.
    pub(crate) static ref MARKDOWN_MIDDLE_KEYCHARS: regex::Regex =
        regex::Regex::new(r"[<>*\\_~]").expect("valid regex pattern");
}
25
/// Converts an HTML string to Markdown using the `rewriter` backend.
///
/// Neither a custom set nor a URL is supplied to the converter (both are
/// passed as `None`). When the converter does not yield a value, the default
/// (empty) `String` is returned instead of reporting the failure.
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    let no_custom = None;
    let no_url = None;

    rewriter::writer::convert_html_to_markdown(html, &no_custom, commonmark, &no_url)
        .unwrap_or_default()
}
34
/// Async counterpart of the plain rewriter conversion.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send` with no custom set
/// and no URL (both passed as `None`). When the converter does not yield a
/// value, the default (empty) `String` is returned instead of reporting the
/// failure.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    let no_custom = None;
    let no_url = None;

    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &no_custom, commonmark, &no_url)
            .await;
    converted.unwrap_or_default()
}
45
/// Converts an HTML string to Markdown using the `rewriter` backend, with
/// caller-supplied options.
///
/// * `html` - the HTML source to convert.
/// * `custom` - optional set of strings forwarded unchanged to the converter
///   (presumably tag names needing special treatment; confirm against
///   `rewriter::writer`).
/// * `commonmark` - flag forwarded unchanged to the converter.
/// * `url` - optional URL forwarded unchanged to the converter.
///
/// When the converter does not yield a value, the default (empty) `String` is
/// returned instead of reporting the failure.
// NOTE(review): this function is synchronous and calls the same converter as
// `rewrite_html`, which is gated on `rewriter` alone. The extra `stream`
// requirement looks unnecessary; confirm before relaxing the gate.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
63
/// Async conversion of an HTML string to Markdown with caller-supplied options
/// and an explicit chunk size.
///
/// * `html` - the HTML source to convert.
/// * `custom` - optional set of strings forwarded unchanged to the converter.
/// * `commonmark` - flag forwarded unchanged to the converter.
/// * `url` - optional URL forwarded unchanged to the converter.
/// * `chunk_size` - forwarded unchanged to
///   `convert_html_to_markdown_send_with_size` (presumably the number of bytes
///   handed to the rewriter per step; confirm against `rewriter::writer`).
///
/// When the converter does not yield a value, the default (empty) `String` is
/// returned instead of reporting the failure.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
87
/// Async conversion of an HTML string to Markdown with caller-supplied
/// options.
///
/// * `html` - the HTML source to convert.
/// * `custom` - optional set of strings forwarded unchanged to the converter.
/// * `commonmark` - flag forwarded unchanged to the converter.
/// * `url` - optional URL forwarded unchanged to the converter.
///
/// When the converter does not yield a value, the default (empty) `String` is
/// returned instead of reporting the failure.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
107
108#[cfg(all(feature = "stream", feature = "rewriter"))]
110pub use rewriter::writer::StreamConvertError;
111
/// Converts a stream of HTML byte chunks to a Markdown string.
///
/// `stream` yields `Result<B, E>` items where each `B` can be viewed as a byte
/// slice. No custom set and no URL are supplied to the converter (both are
/// passed as `None`).
///
/// # Errors
///
/// Returns whatever `StreamConvertError<E>` the underlying
/// `convert_html_stream_to_markdown` call reports.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_stream<S, B, E>(
    stream: S,
    commonmark: bool,
) -> Result<String, StreamConvertError<E>>
where
    S: futures_util::Stream<Item = Result<B, E>> + Unpin,
    B: AsRef<[u8]>,
{
    let no_custom = None;
    let no_url = None;

    rewriter::writer::convert_html_stream_to_markdown(stream, &no_custom, commonmark, &no_url)
        .await
}
132
/// Converts a stream of HTML byte chunks to a Markdown string with
/// caller-supplied options.
///
/// `stream` yields `Result<B, E>` items where each `B` can be viewed as a byte
/// slice. `custom`, `commonmark` and `url` are forwarded unchanged to
/// `convert_html_stream_to_markdown`.
///
/// # Errors
///
/// Returns whatever `StreamConvertError<E>` the underlying converter reports.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_stream_custom_with_url<S, B, E>(
    stream: S,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> Result<String, StreamConvertError<E>>
where
    S: futures_util::Stream<Item = Result<B, E>> + Unpin,
    B: AsRef<[u8]>,
{
    let conversion =
        rewriter::writer::convert_html_stream_to_markdown(stream, custom, commonmark, url);
    conversion.await
}
153
/// Returns a cleaned copy of `input` produced by `sift_preserve_newlines`.
///
/// The method is presumably provided by the `WhitespaceSifter` trait imported
/// at the top of this file; see `extended::sifter` for the exact whitespace
/// rules it applies.
pub fn clean_markdown(input: &str) -> String {
    input.sift_preserve_newlines()
}
160
/// Byte-input variant of `clean_markdown`: returns the `String` produced by
/// `sift_bytes_preserve_newlines` on `input`.
///
/// The method is presumably provided by the `WhitespaceSifterBytes` trait
/// imported at the top of this file; see `extended::sifter` for the exact
/// rules it applies.
// NOTE(review): `&[u8]` would be the more general parameter type than
// `&Vec<u8>`; whether that compiles depends on which types implement the
// sifter trait — confirm before changing the signature.
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
    input.sift_bytes_preserve_newlines()
}
167
/// Reports whether `b` is one of the ASCII bytes that gets a backslash escape:
/// `<`, `>`, `*`, `\`, `_` or `~`.
#[inline]
const fn needs_escape(b: u8) -> bool {
    match b {
        b'*' | b'<' | b'>' | b'\\' | b'_' | b'~' => true,
        _ => false,
    }
}
173
174#[inline]
176const fn is_special_byte(b: u8) -> bool {
177 needs_escape(b) || b == b'&'
178}
179
180#[inline]
183pub fn contains_markdown_chars(input: &str) -> bool {
184 input.as_bytes().iter().any(|&b| is_special_byte(b))
185}
186
187#[inline]
191fn decode_html_entity(bytes: &[u8]) -> Option<(&'static str, usize)> {
192 debug_assert_eq!(bytes[0], b'&');
193
194 let limit = bytes.len().min(12);
196 let semi = bytes[1..limit].iter().position(|&b| b == b';')?;
197 let entity = &bytes[1..semi + 1]; let consumed = semi + 2; match entity {
201 b"amp" => Some(("&", consumed)),
202 b"lt" => Some(("\\<", consumed)),
203 b"gt" => Some(("\\>", consumed)),
204 b"quot" => Some(("\"", consumed)),
205 b"apos" => Some(("'", consumed)),
206 b"nbsp" => Some(("", consumed)), _ if entity.first() == Some(&b'#') => decode_numeric_entity(entity, consumed),
208 _ => None,
209 }
210}
211
/// Decodes a numeric character reference.
///
/// `entity` is the text between `&` and `;`, starting with `#` (for example
/// `#38` or `#x26`); `consumed` is passed through unchanged into the result.
/// A second byte of `x` or `X` selects hexadecimal, otherwise the digits are
/// read as decimal.
///
/// Returns `None` when there are no digits, when a byte is not a valid digit
/// for the radix, when the value overflows `u32`, or when the code point is
/// not one of the handful this function maps to a replacement string.
#[inline]
fn decode_numeric_entity(entity: &[u8], consumed: usize) -> Option<(&'static str, usize)> {
    let is_hex = matches!(entity.get(1), Some(b'x') | Some(b'X'));
    let (digits, radix): (&[u8], u32) = if is_hex {
        (&entity[2..], 16)
    } else {
        (&entity[1..], 10)
    };

    if digits.is_empty() {
        return None;
    }

    // Accumulate the value digit by digit, bailing out on a bad digit or on
    // `u32` overflow.
    let code_point = digits.iter().try_fold(0u32, |acc, &b| {
        let digit = (b as char).to_digit(radix)?;
        acc.checked_mul(radix)?.checked_add(digit)
    })?;

    let replacement = match code_point {
        0x26 => "&",
        0x3C => "\\<",
        0x3E => "\\>",
        0x22 => "\"",
        0x27 => "'",
        0xA0 => "",
        0x2014 => "\u{2014}",
        0x2013 => "\u{2013}",
        0x2018 => "\u{2018}",
        0x2019 => "\u{2019}",
        0x201C => "\u{201c}",
        0x201D => "\u{201d}",
        _ => return None,
    };

    Some((replacement, consumed))
}
254
255#[inline]
259pub fn replace_markdown_chars_opt(input: &str) -> Option<String> {
260 let bytes = input.as_bytes();
261
262 let first_special = bytes.iter().position(|&b| is_special_byte(b));
264
265 match first_special {
266 None => None,
267 Some(first_pos) => {
268 let mut output = String::with_capacity(input.len() + input.len() / 8);
270
271 output.push_str(&input[..first_pos]);
273
274 let mut i = first_pos;
276 while i < bytes.len() {
277 let b = bytes[i];
278
279 if needs_escape(b) {
280 output.push('\\');
281 output.push(b as char);
282 i += 1;
283 } else if b == b'&' {
284 if let Some((decoded, len)) = decode_html_entity(&bytes[i..]) {
286 output.push_str(decoded);
287 i += len;
288 } else {
289 output.push('&');
290 i += 1;
291 }
292 } else {
293 let segment_start = i;
295 i += 1;
296 while i < bytes.len() && !is_special_byte(bytes[i]) {
297 i += 1;
298 }
299 output.push_str(&input[segment_start..i]);
300 }
301 }
302
303 Some(output)
304 }
305 }
306}
307
308#[inline]
311pub fn replace_markdown_chars(input: &str) -> String {
312 replace_markdown_chars_opt(input).unwrap_or_else(|| input.to_string())
313}