Skip to main content

html2md/
lib.rs

1use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
2
3// we want to just use the rewriter instead for v0.1.
4pub mod extended;
5
6#[cfg(feature = "scraper")]
7pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
8
9#[cfg(feature = "rewriter")]
10pub mod rewriter;
11#[cfg(feature = "scraper")]
12pub mod scraper;
13#[cfg(feature = "scraper")]
14pub use scraper::{
15    ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
16    parse_html_extended,
17};
18
// The regex below is compiled only when the `scraper` feature is enabled.
#[cfg(feature = "scraper")]
lazy_static::lazy_static! {
    /// Matches a single occurrence of any of the characters `<`, `>`, `*`, `\`, `_` or `~`.
    pub(crate) static ref MARKDOWN_MIDDLE_KEYCHARS: regex::Regex = {
        // Raw string: `\\` inside the character class is one literal backslash.
        let pattern = r"[<>*\\_~]";
        regex::Regex::new(pattern).expect("valid regex pattern")
    };
}
25
/// Convert HTML to Markdown with the rewriter backend.
///
/// Calls `rewriter::writer::convert_html_to_markdown` without a custom tag set and without
/// a URL. When the converter yields no value, an empty `String` is returned instead.
///
/// # Arguments
/// - `html` is the source HTML as a string slice
/// - `commonmark` is for adjusting markdown output to commonmark
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    // Named placeholders for the two optional inputs this entry point never supplies.
    let no_custom_tags = None;
    let no_url = None;

    rewriter::writer::convert_html_to_markdown(html, &no_custom_tags, commonmark, &no_url)
        .unwrap_or_default()
}
34
/// Async counterpart of the plain HTML to Markdown conversion.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send` without a custom tag set and
/// without a URL. When the converter yields no value, an empty `String` is returned instead.
///
/// # Arguments
/// - `html` is the source HTML as a string slice
/// - `commonmark` is for adjusting markdown output to commonmark
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    // Named placeholders for the two optional inputs this entry point never supplies.
    let no_custom_tags = None;
    let no_url = None;

    rewriter::writer::convert_html_to_markdown_send(html, &no_custom_tags, commonmark, &no_url)
        .await
        .unwrap_or_default()
}
45
/// Custom variant of the synchronous rewrite function.
///
/// Forwards every argument to `rewriter::writer::convert_html_to_markdown`. When the
/// converter yields no value, an empty `String` is returned instead.
///
/// # Arguments
/// - `html` is the source HTML as a string slice
/// - `custom` is the optional set of custom tag handler entries for the tags you want;
///   it can be `None` and is passed to the converter unchanged
/// - `commonmark` is for adjusting markdown output to commonmark
/// - `url` is used to provide absolute url handling
// Gated on `rewriter` only: the body uses just the synchronous converter, which is already
// available without the `stream` feature.
#[cfg(feature = "rewriter")]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
63
/// Custom variant of the async rewrite function with an explicit chunk size.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send_with_size`, forwarding every
/// argument. When the converter yields no value, an empty `String` is returned instead.
///
/// # Arguments
/// - `html` is the source HTML as a string slice
/// - `custom` is the optional set of custom tag handler entries for the tags you want;
///   it can be `None` and is passed to the converter unchanged
/// - `commonmark` is for adjusting markdown output to commonmark
/// - `url` is used to provide absolute url handling
/// - `chunk_size` is the chunk size handed to the converter
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
87
/// Custom variant of the async rewrite function.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send`, forwarding every argument.
/// When the converter yields no value, an empty `String` is returned instead.
///
/// # Arguments
/// - `html` is the source HTML as a string slice
/// - `custom` is the optional set of custom tag handler entries for the tags you want;
///   it can be `None` and is passed to the converter unchanged
/// - `commonmark` is for adjusting markdown output to commonmark
/// - `url` is used to provide absolute url handling
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
107
108/// Called after all processing has been finished
109///
110/// Clears excessive punctuation that would be trimmed by renderer anyway
111pub fn clean_markdown(input: &str) -> String {
112    input.sift_preserve_newlines()
113}
114
115/// Called after all processing has been finished
116///
117/// Clears excessive punctuation that would be trimmed by renderer anyway
118pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
119    input.sift_bytes_preserve_newlines()
120}
121
/// Tell whether `b` is one of the bytes that gets a backslash escape:
/// `<`, `>`, `*`, `\`, `_` or `~`.
#[inline]
const fn needs_escape(b: u8) -> bool {
    match b {
        b'<' | b'>' => true,
        b'*' | b'_' | b'~' => true,
        b'\\' => true,
        _ => false,
    }
}
127
128/// Check if byte could start a special sequence (escape char or &nbsp;).
129#[inline]
130const fn is_special_byte(b: u8) -> bool {
131    needs_escape(b) || b == b'&'
132}
133
134/// Check if a string contains any characters that need markdown escaping.
135/// Use this to avoid allocation when no escaping is needed.
136#[inline]
137pub fn contains_markdown_chars(input: &str) -> bool {
138    input.as_bytes().iter().any(|&b| is_special_byte(b))
139}
140
141/// Replace the markdown chars cleanly.
142/// Optimized to scan bytes and process in bulk segments.
143/// Returns None if no changes needed (avoids allocation).
144#[inline]
145pub fn replace_markdown_chars_opt(input: &str) -> Option<String> {
146    let bytes = input.as_bytes();
147
148    // Fast path: scan for any special character
149    let first_special = bytes.iter().position(|&b| is_special_byte(b));
150
151    match first_special {
152        None => None,
153        Some(first_pos) => {
154            // Pre-allocate with some headroom for escapes
155            let mut output = String::with_capacity(input.len() + input.len() / 8);
156
157            // Copy everything before first special char
158            output.push_str(&input[..first_pos]);
159
160            // Process the rest byte-by-byte from first_pos
161            let mut i = first_pos;
162            while i < bytes.len() {
163                let b = bytes[i];
164
165                if needs_escape(b) {
166                    output.push('\\');
167                    output.push(b as char);
168                    i += 1;
169                } else if b == b'&' {
170                    // Check for &nbsp; (6 bytes)
171                    if i + 5 < bytes.len() && &bytes[i..i + 6] == b"&nbsp;" {
172                        // Skip &nbsp; entirely
173                        i += 6;
174                    } else {
175                        output.push('&');
176                        i += 1;
177                    }
178                } else {
179                    // Find the next special character and copy the segment
180                    let segment_start = i;
181                    i += 1;
182                    while i < bytes.len() && !is_special_byte(bytes[i]) {
183                        i += 1;
184                    }
185                    output.push_str(&input[segment_start..i]);
186                }
187            }
188
189            Some(output)
190        }
191    }
192}
193
194/// Replace the markdown chars cleanly.
195/// Optimized to scan bytes and process in bulk segments.
196#[inline]
197pub fn replace_markdown_chars(input: &str) -> String {
198    replace_markdown_chars_opt(input).unwrap_or_else(|| input.to_string())
199}