html2md/
lib.rs

1use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
2use lazy_static::lazy_static;
3use regex::Regex;
4
5// we want to just use the rewriter instead for v0.1.
6pub mod extended;
7
8#[cfg(feature = "scraper")]
9pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
10
11#[cfg(feature = "rewriter")]
12pub mod rewriter;
13#[cfg(feature = "scraper")]
14pub mod scraper;
15#[cfg(feature = "scraper")]
16pub use scraper::{
17    ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
18    parse_html_extended,
19};
20
21lazy_static! {
22    static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); // for Markdown escaping
23    static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
24        r"[<>*\\_~]",  // Matches any single markdown character
25        r"&nbsp;"      // Matches the entire "&nbsp;" string
26    ]).expect("valid regex set");
27}
28
/// Convert HTML to Markdown through the rewriter backend.
///
/// Intended to become the main function of this library; still an incomplete
/// work in progress aimed at major performance increases.
///
/// # Arguments
/// * `html` - source HTML as a string slice
/// * `commonmark` - forwarded to the converter to adjust the output to CommonMark
///
/// # Returns
/// The converted Markdown, or an empty `String` when the converter does not
/// produce a value.
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    // No custom set and no URL are supplied by this entry point.
    let custom = None;
    let url = None;

    rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, &url)
        .unwrap_or_default()
}
37
/// Async streaming counterpart of the main rewrite function.
///
/// Converts the incoming HTML to Markdown through the rewriter backend's
/// `convert_html_to_markdown_send`. Still an incomplete work in progress aimed
/// at major performance increases.
///
/// # Arguments
/// * `html` - source HTML as a string slice
/// * `commonmark` - forwarded to the converter to adjust the output to CommonMark
///
/// # Returns
/// The converted Markdown, or an empty `String` when the converter does not
/// produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    // No custom set and no URL are supplied by this entry point.
    let custom = None;
    let url = None;

    rewriter::writer::convert_html_to_markdown_send(html, &custom, commonmark, &url)
        .await
        .unwrap_or_default()
}
48
/// Custom variant of the rewrite function.
///
/// You can also override standard tag handlers this way.
///
/// # Arguments
/// * `html` - source HTML as a string slice
/// * `custom` - custom tag handler producers for the tags you want; may be `None`
/// * `commonmark` - adjusts the Markdown output to CommonMark
/// * `url` - used to provide absolute URL handling; may be `None`
///
/// # Returns
/// The converted Markdown, or an empty `String` when the converter does not
/// produce a value.
// NOTE(review): this function is synchronous and only calls
// `convert_html_to_markdown`, which `rewrite_html` uses with the `rewriter`
// feature alone; the additional `stream` requirement may be unintentional —
// confirm before relaxing the gate.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference, so it is forwarded as-is rather than
    // being borrowed a second time.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
66
/// Custom variant of the rewrite function, async with a caller-chosen chunk size.
///
/// You can also override standard tag handlers this way.
///
/// # Arguments
/// * `html` - source HTML as a string slice
/// * `custom` - custom tag handler producers for the tags you want; may be `None`
/// * `commonmark` - adjusts the Markdown output to CommonMark
/// * `url` - used to provide absolute URL handling; may be `None`
/// * `chunk_size` - the chunk size to use, forwarded to the converter
///
/// # Returns
/// The converted Markdown, or an empty `String` when the converter does not
/// produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference, so it is forwarded as-is rather than
    // being borrowed a second time.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
90
/// Custom variant of the rewrite function, streaming async.
///
/// You can also override standard tag handlers this way.
///
/// # Arguments
/// * `html` - source HTML as a string slice
/// * `custom` - custom tag handler producers for the tags you want; may be `None`
/// * `commonmark` - adjusts the Markdown output to CommonMark
/// * `url` - used to provide absolute URL handling; may be `None`
///
/// # Returns
/// The converted Markdown, or an empty `String` when the converter does not
/// produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference, so it is forwarded as-is rather than
    // being borrowed a second time.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
110
111/// Called after all processing has been finished
112///
113/// Clears excessive punctuation that would be trimmed by renderer anyway
114pub fn clean_markdown(input: &str) -> String {
115    input.sift()
116}
117
118/// Called after all processing has been finished
119///
120/// Clears excessive punctuation that would be trimmed by renderer anyway
121pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
122    input.sift_bytes()
123}
124
125/// Replace the markdown chars cleanly.
126pub fn replace_markdown_chars(input: &str) -> String {
127    use crate::MARKDOWN_MIDDLE_KEYCHARS_SET;
128
129    if !MARKDOWN_MIDDLE_KEYCHARS_SET.is_match(input) {
130        return input.to_string();
131    }
132
133    let mut output = String::new();
134    let mut chars = input.chars().peekable();
135
136    while let Some(ch) = chars.next() {
137        if ch == '&' {
138            let mut entity = String::new();
139            entity.push(ch);
140            while let Some(&next_ch) = chars.peek() {
141                entity.push(next_ch);
142                chars.next();
143                if entity == "&nbsp;" {
144                    entity.clear(); // discard &nbsp;
145                    break;
146                } else if next_ch == ';' || entity.len() > 6 {
147                    output.push_str(&entity);
148                    break;
149                }
150            }
151            if !entity.is_empty() {
152                output.push_str(&entity);
153            }
154        } else if "<>*\\_~".contains(ch) {
155            output.push('\\');
156            output.push(ch);
157        } else {
158            output.push(ch);
159        }
160    }
161
162    output
163}