1use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
2use lazy_static::lazy_static;
3use regex::Regex;
4
5pub mod extended;
7
8#[cfg(feature = "scraper")]
9pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
10
11#[cfg(feature = "rewriter")]
12pub mod rewriter;
13#[cfg(feature = "scraper")]
14pub mod scraper;
15#[cfg(feature = "scraper")]
16pub use scraper::{
17    ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
18    parse_html_extended,
19};
20
21lazy_static! {
22    static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
24        r"[<>*\\_~]",  r" "      ]).expect("valid regex set");
27}
28
/// Converts an HTML string to markdown using the rewriter backend.
///
/// Calls `rewriter::writer::convert_html_to_markdown` with `None` for both the
/// custom set and the URL.
///
/// * `html` - the HTML source text.
/// * `commonmark` - forwarded unchanged to the converter.
///
/// Returns the converted text, or an empty `String` when the converter does
/// not produce a value.
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    let converted = rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None);

    converted.unwrap_or_default()
}
37
/// Asynchronous counterpart of the plain HTML-to-markdown conversion.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send` with `None` for
/// both the custom set and the URL.
///
/// * `html` - the HTML source text.
/// * `commonmark` - forwarded unchanged to the converter.
///
/// Returns the converted text, or an empty `String` when the converter does
/// not produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    let conversion =
        rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None);

    conversion.await.unwrap_or_default()
}
48
/// Converts an HTML string to markdown with a caller-supplied custom set and URL.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings forwarded to the converter
///   (presumably tag names that get special handling; confirm in
///   `rewriter::writer`).
/// * `commonmark` - forwarded unchanged to the converter.
/// * `url` - optional URL forwarded to the converter (presumably the base for
///   resolving relative links; confirm in `rewriter::writer`).
///
/// Returns the converted text, or an empty `String` when the converter does
/// not produce a value.
// NOTE(review): this function is synchronous yet is gated on the `stream`
// feature as well as `rewriter`; confirm whether the extra gate is intended.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference, so it is passed through without re-borrowing.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
66
/// Asynchronously converts an HTML string to markdown with a caller-supplied
/// custom set, URL and chunk size.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings forwarded to the converter
///   (presumably tag names that get special handling; confirm in
///   `rewriter::writer`).
/// * `commonmark` - forwarded unchanged to the converter.
/// * `url` - optional URL forwarded to the converter (presumably the base for
///   resolving relative links; confirm in `rewriter::writer`).
/// * `chunk_size` - forwarded unchanged to the converter (presumably the size
///   of the pieces the input is processed in; confirm in `rewriter::writer`).
///
/// Returns the converted text, or an empty `String` when the converter does
/// not produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference, so it is passed through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
90
/// Asynchronously converts an HTML string to markdown with a caller-supplied
/// custom set and URL.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings forwarded to the converter
///   (presumably tag names that get special handling; confirm in
///   `rewriter::writer`).
/// * `commonmark` - forwarded unchanged to the converter.
/// * `url` - optional URL forwarded to the converter (presumably the base for
///   resolving relative links; confirm in `rewriter::writer`).
///
/// Returns the converted text, or an empty `String` when the converter does
/// not produce a value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference, so it is passed through without re-borrowing.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
110
/// Returns a cleaned copy of `input`.
///
/// The work is done entirely by the `sift` method that the sifter traits
/// imported at the top of this file provide for `&str`; presumably it
/// collapses redundant whitespace — see `extended::sifter` for the exact rules.
pub fn clean_markdown(input: &str) -> String {
    input.sift()
}
117
/// Returns a cleaned `String` built from the raw bytes in `input`.
///
/// The work is done entirely by the `sift_bytes` method that the sifter traits
/// imported at the top of this file provide; presumably it is the byte-oriented
/// twin of `sift` — see `extended::sifter` for the exact rules.
///
/// NOTE(review): `&[u8]` would be a more general parameter type than
/// `&Vec<u8>`; confirm which types implement the sifter trait before changing
/// the signature.
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
    input.sift_bytes()
}
124
/// Escapes markdown control characters in `input` while leaving HTML entities intact.
///
/// * Each of `<`, `>`, `*`, `\`, `_` and `~` is prefixed with a backslash.
/// * A well-formed HTML entity (`&`, then one or more ASCII letters, digits or
///   `#`, then `;`) is copied through verbatim, exactly once.
/// * The `&nbsp;` entity is dropped from the output.
/// * A `&` that does not start a well-formed entity is copied as-is and the
///   text after it is processed normally, so markdown characters following a
///   bare ampersand are still escaped.
///
/// Returns a new `String`; input containing none of the characters above is
/// returned unchanged.
pub fn replace_markdown_chars(input: &str) -> String {
    const ESCAPED: &str = "<>*\\_~";

    // Fast path: without `&` or a control character the loop below would
    // reproduce the input byte for byte.
    if !input.contains(|c: char| c == '&' || ESCAPED.contains(c)) {
        return input.to_string();
    }

    let mut output = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(ch) = rest.chars().next() {
        if ch == '&' {
            if let Some(len) = html_entity_len(rest) {
                let (entity, tail) = rest.split_at(len);
                // `&nbsp;` is discarded; every other entity is kept untouched.
                if entity != "&nbsp;" {
                    output.push_str(entity);
                }
                rest = tail;
                continue;
            }
        } else if ESCAPED.contains(ch) {
            output.push('\\');
        }

        output.push(ch);
        rest = &rest[ch.len_utf8()..];
    }

    output
}

/// Returns the byte length of the HTML entity at the start of `text`, if any.
///
/// An entity is `&`, one or more ASCII letters, digits or `#`, and a closing
/// `;`. The scan stops at the first byte that cannot belong to an entity, so
/// nothing is consumed when the text merely starts with a bare ampersand.
fn html_entity_len(text: &str) -> Option<usize> {
    let body = text.strip_prefix('&')?;

    for (idx, byte) in body.bytes().enumerate() {
        match byte {
            // `idx` bytes of name, plus the leading `&` and the closing `;`.
            b';' if idx > 0 => return Some(idx + 2),
            b'#' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => {}
            _ => return None,
        }
    }

    None
}