1use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes};
2use lazy_static::lazy_static;
3use regex::Regex;
4
5pub mod extended;
7
8#[cfg(feature = "scraper")]
9pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
10
11#[cfg(feature = "rewriter")]
12pub mod rewriter;
13#[cfg(feature = "scraper")]
14pub mod scraper;
15#[cfg(feature = "scraper")]
16pub use scraper::{
17 ignore, parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
18 parse_html_extended,
19};
20
21lazy_static! {
22 static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").expect("valid regex pattern"); static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
24 r"[<>*\\_~]", r" " ]).expect("valid regex set");
27}
28
/// Converts an HTML string to markdown through the `rewriter` backend.
///
/// Forwards `html` and `commonmark` to
/// `rewriter::writer::convert_html_to_markdown`, passing `None` for both of
/// its optional arguments. When the conversion does not yield a value, the
/// default (empty) `String` is returned instead.
#[cfg(feature = "rewriter")]
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    let converted = rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None);

    converted.unwrap_or_default()
}
37
/// Async counterpart of the plain HTML-to-markdown conversion.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send` with `html` and
/// `commonmark`, passing `None` for both of its optional arguments. When the
/// conversion does not yield a value, the default (empty) `String` is
/// returned instead.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None).await;

    converted.unwrap_or_default()
}
48
/// Converts an HTML string to markdown, forwarding caller-supplied options.
///
/// Calls `rewriter::writer::convert_html_to_markdown` with all four arguments
/// passed through unchanged.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings handed to the converter
///   (presumably tag names to treat specially — confirm against
///   `rewriter::writer`).
/// * `commonmark` - flag handed to the converter.
/// * `url` - optional URL handed to the converter (presumably the base used
///   to resolve relative links — confirm against `rewriter::writer`).
///
/// Returns the default (empty) `String` when the conversion does not yield a
/// value.
///
/// NOTE(review): this function is synchronous but is only compiled when the
/// `stream` feature is also enabled — confirm the gate is intentional.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it as-is rather than re-borrowing.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
66
/// Asynchronously converts HTML to markdown with caller-supplied options and
/// an explicit chunk size.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send_with_size` with
/// all five arguments passed through unchanged.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings handed to the converter
///   (presumably tag names to treat specially — confirm against
///   `rewriter::writer`).
/// * `commonmark` - flag handed to the converter.
/// * `url` - optional URL handed to the converter (presumably the base used
///   to resolve relative links — confirm against `rewriter::writer`).
/// * `chunk_size` - size value handed to the converter (presumably the size
///   of the pieces the input is fed in — confirm against `rewriter::writer`).
///
/// Returns the default (empty) `String` when the conversion does not yield a
/// value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference; pass it as-is rather than re-borrowing.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
90
/// Asynchronously converts HTML to markdown with caller-supplied options.
///
/// Awaits `rewriter::writer::convert_html_to_markdown_send` with all four
/// arguments passed through unchanged.
///
/// * `html` - the HTML source text.
/// * `custom` - optional set of strings handed to the converter
///   (presumably tag names to treat specially — confirm against
///   `rewriter::writer`).
/// * `commonmark` - flag handed to the converter.
/// * `url` - optional URL handed to the converter (presumably the base used
///   to resolve relative links — confirm against `rewriter::writer`).
///
/// Returns the default (empty) `String` when the conversion does not yield a
/// value.
#[cfg(all(feature = "stream", feature = "rewriter"))]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<std::collections::HashSet<String>>,
    commonmark: bool,
    url: &Option<url::Url>,
) -> String {
    // `custom` is already a reference; pass it as-is rather than re-borrowing.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
110
111pub fn clean_markdown(input: &str) -> String {
115 input.sift()
116}
117
118pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
122 input.sift_bytes()
123}
124
/// Escapes markdown-significant characters in `input` and strips `&nbsp;`
/// entities.
///
/// * Each of `<`, `>`, `*`, `\`, `_` and `~` is prefixed with a backslash.
/// * Every literal `&nbsp;` sequence is removed from the output.
/// * All other text — including other HTML entities such as `&amp;` — is
///   copied through exactly once, unchanged.
///
/// Returns an owned copy of `input` when nothing needs rewriting.
pub fn replace_markdown_chars(input: &str) -> String {
    /// Entity text that is dropped from the output.
    const NBSP_ENTITY: &str = "&nbsp;";

    /// Reports whether `ch` must be backslash-escaped.
    fn needs_escape(ch: char) -> bool {
        matches!(ch, '<' | '>' | '*' | '\\' | '_' | '~')
    }

    // Fast path: plain `str` searches decide whether any rewriting is needed,
    // so the check always agrees with what the loop below actually rewrites.
    if !input.contains(NBSP_ENTITY) && !input.contains(needs_escape) {
        return input.to_string();
    }

    let mut output = String::with_capacity(input.len());

    // Splitting on the entity drops every `&nbsp;` occurrence. A `&` that
    // does not start that entity stays inside a segment and is treated as
    // ordinary text, so it is emitted once and the characters following it
    // are still escaped.
    for segment in input.split(NBSP_ENTITY) {
        for ch in segment.chars() {
            if needs_escape(ch) {
                output.push('\\');
            }
            output.push(ch);
        }
    }

    output
}