html2md/
lib.rs

1use std::boxed::Box;
2use std::collections::HashMap;
3use std::sync::LazyLock;
4
5use std::ffi::{CStr, CString};
6use std::os::raw::c_char;
7
8use regex::Regex;
9
10use html5ever::driver::ParseOpts;
11use html5ever::parse_document;
12use html5ever::tendril::TendrilSink;
13
14pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
15
16pub mod anchors;
17pub mod codes;
18pub mod common;
19pub mod containers;
20pub mod dummy;
21pub mod headers;
22pub mod iframes;
23pub mod images;
24pub mod lists;
25pub mod paragraphs;
26pub mod quotes;
27pub mod styles;
28pub mod tables;
29
30use crate::anchors::AnchorHandler;
31use crate::codes::CodeHandler;
32use crate::containers::ContainerHandler;
33use crate::dummy::DummyHandler;
34use crate::dummy::HtmlCherryPickHandler;
35use crate::dummy::IdentityHandler;
36use crate::headers::HeaderHandler;
37use crate::iframes::IframeHandler;
38use crate::images::ImgHandler;
39use crate::lists::ListHandler;
40use crate::lists::ListItemHandler;
41use crate::paragraphs::ParagraphHandler;
42use crate::quotes::QuoteHandler;
43use crate::styles::StyleHandler;
44use crate::tables::TableHandler;
45
/// Matches a run of two or more whitespace characters.
/// Used while walking the DOM to collapse whitespace inside HTML text nodes.
static EXCESSIVE_WHITESPACE_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new("\\s{2,}").unwrap());
/// Matches (in multi-line mode) a line that consists solely of one or more spaces.
/// Used in Markdown post-processing.
static EMPTY_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?m)^ +$").unwrap());
/// Matches a run of three or more consecutive newlines.
/// Used in Markdown post-processing.
static EXCESSIVE_NEWLINE_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new("\\n{3,}").unwrap());
/// Matches (in multi-line mode) a single space at the end of a line, together with the
/// non-whitespace character right before it (capture group 1).
/// A line ending in two or more spaces does not match.
/// Used in Markdown post-processing.
static TRAILING_SPACE_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new("(?m)(\\S) $").unwrap());
/// Matches one or more newlines at the very beginning of the text.
/// Used in Markdown post-processing.
static LEADING_NEWLINES_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("^\\n+").unwrap());
/// Matches any whitespace run at the very end of the text.
/// Used in Markdown post-processing.
static LAST_WHITESPACE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("\\s+$").unwrap());
/// Matches when the text ends right at the start of a line, i.e. it is empty or only
/// spaces follow its beginning or its last newline.
/// Used by the Markdown escaping to detect that the output is at a line start.
static START_OF_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(^|\\n) *$").unwrap());
/// Matches optional leading whitespace (group 1) followed by one of `=`, `>`, `+`, `-`, `#`
/// (group 2) at the beginning of the text.
/// Used by the Markdown escaping for characters that are special only at a line start.
static MARKDOWN_STARTONLY_KEYCHARS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)([=>+\-#])").unwrap());
/// Matches any single one of `<`, `>`, `*`, `\`, `_`, `~`.
/// Used by the Markdown escaping for characters that are escaped wherever they occur.
static MARKDOWN_MIDDLE_KEYCHARS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[<>*\\_~]").unwrap());
60
61/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
62/// in order to register custom tag hadler for tags you want.
63///
64/// You can also override standard tag handlers this way
65/// # Arguments
66/// `html` is source HTML as `String`
67/// `custom` is custom tag hadler producers for tags you want, can be empty
68pub fn parse_html_custom(
69    html: &str,
70    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
71) -> String {
72    let dom = parse_document(RcDom::default(), ParseOpts::default())
73        .from_utf8()
74        .read_from(&mut html.as_bytes())
75        .unwrap();
76    let mut result = StructuredPrinter::default();
77    walk(&dom.document, &mut result, custom);
78
79    clean_markdown(&result.data)
80}
81
82/// Main function of this library. Parses incoming HTML, converts it into Markdown
83/// and returns converted string.
84/// # Arguments
85/// `html` is source HTML as `String`
86pub fn parse_html(html: &str) -> String {
87    parse_html_custom(html, &HashMap::default())
88}
89
90/// Same as `parse_html` but retains all "span" html elements intact
91/// Markdown parsers usually strip them down when rendering but they
92/// may be useful for later processing
93pub fn parse_html_extended(html: &str) -> String {
94    struct SpanAsIsTagFactory;
95    impl TagHandlerFactory for SpanAsIsTagFactory {
96        fn instantiate(&self) -> Box<dyn TagHandler> {
97            Box::new(HtmlCherryPickHandler::default())
98        }
99    }
100
101    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
102    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
103    parse_html_custom(html, &tag_factory)
104}
105
106/// Recursively walk through all DOM tree and handle all elements according to
107/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
108///
109/// # Arguments
110/// `input` is DOM tree or its subtree
111/// `result` is output holder with position and context tracking
112/// `custom` is custom tag hadler producers for tags you want, can be empty
113fn walk(
114    input: &Handle,
115    result: &mut StructuredPrinter,
116    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
117) {
118    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
119    let mut tag_name = String::default();
120    match input.data {
121        NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
122        NodeData::Text { ref contents } => {
123            let mut text = contents.borrow().to_string();
124            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
125            if inside_pre {
126                // this is preformatted text, insert as-is
127                result.append_str(&text);
128            } else if !(text.trim().is_empty()
129                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
130            {
131                // in case it's not just a whitespace after the newline or another whitespace
132
133                // regular text, collapse whitespace and newlines in text
134                let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
135                if !inside_code {
136                    text = escape_markdown(result, &text);
137                }
138                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
139                let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
140                result.append_str(minified_text);
141            }
142        }
143        NodeData::Comment { .. } => {} // ignore comments
144        NodeData::Element { ref name, .. } => {
145            tag_name = name.local.to_string();
146            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
147            if inside_pre {
148                // don't add any html tags inside the pre section
149                handler = Box::new(DummyHandler);
150            } else if custom.contains_key(&tag_name) {
151                // have user-supplied factory, instantiate a handler for this tag
152                let factory = custom.get(&tag_name).unwrap();
153                handler = factory.instantiate();
154            } else {
155                // no user-supplied factory, take one of built-in ones
156                handler = match tag_name.as_ref() {
157                    // containers
158                    "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
159                    // pagination, breaks
160                    "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
161                    "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
162                    // spoiler tag
163                    "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
164                    // formatting
165                    "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
166                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
167                    "pre" | "code" => Box::new(CodeHandler::default()),
168                    // images, links
169                    "img" => Box::new(ImgHandler::default()),
170                    "a" => Box::new(AnchorHandler::default()),
171                    // lists
172                    "ol" | "ul" | "menu" => Box::new(ListHandler),
173                    "li" => Box::new(ListItemHandler::default()),
174                    // as-is
175                    "sub" | "sup" => Box::new(IdentityHandler),
176                    // tables, handled fully internally as markdown can't have nested content in tables
177                    // supports only single tables as of now
178                    "table" => Box::new(TableHandler),
179                    "iframe" => Box::new(IframeHandler),
180                    // other
181                    "html" | "head" | "body" => Box::new(DummyHandler),
182                    _ => Box::new(DummyHandler),
183                };
184            }
185        }
186    }
187
188    // handle this tag, while it's not in parent chain
189    // and doesn't have child siblings
190    handler.handle(input, result);
191
192    // save this tag name as parent for child nodes
193    result.parent_chain.push(tag_name.to_string()); // e.g. it was ["body"] and now it's ["body", "p"]
194    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
195
196    // create space for siblings of next level
197    result.siblings.insert(current_depth, vec![]);
198
199    for child in input.children.borrow().iter() {
200        if handler.skip_descendants() {
201            continue;
202        }
203
204        walk(child, result, custom);
205
206        if let NodeData::Element { ref name, .. } = child.data {
207            result
208                .siblings
209                .get_mut(&current_depth)
210                .unwrap()
211                .push(name.local.to_string())
212        };
213    }
214
215    // clear siblings of next level
216    result.siblings.remove(&current_depth);
217
218    // release parent tag
219    result.parent_chain.pop();
220
221    // finish handling of tag - parent chain now doesn't contain this tag itself again
222    handler.after_handle(result);
223}
224
225/// This conversion should only be applied to text tags
226///
227/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
228/// like list start or bold text style
229fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
230    // always escape bold/italic/strikethrough
231    let mut data = MARKDOWN_MIDDLE_KEYCHARS
232        .replace_all(text, "\\$0")
233        .to_string();
234
235    // if we're at the start of the line we need to escape list- and quote-starting sequences
236    if START_OF_LINE_PATTERN.is_match(&result.data) {
237        data = MARKDOWN_STARTONLY_KEYCHARS
238            .replace(&data, "$1\\$2")
239            .to_string();
240    }
241
242    // no handling of more complicated cases such as
243    // ![] or []() ones, for now this will suffice
244    data
245}
246
247/// Called after all processing has been finished
248///
249/// Clears excessive punctuation that would be trimmed by renderer anyway
250fn clean_markdown(text: &str) -> String {
251    // remove redundant newlines
252    let intermediate = EMPTY_LINE_PATTERN.replace_all(text, ""); // empty line with trailing spaces, replace with just newline
253    let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n"); // > 3 newlines - not handled by markdown anyway
254    let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1"); // trim space if it's just one
255    let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, ""); // trim leading newlines
256    let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, ""); // trim last newlines
257
258    intermediate.into_owned()
259}
260
/// Intermediate result of the HTML -> Markdown conversion.
///
/// Carries the conversion context (chain of parent tags, visited siblings)
/// together with the Markdown text produced so far.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Names of the tags enclosing the node being processed, outermost first
    pub parent_chain: Vec<String>,

    /// Names of already visited child elements in document order,
    /// keyed by the length of the parent chain at their level
    pub siblings: HashMap<usize, Vec<String>>,

    /// Markdown document produced so far
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single newline to the end of the document
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends the given string to the end of the document
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts the given string at byte offset `pos` of the document
    ///
    /// # Panics
    /// Panics if `pos` is past the end of the document or not on a `char` boundary.
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        String::insert_str(&mut self.data, pos, it);
    }
}
293
/// Tag handler factory. Required to give users of this library
/// proper custom tag handling capabilities.
///
/// Tag handlers cannot be supplied directly because they are not stateless:
/// while a handler processes a tag it holds data such as start position, indent etc.
/// The only way to get a fresh tag handler for each tag is to provide a factory like this one.
pub trait TagHandlerFactory {
    /// Creates a new handler instance; called once for every matching tag encountered.
    fn instantiate(&self) -> Box<dyn TagHandler>;
}
304
/// Trait interface describing an abstract handler of an arbitrary HTML tag.
pub trait TagHandler {
    /// Handles a tag encountered while walking the HTML tree.
    /// Executed before the children of the tag are processed,
    /// while the tag itself is not yet part of the printer's parent chain.
    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);

    /// Executed after all children of this tag have been processed
    /// and the tag has been removed from the printer's parent chain again.
    fn after_handle(&mut self, printer: &mut StructuredPrinter);

    /// Tells the tree walker whether the children of this tag should be left unvisited.
    /// Returns `false` by default, i.e. children are walked.
    fn skip_descendants(&self) -> bool {
        false
    }
}
318
319/// FFI variant for HTML -> Markdown conversion for calling from other languages
320#[no_mangle]
321#[allow(clippy::not_unsafe_ptr_arg_deref)]
322pub extern "C" fn parse(html: *const c_char) -> *const c_char {
323    let in_html = unsafe { CStr::from_ptr(html) };
324    let out_md = parse_html(&in_html.to_string_lossy());
325
326    CString::new(out_md).unwrap().into_raw()
327}
328
/// JNI interface exposing the converter to Android applications
#[cfg(target_os = "android")]
#[allow(non_snake_case)]
pub mod android {
    extern crate jni;

    use super::parse_html;
    use super::parse_html_extended;

    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;
    use self::jni::JNIEnv;

    /// Shared body of the JNI entry points: reads the Java string, runs
    /// `converter` on it and wraps the outcome into a new Java string.
    ///
    /// Panics if the Java string can't be read or the result string can't be created.
    fn convert_with(env: JNIEnv, html: JString, converter: fn(&str) -> String) -> jstring {
        let source: String = env
            .get_string(html)
            .expect("Couldn't get java string!")
            .into();
        env.new_string(converter(&source))
            .expect("Couldn't create java string!")
            .into_inner()
    }

    /// JNI binding of `parse_html` for `com.kanedias.html2md.Html2Markdown.parse`
    #[no_mangle]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parse(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(env, html, parse_html)
    }

    /// JNI binding of `parse_html_extended` for
    /// `com.kanedias.html2md.Html2Markdown.parseExtended`
    #[no_mangle]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(env, html, parse_html_extended)
    }
}