html2md/
lib.rs

1use std::boxed::Box;
2use std::collections::HashMap;
3use std::sync::LazyLock;
4
5use std::ffi::{CStr, CString};
6use std::os::raw::c_char;
7
8use regex::Regex;
9
10use html5ever::driver::ParseOpts;
11use html5ever::parse_document;
12use html5ever::tendril::TendrilSink;
13
14pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
15
16pub mod anchors;
17pub mod codes;
18pub mod common;
19pub mod containers;
20pub mod dummy;
21pub mod headers;
22pub mod iframes;
23pub mod images;
24pub mod lists;
25pub mod markup5ever_rcdom;
26pub mod paragraphs;
27pub mod quotes;
28pub mod styles;
29pub mod tables;
30
31use crate::anchors::AnchorHandler;
32use crate::codes::CodeHandler;
33use crate::containers::ContainerHandler;
34use crate::dummy::DummyHandler;
35use crate::dummy::HtmlCherryPickHandler;
36use crate::dummy::IdentityHandler;
37use crate::headers::HeaderHandler;
38use crate::iframes::IframeHandler;
39use crate::images::ImgHandler;
40use crate::lists::ListHandler;
41use crate::lists::ListItemHandler;
42use crate::paragraphs::ParagraphHandler;
43use crate::quotes::QuoteHandler;
44use crate::styles::StyleHandler;
45use crate::tables::TableHandler;
46
47static EXCESSIVE_WHITESPACE_PATTERN: LazyLock<Regex> =
48    LazyLock::new(|| Regex::new("\\s{2,}").unwrap()); // for HTML on-the-fly cleanup
49static EMPTY_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?m)^ +$").unwrap()); // for Markdown post-processing
50static EXCESSIVE_NEWLINE_PATTERN: LazyLock<Regex> =
51    LazyLock::new(|| Regex::new("\\n{3,}").unwrap()); // for Markdown post-processing
52static TRAILING_SPACE_PATTERN: LazyLock<Regex> =
53    LazyLock::new(|| Regex::new("(?m)(\\S) $").unwrap()); // for Markdown post-processing
54static LEADING_NEWLINES_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("^\\n+").unwrap()); // for Markdown post-processing
55static LAST_WHITESPACE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("\\s+$").unwrap()); // for Markdown post-processing
56static START_OF_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(^|\\n) *$").unwrap()); // for Markdown escaping
57static MARKDOWN_STARTONLY_KEYCHARS: LazyLock<Regex> =
58    LazyLock::new(|| Regex::new(r"^(\s*)([=>+\-#])").unwrap()); // for Markdown escaping
59static MARKDOWN_MIDDLE_KEYCHARS: LazyLock<Regex> =
60    LazyLock::new(|| Regex::new(r"[<>*\\_~]").unwrap()); // for Markdown escaping
61
62/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
63/// in order to register custom tag handler for tags you want.
64///
65/// You can also override standard tag handlers this way
66/// # Arguments
67/// `html` is source HTML as `String`
68/// `custom` is custom tag handler producers for tags you want, can be empty
69pub fn parse_html_custom(
70    html: &str,
71    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
72) -> String {
73    let dom = parse_document(RcDom::default(), ParseOpts::default())
74        .from_utf8()
75        .read_from(&mut html.as_bytes())
76        .unwrap();
77    let mut result = StructuredPrinter::default();
78    walk(&dom.document, &mut result, custom);
79
80    clean_markdown(&result.data)
81}
82
83/// Main function of this library. Parses incoming HTML, converts it into Markdown
84/// and returns converted string.
85/// # Arguments
86/// `html` is source HTML as `String`
87pub fn parse_html(html: &str) -> String {
88    parse_html_custom(html, &HashMap::default())
89}
90
91/// Same as `parse_html` but retains all "span" html elements intact
92/// Markdown parsers usually strip them down when rendering but they
93/// may be useful for later processing
94pub fn parse_html_extended(html: &str) -> String {
95    struct SpanAsIsTagFactory;
96    impl TagHandlerFactory for SpanAsIsTagFactory {
97        fn instantiate(&self) -> Box<dyn TagHandler> {
98            Box::new(HtmlCherryPickHandler::default())
99        }
100    }
101
102    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
103    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
104    parse_html_custom(html, &tag_factory)
105}
106
107/// Recursively walk through all DOM tree and handle all elements according to
108/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
109///
110/// # Arguments
111/// `input` is DOM tree or its subtree
112/// `result` is output holder with position and context tracking
113/// `custom` is custom tag hadler producers for tags you want, can be empty
114fn walk(
115    input: &Handle,
116    result: &mut StructuredPrinter,
117    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
118) {
119    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
120    let mut tag_name = String::default();
121    match input.data {
122        NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
123        NodeData::Text { ref contents } => {
124            let mut text = contents.borrow().to_string();
125            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
126            if inside_pre {
127                // this is preformatted text, insert as it is
128                result.append_str(&text);
129            } else if !(text.trim().is_empty()
130                && (result.data.ends_with('\n') || result.data.ends_with(' ')))
131            {
132                // in case it's not just a whitespace after the newline or another whitespace
133
134                // regular text, collapse whitespace and newlines in text
135                let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
136                if !inside_code {
137                    text = escape_markdown(result, &text);
138                }
139                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
140                let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
141                result.append_str(minified_text);
142            }
143        }
144        NodeData::Comment { .. } => {} // ignore comments
145        NodeData::Element { ref name, .. } => {
146            tag_name = name.local.to_string();
147            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
148            if inside_pre {
149                // don't add any html tags inside the pre section
150                handler = Box::new(DummyHandler);
151            } else if custom.contains_key(&tag_name) {
152                // have user-supplied factory, instantiate a handler for this tag
153                let factory = custom.get(&tag_name).unwrap();
154                handler = factory.instantiate();
155            } else {
156                // no user-supplied factory, take one of built-in ones
157                handler = match tag_name.as_ref() {
158                    // containers
159                    "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
160                    // pagination, breaks
161                    "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
162                    "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
163                    // spoiler tag
164                    "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
165                    // formatting
166                    "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
167                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
168                    "pre" | "code" => Box::new(CodeHandler::default()),
169                    // images, links
170                    "img" => Box::new(ImgHandler::default()),
171                    "a" => Box::new(AnchorHandler::default()),
172                    // lists
173                    "ol" | "ul" | "menu" => Box::new(ListHandler),
174                    "li" => Box::new(ListItemHandler::default()),
175                    // as-is
176                    "sub" | "sup" => Box::new(IdentityHandler),
177                    // tables, handled fully internally as markdown can't have nested content in tables
178                    // supports only single tables as of now
179                    "table" => Box::new(TableHandler),
180                    "iframe" => Box::new(IframeHandler),
181                    // other
182                    "html" | "head" | "body" => Box::new(DummyHandler),
183                    _ => Box::new(DummyHandler),
184                };
185            }
186        }
187    }
188
189    // handle this tag, while it's not in parent chain
190    // and doesn't have child siblings
191    handler.handle(input, result);
192
193    // save this tag name as parent for child nodes
194    result.parent_chain.push(tag_name.to_string()); // e.g. it was ["body"] and now it's ["body", "p"]
195    let current_depth = result.parent_chain.len(); // e.g. it was 1 and now it's 2
196
197    // create space for siblings of next level
198    result.siblings.insert(current_depth, vec![]);
199
200    for child in input.children.borrow().iter() {
201        if handler.skip_descendants() {
202            continue;
203        }
204
205        walk(child, result, custom);
206
207        if let NodeData::Element { ref name, .. } = child.data {
208            result
209                .siblings
210                .get_mut(&current_depth)
211                .unwrap()
212                .push(name.local.to_string())
213        };
214    }
215
216    // clear siblings of next level
217    result.siblings.remove(&current_depth);
218
219    // release parent tag
220    result.parent_chain.pop();
221
222    // finish handling of tag - parent chain now doesn't contain this tag itself again
223    handler.after_handle(result);
224}
225
226/// This conversion should only be applied to text tags
227///
228/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
229/// like list start or bold text style
230fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
231    // always escape bold/italic/strikethrough
232    let mut data = MARKDOWN_MIDDLE_KEYCHARS
233        .replace_all(text, "\\$0")
234        .to_string();
235
236    // if we're at the start of the line we need to escape list- and quote-starting sequences
237    if START_OF_LINE_PATTERN.is_match(&result.data) {
238        data = MARKDOWN_STARTONLY_KEYCHARS
239            .replace(&data, "$1\\$2")
240            .to_string();
241    }
242
243    // no handling of more complicated cases such as
244    // ![] or []() ones, for now this will suffice
245    data
246}
247
248/// Called after all processing has been finished
249///
250/// Clears excessive punctuation that would be trimmed by renderer anyway
251fn clean_markdown(text: &str) -> String {
252    // remove redundant newlines
253    let intermediate = EMPTY_LINE_PATTERN.replace_all(text, ""); // empty line with trailing spaces, replace with just newline
254    let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n"); // > 3 newlines - not handled by markdown anyway
255    let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1"); // trim space if it's just one
256    let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, ""); // trim leading newlines
257    let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, ""); // trim last newlines
258
259    intermediate.into_owned()
260}
261
/// Intermediate result of HTML -> Markdown conversion.
///
/// Carries the Markdown produced so far together with the context of the walk:
/// the chain of parent tags and the element siblings seen on each level.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Names of the parent tags of the node being processed, outermost first
    pub parent_chain: Vec<String>,

    /// Tag names of already processed sibling elements, in document order,
    /// keyed by nesting depth
    pub siblings: HashMap<usize, Vec<String>>,

    /// Markdown document produced so far
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single newline to the end of the document
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends `it` to the end of the document
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts `it` into the document at byte offset `pos`
    ///
    /// # Panics
    /// Panics if `pos` is past the end of the document or does not lie
    /// on a character boundary.
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        self.data.insert_str(pos, it);
    }
}
294
295/// Tag handler factory. This class is required in providing proper
296/// custom tag parsing capabilities to users of this library.
297///
298/// The problem with directly providing tag handlers is that they're not stateless.
299/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
300/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
301///
302pub trait TagHandlerFactory {
303    fn instantiate(&self) -> Box<dyn TagHandler>;
304}
305
306/// Trait interface describing abstract handler of arbitrary HTML tag.
307pub trait TagHandler {
308    /// Handle tag encountered when walking HTML tree.
309    /// This is executed before the children processing
310    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
311
312    /// Executed after all children of this tag have been processed
313    fn after_handle(&mut self, printer: &mut StructuredPrinter);
314
315    fn skip_descendants(&self) -> bool {
316        false
317    }
318}
319
320/// FFI variant for HTML -> Markdown conversion for calling from other languages
321#[unsafe(no_mangle)]
322#[allow(clippy::not_unsafe_ptr_arg_deref)]
323pub extern "C" fn parse(html: *const c_char) -> *const c_char {
324    let in_html = unsafe { CStr::from_ptr(html) };
325    let out_md = parse_html(&in_html.to_string_lossy());
326
327    CString::new(out_md).unwrap().into_raw()
328}
329
/// JNI interface for Android, exposing the conversion functions to the
/// `com.kanedias.html2md.Html2Markdown` Java class.
#[cfg(target_os = "android")]
#[allow(non_snake_case)] // exported names have to follow the JNI `Java_<package>_<class>_<method>` scheme
pub mod android {
    extern crate jni;

    use super::parse_html;
    use super::parse_html_extended;

    use self::jni::JNIEnv;
    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;

    /// Shared body of the JNI entry points: reads the Java string, runs
    /// `to_markdown` on it and wraps the outcome into a new Java string.
    ///
    /// # Panics
    /// Panics when the Java string cannot be read or the resulting one cannot be created.
    fn convert_with(env: JNIEnv, html: JString, to_markdown: fn(&str) -> String) -> jstring {
        let html_java: String = env
            .get_string(html)
            .expect("Couldn't get java string!")
            .into();
        let markdown = to_markdown(&html_java);
        let output = env
            .new_string(markdown)
            .expect("Couldn't create java string!");
        output.into_inner()
    }

    /// JNI counterpart of `parse_html`.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parse(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(env, html, parse_html)
    }

    /// JNI counterpart of `parse_html_extended`.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(env, html, parse_html_extended)
    }
}