rhtml2md/
lib.rs

1use lazy_static::lazy_static;
2
3use std::boxed::Box;
4use std::borrow::Borrow;
5use std::collections::HashMap;
6
7use std::os::raw::{c_char};
8use std::ffi::{CString, CStr};
9
10use regex::Regex;
11
12use html5ever::parse_document;
13use html5ever::driver::ParseOpts;
14use html5ever::tendril::TendrilSink;
15
16pub use markup5ever_rcdom::{RcDom, Handle, NodeData};
17
18pub mod common;
19pub mod dummy;
20pub mod anchors;
21pub mod paragraphs;
22pub mod images;
23pub mod headers;
24pub mod lists;
25pub mod styles;
26pub mod codes;
27pub mod quotes;
28pub mod tables;
29pub mod containers;
30pub mod iframes;
31
32use crate::dummy::DummyHandler;
33use crate::dummy::IdentityHandler;
34use crate::dummy::HtmlCherryPickHandler;
35use crate::paragraphs::ParagraphHandler;
36use crate::anchors::AnchorHandler;
37use crate::images::ImgHandler;
38use crate::headers::HeaderHandler;
39use crate::lists::ListItemHandler;
40use crate::lists::ListHandler;
41use crate::styles::StyleHandler;
42use crate::codes::CodeHandler;
43use crate::quotes::QuoteHandler;
44use crate::tables::TableHandler;
45use crate::containers::ContainerHandler;
46use crate::iframes::IframeHandler;
47
48lazy_static! {
49    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").unwrap();   // for HTML on-the-fly cleanup
50
51    static ref EMPTY_LINE_PATTERN: Regex = Regex::new("(?m)^ +$").unwrap();            // for Markdown post-processing
52    static ref EXCESSIVE_NEWLINE_PATTERN: Regex = Regex::new("\\n{3,}").unwrap();      // for Markdown post-processing
53    static ref TRAILING_SPACE_PATTERN: Regex = Regex::new("(?m)(\\S) $").unwrap();     // for Markdown post-processing
54    static ref LEADING_NEWLINES_PATTERN: Regex = Regex::new("^\\n+").unwrap();         // for Markdown post-processing
55    static ref LAST_WHITESPACE_PATTERN: Regex = Regex::new("\\s+$").unwrap();          // for Markdown post-processing
56
57    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").unwrap();                  // for Markdown escaping
58    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").unwrap();     // for Markdown escaping
59    static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").unwrap();               // for Markdown escaping
60}
61
62/// Custom variant of main function. Allows to pass custom tag<->tag factory pairs
63/// in order to register custom tag hadler for tags you want.
64///
65/// You can also override standard tag handlers this way
66/// # Arguments
67/// `html` is source HTML as `String`
68/// `custom` is custom tag hadler producers for tags you want, can be empty
69pub fn parse_html_custom(html: &str, custom: &HashMap<String, Box<dyn TagHandlerFactory>>) -> String {
70    let dom = parse_document(RcDom::default(), ParseOpts::default()).from_utf8().read_from(&mut html.as_bytes()).unwrap();
71    let mut result = StructuredPrinter::default();
72    walk(&dom.document, &mut result, custom);
73
74    return clean_markdown(&result.data);
75}
76
77/// Main function of this library. Parses incoming HTML, converts it into Markdown
78/// and returns converted string.
79/// # Arguments
80/// `html` is source HTML as `String`
81pub fn parse_html(html: &str) -> String {
82    parse_html_custom(html, &HashMap::default())
83}
84
85/// Same as `parse_html` but retains all "span" html elements intact
86/// Markdown parsers usually strip them down when rendering but they
87/// may be useful for later processing
88pub fn parse_html_extended(html: &str) -> String {
89    struct SpanAsIsTagFactory;
90    impl TagHandlerFactory for SpanAsIsTagFactory {
91        fn instantiate(&self) -> Box<dyn TagHandler> {
92            return Box::new(HtmlCherryPickHandler::default());
93        }
94    }
95
96    let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
97    tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory{}));
98    return parse_html_custom(html, &tag_factory);
99}
100
101/// Recursively walk through all DOM tree and handle all elements according to
102/// HTML tag -> Markdown syntax mapping. Text content is trimmed to one whitespace according to HTML5 rules.
103///
104/// # Arguments
105/// `input` is DOM tree or its subtree
106/// `result` is output holder with position and context tracking
107/// `custom` is custom tag hadler producers for tags you want, can be empty
108fn walk(input: &Handle, result: &mut StructuredPrinter, custom: &HashMap<String, Box<dyn TagHandlerFactory>>) {
109    let mut handler : Box<dyn TagHandler> = Box::new(DummyHandler::default());
110    let mut tag_name = String::default();
111    match input.data {
112        NodeData::Document | NodeData::Doctype {..} | NodeData::ProcessingInstruction {..} => {},
113        NodeData::Text { ref contents }  => {
114            let mut text = contents.borrow().to_string();
115            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
116            if inside_pre {
117                // this is preformatted text, insert as-is
118                result.append_str(&text);
119            } else if !(text.trim().len() == 0 && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) {
120                // in case it's not just a whitespace after the newline or another whitespace
121
122                // regular text, collapse whitespace and newlines in text
123                let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
124                if !inside_code {
125                    text = escape_markdown(result, &text);
126                }
127                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
128                let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
129                result.append_str(&minified_text);
130            }
131        }
132        NodeData::Comment { .. } => {}, // ignore comments
133        NodeData::Element { ref name, .. } => {
134            tag_name = name.local.to_string();
135            let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
136            if inside_pre {
137                // don't add any html tags inside the pre section
138                handler = Box::new(DummyHandler::default());
139            }else if custom.contains_key(&tag_name) {
140                // have user-supplied factory, instantiate a handler for this tag
141                let factory = custom.get(&tag_name).unwrap();
142                handler = factory.instantiate();
143            } else {
144                // no user-supplied factory, take one of built-in ones
145                handler = match tag_name.as_ref() {
146                    // containers
147                    "div" | "section" | "header" | "footer" => Box::new(ContainerHandler::default()),
148                    // pagination, breaks
149                    "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
150                    "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
151                    // spoiler tag
152                    "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
153                    // formatting
154                    "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
155                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
156                    "pre" | "code" => Box::new(CodeHandler::default()),
157                    // images, links
158                    "img" => Box::new(ImgHandler::default()),
159                    "a" => Box::new(AnchorHandler::default()),
160                    // lists
161                    "ol" | "ul" | "menu" => Box::new(ListHandler::default()),
162                    "li" => Box::new(ListItemHandler::default()),
163                    // as-is
164                    "sub" | "sup" => Box::new(IdentityHandler::default()),
165                    // tables, handled fully internally as markdown can't have nested content in tables
166                    // supports only single tables as of now
167                    "table" => Box::new(TableHandler::default()),
168                    "iframe" => Box::new(IframeHandler::default()),
169                    // other
170                    "html" | "head" | "body" => Box::new(DummyHandler::default()),
171                    _ => Box::new(DummyHandler::default())
172                };
173            }
174        }
175    }
176
177    // handle this tag, while it's not in parent chain
178    // and doesn't have child siblings
179    handler.handle(&input, result);
180
181    // save this tag name as parent for child nodes
182    result.parent_chain.push(tag_name.to_string());     // e.g. it was ["body"] and now it's ["body", "p"]
183    let current_depth = result.parent_chain.len();      // e.g. it was 1 and now it's 2
184
185    // create space for siblings of next level
186    result.siblings.insert(current_depth, vec![]);
187
188    for child in input.children.borrow().iter() {
189        if handler.skip_descendants() {
190            continue;
191        }
192
193        walk(child.borrow(), result, custom);
194
195        match child.data {
196            NodeData::Element { ref name, .. } => result.siblings.get_mut(&current_depth).unwrap().push(name.local.to_string()),
197            _ => {}
198        };
199    }
200
201    // clear siblings of next level
202    result.siblings.remove(&current_depth);
203
204    // release parent tag
205    result.parent_chain.pop();
206
207    // finish handling of tag - parent chain now doesn't contain this tag itself again
208    handler.after_handle(result);
209}
210
211/// This conversion should only be applied to text tags
212///
213/// Escapes text inside HTML tags so it won't be recognized as Markdown control sequence
214/// like list start or bold text style
215fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
216    // always escape bold/italic/strikethrough
217    let mut data = MARKDOWN_MIDDLE_KEYCHARS.replace_all(&text, "\\$0").to_string();
218
219    // if we're at the start of the line we need to escape list- and quote-starting sequences
220    if START_OF_LINE_PATTERN.is_match(&result.data) {
221        data = MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2").to_string();
222    }
223
224    // no handling of more complicated cases such as
225    // ![] or []() ones, for now this will suffice
226    return data;
227}
228
229/// Called after all processing has been finished
230///
231/// Clears excessive punctuation that would be trimmed by renderer anyway
232fn clean_markdown(text: &str) -> String {
233    // remove redundant newlines
234    let intermediate = EMPTY_LINE_PATTERN.replace_all(&text, "");                           // empty line with trailing spaces, replace with just newline
235    let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n");  // > 3 newlines - not handled by markdown anyway
236    let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1");       // trim space if it's just one
237    let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, "");       // trim leading newlines
238    let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, "");        // trim last newlines
239
240    return intermediate.into_owned();
241}
242
/// Intermediate result of HTML -> Markdown conversion.
///
/// Carries the Markdown produced so far together with the context of the
/// traversal: the chain of parent tags and the already seen siblings.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Chain of parents leading to upmost <html> tag
    pub parent_chain: Vec<String>,

    /// Tag names of already processed sibling elements, keyed by depth
    /// in the parent chain, in the order they appear in html
    pub siblings: HashMap<usize, Vec<String>>,

    /// Resulting markdown document
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single newline to the end of the document
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends the given string to the end of the document
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts the given string into the document at byte offset `pos`
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        String::insert_str(&mut self.data, pos, it);
    }
}
276
277/// Tag handler factory. This class is required in providing proper
278/// custom tag parsing capabilities to users of this library.
279///
280/// The problem with directly providing tag handlers is that they're not stateless.
281/// Once tag handler is parsing some tag, it holds data, such as start position, indent etc.
282/// The only way to create fresh tag handler for each tag is to provide a factory like this one.
283///
284pub trait TagHandlerFactory {
285    fn instantiate(&self) -> Box<dyn TagHandler>;
286}
287
288/// Trait interface describing abstract handler of arbitrary HTML tag.
289pub trait TagHandler {
290    /// Handle tag encountered when walking HTML tree.
291    /// This is executed before the children processing
292    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
293
294    /// Executed after all children of this tag have been processed
295    fn after_handle(&mut self, printer: &mut StructuredPrinter);
296
297    fn skip_descendants(&self) -> bool {
298        return false;
299    }
300}
301
302/// FFI variant for HTML -> Markdown conversion for calling from other languages
303#[no_mangle]
304pub extern fn parse(html: *const c_char) -> *const c_char {
305    let in_html = unsafe { CStr::from_ptr(html) };
306    let out_md = parse_html(&in_html.to_string_lossy());
307
308    CString::new(out_md).unwrap().into_raw()
309}
310
/// JNI interface, compiled only when targeting Android
#[cfg(target_os="android")]
#[allow(non_snake_case)]
pub mod android {
    extern crate jni;

    use super::parse_html;
    use super::parse_html_extended;

    use self::jni::JNIEnv;
    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;

    /// Shared body of the JNI entry points: reads the Java string, runs it
    /// through `converter` and wraps the outcome into a new Java string.
    ///
    /// Panics if the Java string cannot be read or the new one cannot be created.
    fn convert_java_string(env: JNIEnv, html: JString, converter: fn(&str) -> String) -> jstring {
        let source: String = env.get_string(html).expect("Couldn't get java string!").into();
        let converted = converter(&source);
        let java_markdown = env.new_string(converted).expect("Couldn't create java string!");
        java_markdown.into_inner()
    }

    /// JNI counterpart of `parse_html`
    #[no_mangle]
    pub unsafe extern fn Java_com_kanedias_html2md_Html2Markdown_parse(env: JNIEnv, _clazz: JClass, html: JString) -> jstring {
        convert_java_string(env, html, parse_html)
    }

    /// JNI counterpart of `parse_html_extended`
    #[no_mangle]
    pub unsafe extern fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(env: JNIEnv, _clazz: JClass, html: JString) -> jstring {
        convert_java_string(env, html, parse_html_extended)
    }
}