1use lazy_static::lazy_static;
2
3use std::boxed::Box;
4use std::borrow::Borrow;
5use std::collections::HashMap;
6
7use std::os::raw::{c_char};
8use std::ffi::{CString, CStr};
9
10use regex::Regex;
11
12use html5ever::parse_document;
13use html5ever::driver::ParseOpts;
14use html5ever::tendril::TendrilSink;
15
16pub use markup5ever_rcdom::{RcDom, Handle, NodeData};
17
18pub mod common;
19pub mod dummy;
20pub mod anchors;
21pub mod paragraphs;
22pub mod images;
23pub mod headers;
24pub mod lists;
25pub mod styles;
26pub mod codes;
27pub mod quotes;
28pub mod tables;
29pub mod containers;
30pub mod iframes;
31
32use crate::dummy::DummyHandler;
33use crate::dummy::IdentityHandler;
34use crate::dummy::HtmlCherryPickHandler;
35use crate::paragraphs::ParagraphHandler;
36use crate::anchors::AnchorHandler;
37use crate::images::ImgHandler;
38use crate::headers::HeaderHandler;
39use crate::lists::ListItemHandler;
40use crate::lists::ListHandler;
41use crate::styles::StyleHandler;
42use crate::codes::CodeHandler;
43use crate::quotes::QuoteHandler;
44use crate::tables::TableHandler;
45use crate::containers::ContainerHandler;
46use crate::iframes::IframeHandler;
47
// Regular expressions shared by the conversion routines in this file. Each one is
// compiled once on first use; the patterns are literals, so `unwrap()` can only fail
// on a programming error in the pattern itself.
lazy_static! {
    // Two or more consecutive whitespace characters; `walk` replaces each run with one space.
    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex = Regex::new("\\s{2,}").unwrap();
    // A line made up solely of space characters (multi-line mode).
    static ref EMPTY_LINE_PATTERN: Regex = Regex::new("(?m)^ +$").unwrap();
    // Three or more consecutive newlines; `clean_markdown` reduces them to two.
    static ref EXCESSIVE_NEWLINE_PATTERN: Regex = Regex::new("\\n{3,}").unwrap();
    // A single space at the end of a line, directly after a non-whitespace character
    // (captured as group 1 so it can be kept).
    static ref TRAILING_SPACE_PATTERN: Regex = Regex::new("(?m)(\\S) $").unwrap();
    // Newlines at the very beginning of the text.
    static ref LEADING_NEWLINES_PATTERN: Regex = Regex::new("^\\n+").unwrap();
    // Whitespace at the very end of the text.
    static ref LAST_WHITESPACE_PATTERN: Regex = Regex::new("\\s+$").unwrap();
    // Matches when the text ends at the start of a line: either it is empty or its
    // last line contains nothing but spaces.
    static ref START_OF_LINE_PATTERN: Regex = Regex::new("(^|\\n) *$").unwrap();
    // One of `=`, `>`, `+`, `-`, `#` as the first non-whitespace character; the leading
    // whitespace is group 1 and the character is group 2.
    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex = Regex::new(r"^(\s*)([=>+\-#])").unwrap();
    // Characters that `escape_markdown` backslash-escapes wherever they occur:
    // `<`, `>`, `*`, `\`, `_`, `~`.
    static ref MARKDOWN_MIDDLE_KEYCHARS: Regex = Regex::new(r"[<>*\\_~]").unwrap();
}
61
62pub fn parse_html_custom(html: &str, custom: &HashMap<String, Box<dyn TagHandlerFactory>>) -> String {
70 let dom = parse_document(RcDom::default(), ParseOpts::default()).from_utf8().read_from(&mut html.as_bytes()).unwrap();
71 let mut result = StructuredPrinter::default();
72 walk(&dom.document, &mut result, custom);
73
74 return clean_markdown(&result.data);
75}
76
77pub fn parse_html(html: &str) -> String {
82 parse_html_custom(html, &HashMap::default())
83}
84
85pub fn parse_html_extended(html: &str) -> String {
89 struct SpanAsIsTagFactory;
90 impl TagHandlerFactory for SpanAsIsTagFactory {
91 fn instantiate(&self) -> Box<dyn TagHandler> {
92 return Box::new(HtmlCherryPickHandler::default());
93 }
94 }
95
96 let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
97 tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory{}));
98 return parse_html_custom(html, &tag_factory);
99}
100
101fn walk(input: &Handle, result: &mut StructuredPrinter, custom: &HashMap<String, Box<dyn TagHandlerFactory>>) {
109 let mut handler : Box<dyn TagHandler> = Box::new(DummyHandler::default());
110 let mut tag_name = String::default();
111 match input.data {
112 NodeData::Document | NodeData::Doctype {..} | NodeData::ProcessingInstruction {..} => {},
113 NodeData::Text { ref contents } => {
114 let mut text = contents.borrow().to_string();
115 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
116 if inside_pre {
117 result.append_str(&text);
119 } else if !(text.trim().len() == 0 && (result.data.chars().last() == Some('\n') || result.data.chars().last() == Some(' '))) {
120 let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
124 if !inside_code {
125 text = escape_markdown(result, &text);
126 }
127 let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
128 let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
129 result.append_str(&minified_text);
130 }
131 }
132 NodeData::Comment { .. } => {}, NodeData::Element { ref name, .. } => {
134 tag_name = name.local.to_string();
135 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
136 if inside_pre {
137 handler = Box::new(DummyHandler::default());
139 }else if custom.contains_key(&tag_name) {
140 let factory = custom.get(&tag_name).unwrap();
142 handler = factory.instantiate();
143 } else {
144 handler = match tag_name.as_ref() {
146 "div" | "section" | "header" | "footer" => Box::new(ContainerHandler::default()),
148 "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
150 "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
151 "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
153 "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
155 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
156 "pre" | "code" => Box::new(CodeHandler::default()),
157 "img" => Box::new(ImgHandler::default()),
159 "a" => Box::new(AnchorHandler::default()),
160 "ol" | "ul" | "menu" => Box::new(ListHandler::default()),
162 "li" => Box::new(ListItemHandler::default()),
163 "sub" | "sup" => Box::new(IdentityHandler::default()),
165 "table" => Box::new(TableHandler::default()),
168 "iframe" => Box::new(IframeHandler::default()),
169 "html" | "head" | "body" => Box::new(DummyHandler::default()),
171 _ => Box::new(DummyHandler::default())
172 };
173 }
174 }
175 }
176
177 handler.handle(&input, result);
180
181 result.parent_chain.push(tag_name.to_string()); let current_depth = result.parent_chain.len(); result.siblings.insert(current_depth, vec![]);
187
188 for child in input.children.borrow().iter() {
189 if handler.skip_descendants() {
190 continue;
191 }
192
193 walk(child.borrow(), result, custom);
194
195 match child.data {
196 NodeData::Element { ref name, .. } => result.siblings.get_mut(¤t_depth).unwrap().push(name.local.to_string()),
197 _ => {}
198 };
199 }
200
201 result.siblings.remove(¤t_depth);
203
204 result.parent_chain.pop();
206
207 handler.after_handle(result);
209}
210
211fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
216 let mut data = MARKDOWN_MIDDLE_KEYCHARS.replace_all(&text, "\\$0").to_string();
218
219 if START_OF_LINE_PATTERN.is_match(&result.data) {
221 data = MARKDOWN_STARTONLY_KEYCHARS.replace(&data, "$1\\$2").to_string();
222 }
223
224 return data;
227}
228
229fn clean_markdown(text: &str) -> String {
233 let intermediate = EMPTY_LINE_PATTERN.replace_all(&text, ""); let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n"); let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1"); let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, ""); let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, ""); return intermediate.into_owned();
241}
242
/// Output buffer plus the tree-position bookkeeping shared by all tag handlers.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Tag names of the nodes currently being walked, outermost first. `walk` pushes
    /// an entry before visiting a node's children and pops it afterwards.
    pub parent_chain: Vec<String>,

    /// For each depth (length of `parent_chain`), the tag names of the element
    /// children already processed at that level.
    pub siblings: HashMap<usize, Vec<String>>,

    /// Markdown text produced so far.
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single `\n` to the output.
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends `it` to the end of the output.
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts `it` into the output at byte offset `pos`.
    ///
    /// # Panics
    /// Panics if `pos` is past the end of the output or does not fall on a `char`
    /// boundary (same contract as `String::insert_str`).
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        self.data.insert_str(pos, it);
    }
}
276
/// Factory for caller-supplied tag handlers, registered by tag name in the map passed
/// to `parse_html_custom`.
pub trait TagHandlerFactory {
    /// Creates a new handler. `walk` calls this once for every element whose tag name
    /// is mapped to this factory, so each element gets its own handler instance.
    fn instantiate(&self) -> Box<dyn TagHandler>;
}
287
288pub trait TagHandler {
290 fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
293
294 fn after_handle(&mut self, printer: &mut StructuredPrinter);
296
297 fn skip_descendants(&self) -> bool {
298 return false;
299 }
300}
301
302#[no_mangle]
304pub extern fn parse(html: *const c_char) -> *const c_char {
305 let in_html = unsafe { CStr::from_ptr(html) };
306 let out_md = parse_html(&in_html.to_string_lossy());
307
308 CString::new(out_md).unwrap().into_raw()
309}
310
/// JNI bindings, compiled only for Android targets.
#[cfg(target_os="android")]
// JNI resolves native methods by symbol name (`Java_<package>_<class>_<method>`),
// which cannot follow Rust's snake_case convention.
#[allow(non_snake_case)]
pub mod android {
    extern crate jni;

    use super::{parse_html, parse_html_extended};

    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;
    use self::jni::JNIEnv;

    /// Native implementation of `com.kanedias.html2md.Html2Markdown.parse`:
    /// converts the given Java string with `parse_html` and returns the Markdown as a
    /// new Java string.
    ///
    /// # Panics
    /// Panics if the Java string cannot be read or the result string cannot be created.
    #[no_mangle]
    pub unsafe extern fn Java_com_kanedias_html2md_Html2Markdown_parse(env: JNIEnv, _clazz: JClass, html: JString) -> jstring {
        let source: String = env.get_string(html).expect("Couldn't get java string!").into();
        let converted = parse_html(&source);

        env.new_string(converted)
            .expect("Couldn't create java string!")
            .into_inner()
    }

    /// Native implementation of `com.kanedias.html2md.Html2Markdown.parseExtended`:
    /// same as the `parse` binding but converts with `parse_html_extended`.
    ///
    /// # Panics
    /// Panics if the Java string cannot be read or the result string cannot be created.
    #[no_mangle]
    pub unsafe extern fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(env: JNIEnv, _clazz: JClass, html: JString) -> jstring {
        let source: String = env.get_string(html).expect("Couldn't get java string!").into();
        let converted = parse_html_extended(&source);

        env.new_string(converted)
            .expect("Couldn't create java string!")
            .into_inner()
    }
}