1use std::boxed::Box;
2use std::collections::HashMap;
3use std::sync::LazyLock;
4
5use std::ffi::{CStr, CString};
6use std::os::raw::c_char;
7
8use regex::Regex;
9
10use html5ever::driver::ParseOpts;
11use html5ever::parse_document;
12use html5ever::tendril::TendrilSink;
13
14pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
15
16pub mod anchors;
17pub mod codes;
18pub mod common;
19pub mod containers;
20pub mod dummy;
21pub mod headers;
22pub mod iframes;
23pub mod images;
24pub mod lists;
25pub mod paragraphs;
26pub mod quotes;
27pub mod styles;
28pub mod tables;
29
30use crate::anchors::AnchorHandler;
31use crate::codes::CodeHandler;
32use crate::containers::ContainerHandler;
33use crate::dummy::DummyHandler;
34use crate::dummy::HtmlCherryPickHandler;
35use crate::dummy::IdentityHandler;
36use crate::headers::HeaderHandler;
37use crate::iframes::IframeHandler;
38use crate::images::ImgHandler;
39use crate::lists::ListHandler;
40use crate::lists::ListItemHandler;
41use crate::paragraphs::ParagraphHandler;
42use crate::quotes::QuoteHandler;
43use crate::styles::StyleHandler;
44use crate::tables::TableHandler;
45
46static EXCESSIVE_WHITESPACE_PATTERN: LazyLock<Regex> =
47 LazyLock::new(|| Regex::new("\\s{2,}").unwrap()); static EMPTY_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?m)^ +$").unwrap()); static EXCESSIVE_NEWLINE_PATTERN: LazyLock<Regex> =
50 LazyLock::new(|| Regex::new("\\n{3,}").unwrap()); static TRAILING_SPACE_PATTERN: LazyLock<Regex> =
52 LazyLock::new(|| Regex::new("(?m)(\\S) $").unwrap()); static LEADING_NEWLINES_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("^\\n+").unwrap()); static LAST_WHITESPACE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("\\s+$").unwrap()); static START_OF_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(^|\\n) *$").unwrap()); static MARKDOWN_STARTONLY_KEYCHARS: LazyLock<Regex> =
57 LazyLock::new(|| Regex::new(r"^(\s*)([=>+\-#])").unwrap()); static MARKDOWN_MIDDLE_KEYCHARS: LazyLock<Regex> =
59 LazyLock::new(|| Regex::new(r"[<>*\\_~]").unwrap()); pub fn parse_html_custom(
69 html: &str,
70 custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
71) -> String {
72 let dom = parse_document(RcDom::default(), ParseOpts::default())
73 .from_utf8()
74 .read_from(&mut html.as_bytes())
75 .unwrap();
76 let mut result = StructuredPrinter::default();
77 walk(&dom.document, &mut result, custom);
78
79 clean_markdown(&result.data)
80}
81
82pub fn parse_html(html: &str) -> String {
87 parse_html_custom(html, &HashMap::default())
88}
89
90pub fn parse_html_extended(html: &str) -> String {
94 struct SpanAsIsTagFactory;
95 impl TagHandlerFactory for SpanAsIsTagFactory {
96 fn instantiate(&self) -> Box<dyn TagHandler> {
97 Box::new(HtmlCherryPickHandler::default())
98 }
99 }
100
101 let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
102 tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
103 parse_html_custom(html, &tag_factory)
104}
105
106fn walk(
114 input: &Handle,
115 result: &mut StructuredPrinter,
116 custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
117) {
118 let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
119 let mut tag_name = String::default();
120 match input.data {
121 NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
122 NodeData::Text { ref contents } => {
123 let mut text = contents.borrow().to_string();
124 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
125 if inside_pre {
126 result.append_str(&text);
128 } else if !(text.trim().is_empty()
129 && (result.data.ends_with('\n') || result.data.ends_with(' ')))
130 {
131 let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
135 if !inside_code {
136 text = escape_markdown(result, &text);
137 }
138 let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
139 let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
140 result.append_str(minified_text);
141 }
142 }
143 NodeData::Comment { .. } => {} NodeData::Element { ref name, .. } => {
145 tag_name = name.local.to_string();
146 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
147 if inside_pre {
148 handler = Box::new(DummyHandler);
150 } else if custom.contains_key(&tag_name) {
151 let factory = custom.get(&tag_name).unwrap();
153 handler = factory.instantiate();
154 } else {
155 handler = match tag_name.as_ref() {
157 "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
159 "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
161 "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
162 "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
164 "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
166 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
167 "pre" | "code" => Box::new(CodeHandler::default()),
168 "img" => Box::new(ImgHandler::default()),
170 "a" => Box::new(AnchorHandler::default()),
171 "ol" | "ul" | "menu" => Box::new(ListHandler),
173 "li" => Box::new(ListItemHandler::default()),
174 "sub" | "sup" => Box::new(IdentityHandler),
176 "table" => Box::new(TableHandler),
179 "iframe" => Box::new(IframeHandler),
180 "html" | "head" | "body" => Box::new(DummyHandler),
182 _ => Box::new(DummyHandler),
183 };
184 }
185 }
186 }
187
188 handler.handle(input, result);
191
192 result.parent_chain.push(tag_name.to_string()); let current_depth = result.parent_chain.len(); result.siblings.insert(current_depth, vec![]);
198
199 for child in input.children.borrow().iter() {
200 if handler.skip_descendants() {
201 continue;
202 }
203
204 walk(child, result, custom);
205
206 if let NodeData::Element { ref name, .. } = child.data {
207 result
208 .siblings
209 .get_mut(¤t_depth)
210 .unwrap()
211 .push(name.local.to_string())
212 };
213 }
214
215 result.siblings.remove(¤t_depth);
217
218 result.parent_chain.pop();
220
221 handler.after_handle(result);
223}
224
225fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
230 let mut data = MARKDOWN_MIDDLE_KEYCHARS
232 .replace_all(text, "\\$0")
233 .to_string();
234
235 if START_OF_LINE_PATTERN.is_match(&result.data) {
237 data = MARKDOWN_STARTONLY_KEYCHARS
238 .replace(&data, "$1\\$2")
239 .to_string();
240 }
241
242 data
245}
246
247fn clean_markdown(text: &str) -> String {
251 let intermediate = EMPTY_LINE_PATTERN.replace_all(text, ""); let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n"); let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1"); let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, ""); let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, ""); intermediate.into_owned()
259}
260
/// Output buffer plus the traversal state shared with tag handlers during conversion.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Tag names of the nodes currently being walked, outermost first.
    pub parent_chain: Vec<String>,

    /// For each depth of the parent chain, names of the element children walked so far.
    pub siblings: HashMap<usize, Vec<String>>,

    /// Markdown text produced so far.
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single newline character to the output.
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends `it` to the end of the output.
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts `it` into the output at byte offset `pos`.
    ///
    /// # Panics
    /// Panics if `pos` is past the end of the output or not on a `char` boundary.
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        String::insert_str(&mut self.data, pos, it);
    }
}
293
294pub trait TagHandlerFactory {
302 fn instantiate(&self) -> Box<dyn TagHandler>;
303}
304
305pub trait TagHandler {
307 fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
310
311 fn after_handle(&mut self, printer: &mut StructuredPrinter);
313
314 fn skip_descendants(&self) -> bool {
315 false
316 }
317}
318
319#[no_mangle]
321#[allow(clippy::not_unsafe_ptr_arg_deref)]
322pub extern "C" fn parse(html: *const c_char) -> *const c_char {
323 let in_html = unsafe { CStr::from_ptr(html) };
324 let out_md = parse_html(&in_html.to_string_lossy());
325
326 CString::new(out_md).unwrap().into_raw()
327}
328
/// JNI bindings exposing the converters to `com.kanedias.html2md.Html2Markdown`.
#[cfg(target_os = "android")]
#[allow(non_snake_case)]
pub mod android {
    extern crate jni;

    use super::parse_html;
    use super::parse_html_extended;

    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;
    use self::jni::JNIEnv;

    /// Reads `html` from the JVM, runs `converter` on it and returns the result as a new
    /// Java string.
    ///
    /// # Panics
    /// Panics if the Java string cannot be read or the result string cannot be created.
    fn convert_with<'a>(
        env: &JNIEnv<'a>,
        html: JString<'a>,
        converter: fn(&str) -> String,
    ) -> jstring {
        let html_java: String = env
            .get_string(html)
            .expect("Couldn't get java string!")
            .into();
        let markdown = converter(&html_java);
        env.new_string(markdown)
            .expect("Couldn't create java string!")
            .into_inner()
    }

    /// Native implementation of `Html2Markdown.parse`: converts with [`parse_html`].
    #[no_mangle]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parse(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(&env, html, parse_html)
    }

    /// Native implementation of `Html2Markdown.parseExtended`: converts with
    /// [`parse_html_extended`].
    #[no_mangle]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_with(&env, html, parse_html_extended)
    }
}