1use std::boxed::Box;
2use std::collections::HashMap;
3use std::sync::LazyLock;
4
5use std::ffi::{CStr, CString};
6use std::os::raw::c_char;
7
8use regex::Regex;
9
10use html5ever::driver::ParseOpts;
11use html5ever::parse_document;
12use html5ever::tendril::TendrilSink;
13
14pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
15
16pub mod anchors;
17pub mod codes;
18pub mod common;
19pub mod containers;
20pub mod dummy;
21pub mod headers;
22pub mod iframes;
23pub mod images;
24pub mod lists;
25pub mod markup5ever_rcdom;
26pub mod paragraphs;
27pub mod quotes;
28pub mod styles;
29pub mod tables;
30
31use crate::anchors::AnchorHandler;
32use crate::codes::CodeHandler;
33use crate::containers::ContainerHandler;
34use crate::dummy::DummyHandler;
35use crate::dummy::HtmlCherryPickHandler;
36use crate::dummy::IdentityHandler;
37use crate::headers::HeaderHandler;
38use crate::iframes::IframeHandler;
39use crate::images::ImgHandler;
40use crate::lists::ListHandler;
41use crate::lists::ListItemHandler;
42use crate::paragraphs::ParagraphHandler;
43use crate::quotes::QuoteHandler;
44use crate::styles::StyleHandler;
45use crate::tables::TableHandler;
46
47static EXCESSIVE_WHITESPACE_PATTERN: LazyLock<Regex> =
48 LazyLock::new(|| Regex::new("\\s{2,}").unwrap()); static EMPTY_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?m)^ +$").unwrap()); static EXCESSIVE_NEWLINE_PATTERN: LazyLock<Regex> =
51 LazyLock::new(|| Regex::new("\\n{3,}").unwrap()); static TRAILING_SPACE_PATTERN: LazyLock<Regex> =
53 LazyLock::new(|| Regex::new("(?m)(\\S) $").unwrap()); static LEADING_NEWLINES_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("^\\n+").unwrap()); static LAST_WHITESPACE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("\\s+$").unwrap()); static START_OF_LINE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new("(^|\\n) *$").unwrap()); static MARKDOWN_STARTONLY_KEYCHARS: LazyLock<Regex> =
58 LazyLock::new(|| Regex::new(r"^(\s*)([=>+\-#])").unwrap()); static MARKDOWN_MIDDLE_KEYCHARS: LazyLock<Regex> =
60 LazyLock::new(|| Regex::new(r"[<>*\\_~]").unwrap()); pub fn parse_html_custom(
70 html: &str,
71 custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
72) -> String {
73 let dom = parse_document(RcDom::default(), ParseOpts::default())
74 .from_utf8()
75 .read_from(&mut html.as_bytes())
76 .unwrap();
77 let mut result = StructuredPrinter::default();
78 walk(&dom.document, &mut result, custom);
79
80 clean_markdown(&result.data)
81}
82
83pub fn parse_html(html: &str) -> String {
88 parse_html_custom(html, &HashMap::default())
89}
90
91pub fn parse_html_extended(html: &str) -> String {
95 struct SpanAsIsTagFactory;
96 impl TagHandlerFactory for SpanAsIsTagFactory {
97 fn instantiate(&self) -> Box<dyn TagHandler> {
98 Box::new(HtmlCherryPickHandler::default())
99 }
100 }
101
102 let mut tag_factory: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
103 tag_factory.insert(String::from("span"), Box::new(SpanAsIsTagFactory {}));
104 parse_html_custom(html, &tag_factory)
105}
106
107fn walk(
115 input: &Handle,
116 result: &mut StructuredPrinter,
117 custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
118) {
119 let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
120 let mut tag_name = String::default();
121 match input.data {
122 NodeData::Document | NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {}
123 NodeData::Text { ref contents } => {
124 let mut text = contents.borrow().to_string();
125 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
126 if inside_pre {
127 result.append_str(&text);
129 } else if !(text.trim().is_empty()
130 && (result.data.ends_with('\n') || result.data.ends_with(' ')))
131 {
132 let inside_code = result.parent_chain.iter().any(|tag| tag == "code");
136 if !inside_code {
137 text = escape_markdown(result, &text);
138 }
139 let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
140 let minified_text = minified_text.trim_matches(|ch: char| ch == '\n' || ch == '\r');
141 result.append_str(minified_text);
142 }
143 }
144 NodeData::Comment { .. } => {} NodeData::Element { ref name, .. } => {
146 tag_name = name.local.to_string();
147 let inside_pre = result.parent_chain.iter().any(|tag| tag == "pre");
148 if inside_pre {
149 handler = Box::new(DummyHandler);
151 } else if custom.contains_key(&tag_name) {
152 let factory = custom.get(&tag_name).unwrap();
154 handler = factory.instantiate();
155 } else {
156 handler = match tag_name.as_ref() {
158 "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
160 "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
162 "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
163 "details" | "summary" => Box::new(HtmlCherryPickHandler::default()),
165 "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
167 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
168 "pre" | "code" => Box::new(CodeHandler::default()),
169 "img" => Box::new(ImgHandler::default()),
171 "a" => Box::new(AnchorHandler::default()),
172 "ol" | "ul" | "menu" => Box::new(ListHandler),
174 "li" => Box::new(ListItemHandler::default()),
175 "sub" | "sup" => Box::new(IdentityHandler),
177 "table" => Box::new(TableHandler),
180 "iframe" => Box::new(IframeHandler),
181 "html" | "head" | "body" => Box::new(DummyHandler),
183 _ => Box::new(DummyHandler),
184 };
185 }
186 }
187 }
188
189 handler.handle(input, result);
192
193 result.parent_chain.push(tag_name.to_string()); let current_depth = result.parent_chain.len(); result.siblings.insert(current_depth, vec![]);
199
200 for child in input.children.borrow().iter() {
201 if handler.skip_descendants() {
202 continue;
203 }
204
205 walk(child, result, custom);
206
207 if let NodeData::Element { ref name, .. } = child.data {
208 result
209 .siblings
210 .get_mut(¤t_depth)
211 .unwrap()
212 .push(name.local.to_string())
213 };
214 }
215
216 result.siblings.remove(¤t_depth);
218
219 result.parent_chain.pop();
221
222 handler.after_handle(result);
224}
225
226fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
231 let mut data = MARKDOWN_MIDDLE_KEYCHARS
233 .replace_all(text, "\\$0")
234 .to_string();
235
236 if START_OF_LINE_PATTERN.is_match(&result.data) {
238 data = MARKDOWN_STARTONLY_KEYCHARS
239 .replace(&data, "$1\\$2")
240 .to_string();
241 }
242
243 data
246}
247
248fn clean_markdown(text: &str) -> String {
252 let intermediate = EMPTY_LINE_PATTERN.replace_all(text, ""); let intermediate = EXCESSIVE_NEWLINE_PATTERN.replace_all(&intermediate, "\n\n"); let intermediate = TRAILING_SPACE_PATTERN.replace_all(&intermediate, "$1"); let intermediate = LEADING_NEWLINES_PATTERN.replace_all(&intermediate, ""); let intermediate = LAST_WHITESPACE_PATTERN.replace_all(&intermediate, ""); intermediate.into_owned()
260}
261
/// Accumulates generated Markdown together with the position of the node that
/// is currently being processed.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Tag names of the nodes enclosing the current one, outermost first.
    /// `walk` pushes an entry before visiting children and pops it afterwards.
    pub parent_chain: Vec<String>,

    /// For each depth (length of `parent_chain`), the tag names of the element
    /// children at that depth that `walk` has already finished processing.
    pub siblings: HashMap<usize, Vec<String>>,

    /// The Markdown text produced so far.
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single newline to the output.
    pub fn insert_newline(&mut self) {
        self.data.push_str("\n");
    }

    /// Appends `it` to the end of the output.
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts `it` into the output at byte offset `pos`.
    ///
    /// Panics under the same conditions as [`String::insert_str`].
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        String::insert_str(&mut self.data, pos, it);
    }
}
294
/// Factory that produces [`TagHandler`] instances for custom tag handling.
///
/// Factories are passed to [`parse_html_custom`] in a map keyed by tag name.
/// `walk` calls `instantiate` once for every element with a matching name that
/// is not nested inside `<pre>`, so each element gets its own handler.
pub trait TagHandlerFactory {
    /// Creates a new handler for a single element.
    fn instantiate(&self) -> Box<dyn TagHandler>;
}
305
/// Converts one DOM node to Markdown by writing into a [`StructuredPrinter`].
///
/// `walk` creates one handler per node and drives it through `handle`,
/// `skip_descendants` and `after_handle`.
pub trait TagHandler {
    /// Called before the node's children are walked and before the node's tag
    /// name is pushed onto `printer.parent_chain`.
    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);

    /// Called after the node's children have been walked and the node's tag
    /// name has been popped from `printer.parent_chain`.
    fn after_handle(&mut self, printer: &mut StructuredPrinter);

    /// Returns `true` if `walk` must not descend into the node's children.
    /// Defaults to `false`.
    fn skip_descendants(&self) -> bool {
        false
    }
}
319
320#[unsafe(no_mangle)]
322#[allow(clippy::not_unsafe_ptr_arg_deref)]
323pub extern "C" fn parse(html: *const c_char) -> *const c_char {
324 let in_html = unsafe { CStr::from_ptr(html) };
325 let out_md = parse_html(&in_html.to_string_lossy());
326
327 CString::new(out_md).unwrap().into_raw()
328}
329
/// JNI bindings for the `com.kanedias.html2md.Html2Markdown` Java class.
/// Compiled only when targeting Android.
#[cfg(target_os = "android")]
#[allow(non_snake_case)]
pub mod android {
    extern crate jni;

    use super::parse_html;
    use super::parse_html_extended;

    use self::jni::JNIEnv;
    use self::jni::objects::{JClass, JString};
    use self::jni::sys::jstring;

    /// Reads the Java string `html`, runs `convert` on it and returns the
    /// result as a new Java string.
    ///
    /// Panics if the Java string cannot be read or the result string cannot be
    /// created.
    fn convert_java_string(
        env: JNIEnv,
        html: JString,
        convert: fn(&str) -> String,
    ) -> jstring {
        let html_java: String = env
            .get_string(html)
            .expect("Couldn't get java string!")
            .into();
        let markdown = convert(&html_java);
        let output = env
            .new_string(markdown)
            .expect("Couldn't create java string!");
        output.into_inner()
    }

    /// JNI implementation of `Html2Markdown.parse`: converts `html` with
    /// [`parse_html`].
    ///
    /// # Safety
    ///
    /// Must only be invoked by the JVM through JNI with a valid environment and
    /// a valid string reference.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parse(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_java_string(env, html, parse_html)
    }

    /// JNI implementation of `Html2Markdown.parseExtended`: converts `html`
    /// with [`parse_html_extended`].
    ///
    /// # Safety
    ///
    /// Must only be invoked by the JVM through JNI with a valid environment and
    /// a valid string reference.
    #[unsafe(no_mangle)]
    pub unsafe extern "C" fn Java_com_kanedias_html2md_Html2Markdown_parseExtended(
        env: JNIEnv,
        _clazz: JClass,
        html: JString,
    ) -> jstring {
        convert_java_string(env, html, parse_html_extended)
    }
}