pub mod anchors;
pub mod codes;
pub mod common;
pub mod containers;
pub mod dummy;
pub mod headers;
pub mod iframes;
pub mod ignore;
pub mod images;
pub mod lists;
pub mod paragraphs;
pub mod quotes;
pub mod styles;
pub mod tables;
pub mod utils;
use super::clean_markdown;
use anchors::AnchorHandler;
use codes::CodeHandler;
use containers::ContainerHandler;
use dummy::DummyHandler;
use dummy::HtmlCherryPickHandler;
use dummy::IdentityHandler;
use headers::HeaderHandler;
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use iframes::IframeHandler;
use images::ImgHandler;
use lazy_static::lazy_static;
use lists::ListHandler;
use lists::ListItemHandler;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use paragraphs::ParagraphHandler;
use quotes::QuoteHandler;
use regex::Regex;
use std::boxed::Box;
use std::collections::HashMap;
use std::sync::Arc;
use styles::StyleHandler;
use tables::TableHandler;
use url::Url;
lazy_static! {
    // Two or more consecutive whitespace characters; `walk` replaces each run
    // with a single space when writing regular text.
    static ref EXCESSIVE_WHITESPACE_PATTERN: Regex =
        Regex::new("\\s{2,}").expect("valid regex pattern");

    // Matches when the text ends with the start of the input or a newline,
    // followed only by spaces, i.e. the output is positioned at the start of a line.
    static ref START_OF_LINE_PATTERN: Regex =
        Regex::new("(^|\\n) *$").expect("valid regex pattern");

    // Optional leading whitespace (group 1) followed by one of `=`, `>`, `+`,
    // `-` or `#` (group 2) at the very beginning of the text.
    static ref MARKDOWN_STARTONLY_KEYCHARS: Regex =
        Regex::new(r"^(\s*)([=>+\-#])").expect("valid regex pattern");
}
/// Parses `html` and renders it as markdown.
///
/// * `custom` - per-tag handler factories forwarded to [`walk`].
/// * `commonmark` - forwarded to [`walk`].
/// * `url` - optional base URL; when present it is cloned into an `Arc` and
///   forwarded to [`walk`].
///
/// Returns the walked output after it has been passed through
/// `clean_markdown`, or an empty string when the document could not be read.
pub fn parse_html_custom_base(
    html: &str,
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    let parsed = parse_document(RcDom::default(), ParseOpts::default())
        .from_utf8()
        .read_from(&mut html.as_bytes());

    let dom = match parsed {
        Ok(dom) => dom,
        Err(_) => return String::new(),
    };

    let base_url: Option<Arc<Url>> = url.as_ref().map(|u| Arc::new(u.clone()));
    let mut printer = StructuredPrinter::default();
    walk(
        &dom.document,
        &mut printer,
        custom,
        commonmark,
        &base_url,
        false,
    );

    clean_markdown(&printer.data)
}
pub fn parse_html_custom(
html: &str,
custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
commonmark: bool,
) -> String {
parse_html_custom_base(html, custom, commonmark, &None)
}
/// Renders `html` as markdown using the given custom tag handlers and an
/// optional base URL.
///
/// Thin wrapper around [`parse_html_custom_base`]; all arguments are
/// forwarded unchanged.
pub fn parse_html_custom_with_url(
    html: &str,
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    // `url` is already a reference, so it is passed through as-is.
    parse_html_custom_base(html, custom, commonmark, url)
}
/// Renders `html` as markdown with the built-in tag handlers only.
///
/// Equivalent to calling [`parse_html_custom`] with an empty handler map.
pub fn parse_html(html: &str, commonmark: bool) -> String {
    let no_custom_handlers = HashMap::default();
    parse_html_custom(html, &no_custom_handlers, commonmark)
}
/// Renders `html` as markdown, overriding the handling of `span` elements.
///
/// A custom factory is registered for the `span` tag that produces a default
/// `HtmlCherryPickHandler`; every other tag uses the built-in handlers.
pub fn parse_html_extended(html: &str, commonmark: bool) -> String {
    /// Factory producing the handler used for `span` elements.
    struct SpanAsIsTagFactory;

    impl TagHandlerFactory for SpanAsIsTagFactory {
        fn instantiate(&self) -> Box<dyn TagHandler> {
            Box::new(HtmlCherryPickHandler::default())
        }
    }

    let mut span_overrides: HashMap<String, Box<dyn TagHandlerFactory>> = HashMap::new();
    span_overrides.insert("span".to_owned(), Box::new(SpanAsIsTagFactory));

    parse_html_custom(html, &span_overrides, commonmark)
}
fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
escape_markdown_base(&result.data, text)
}
/// Recursively renders the DOM node `input` and its descendants as markdown into `result`.
///
/// For every node the function:
/// 1. scans `result.parent_chain` (outermost ancestor first) and stops at the first
///    `table` (only when `ignore_parents` is set), `code`, `pre`, `script` or `style`
///    ancestor to determine the writing context;
/// 2. writes text nodes (verbatim inside `pre`; otherwise escaped unless inside `code`,
///    with whitespace runs collapsed) or selects a [`TagHandler`] for element nodes;
/// 3. calls the handler, walks the children unless the handler opts out via
///    `skip_descendants`, and finally calls the handler's `after_handle`.
///
/// `script` and `style` elements are skipped together with their whole subtree, and
/// text found below a `script`/`style` entry of the parent chain is not written.
///
/// * `custom` - per-tag handler factories that take precedence over the built-in handlers.
/// * `commonmark` - forwarded to the handlers created by [`get_handler`].
/// * `url` - optional base URL forwarded to the handlers created by [`get_handler`].
/// * `ignore_parents` - enables tracking of enclosing `table` elements.
pub fn walk(
    input: &Handle,
    result: &mut StructuredPrinter,
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    commonmark: bool,
    url: &Option<Arc<Url>>,
    ignore_parents: bool,
) {
    let mut handler: Box<dyn TagHandler> = Box::new(DummyHandler);
    let mut tag_name = String::default();

    let mut inside_pre = false;
    let mut inside_code = false;
    let mut ignore_write = false;
    let mut inside_table = false;

    let find_parent_tags = matches!(
        &input.data,
        NodeData::Element { .. } | NodeData::Text { .. }
    );

    if find_parent_tags || ignore_parents {
        // The first recognised ancestor decides the context; later ones are not inspected.
        for tag in result.parent_chain.iter() {
            match tag.as_str() {
                "table" if ignore_parents => {
                    inside_table = true;
                    break;
                }
                "code" => {
                    inside_code = true;
                    break;
                }
                "pre" => {
                    inside_pre = true;
                    break;
                }
                // Compare the ancestor (`tag`), not `tag_name`, which is still empty here.
                "script" | "style" => {
                    ignore_write = true;
                    break;
                }
                _ => {}
            }
        }
    }

    match input.data {
        NodeData::Document
        | NodeData::Comment { .. }
        | NodeData::Doctype { .. }
        | NodeData::ProcessingInstruction { .. } => (),
        NodeData::Text { ref contents } => {
            let mut text = contents.borrow().to_string();
            let only_whitespace_after_whitespace = text.trim().is_empty()
                && (result.data.ends_with('\n') || result.data.ends_with(' '));

            if inside_pre {
                // Preformatted text is inserted as-is.
                result.append_str(&text);
            } else if ignore_write {
                // Content of `script`/`style` must never reach the output.
            } else if !only_whitespace_after_whitespace {
                // Regular text: escape markdown (except inside `code`) and collapse whitespace.
                if !inside_code {
                    text = escape_markdown(result, &text);
                }
                let minified_text = EXCESSIVE_WHITESPACE_PATTERN.replace_all(&text, " ");
                result.append_str(&minified_text);
            } else {
                // Whitespace-only text following existing whitespace is appended unchanged.
                result.append_str(&text);
            }
        }
        NodeData::Element { ref name, .. } => {
            if !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local) {
                tag_name = name.local.to_string();

                // Returning before the parent chain is touched keeps push/pop balanced.
                if tag_name == "script" || tag_name == "style" {
                    return;
                }
                if ignore_parents && tag_name == "table" {
                    inside_table = true;
                }

                handler = if inside_pre {
                    // Elements nested in `pre` get no markdown treatment of their own.
                    Box::new(DummyHandler)
                } else {
                    get_handler(custom, &tag_name, commonmark, url)
                }
            }
        }
    }

    // NOTE: `inside_table` is only ever set while `ignore_parents` is true, so this
    // guard currently always passes; it is kept to mirror the original condition.
    if !inside_table || ignore_parents {
        handler.handle(input, result);
    }

    // Track the current node (empty name for non-element nodes) while visiting children.
    result.parent_chain.push(tag_name);

    let current_depth = result.parent_chain.len();
    result.siblings.insert(current_depth, vec![]);

    if !handler.skip_descendants() {
        for child in input.children.borrow().iter() {
            if valid_block_element(&child.data) {
                walk(child, result, custom, commonmark, url, ignore_parents);

                // Record the visited element so later siblings can see what preceded them.
                if let NodeData::Element { ref name, .. } = child.data {
                    if let Some(el) = result.siblings.get_mut(&current_depth) {
                        el.push(name.local.to_string());
                    }
                }
            }
        }
    }

    result.siblings.remove(&current_depth);
    result.parent_chain.pop();

    handler.after_handle(result);
}
/// Escapes markdown key characters in `text`.
///
/// Every match of `MARKDOWN_MIDDLE_KEYCHARS` is prefixed with a backslash.
/// When `result` (the output written so far) is positioned at the start of a
/// line, a leading start-only key character in the escaped text is
/// backslash-escaped as well.
fn escape_markdown_base(result: &str, text: &str) -> String {
    let escaped: std::borrow::Cow<str> = crate::MARKDOWN_MIDDLE_KEYCHARS.replace_all(text, "\\$0");
    let at_line_start = START_OF_LINE_PATTERN.is_match(result);

    if at_line_start {
        let guarded = MARKDOWN_STARTONLY_KEYCHARS.replace(&escaped, "$1\\$2");
        guarded.into_owned()
    } else {
        escaped.into_owned()
    }
}
/// Creates the handler responsible for the element named `tag_name`.
///
/// A factory registered in `custom` under that name wins; otherwise one of the
/// built-in handlers is chosen by tag name, falling back to `DummyHandler` for
/// tags without a dedicated handler. `commonmark` and `url` are passed to the
/// built-in handlers whose constructors take them.
pub(crate) fn get_handler<T: std::borrow::Borrow<str> + std::hash::Hash + std::cmp::Eq>(
    custom: &HashMap<String, Box<dyn TagHandlerFactory>>,
    tag_name: &T,
    commonmark: bool,
    url: &Option<Arc<Url>>,
) -> Box<dyn TagHandler> {
    let name: &str = tag_name.borrow();

    // User-supplied factories take precedence over the built-in table below.
    if let Some(factory) = custom.get(name) {
        return factory.instantiate();
    }

    match name {
        // containers
        "div" | "section" | "header" | "footer" => Box::new(ContainerHandler),
        // paragraphs and breaks
        "p" | "br" | "hr" => Box::new(ParagraphHandler::default()),
        "q" | "cite" | "blockquote" => Box::new(QuoteHandler::default()),
        "details" | "summary" => Box::new(HtmlCherryPickHandler::new(commonmark)),
        // inline formatting
        "b" | "i" | "s" | "strong" | "em" | "del" => Box::new(StyleHandler::default()),
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => Box::new(HeaderHandler::default()),
        "pre" | "code" => Box::new(CodeHandler::default()),
        // images and links
        "img" => Box::new(ImgHandler::new(commonmark, url)),
        "a" => Box::new(AnchorHandler::new(url)),
        // lists
        "ol" | "ul" | "menu" => Box::new(ListHandler),
        "li" => Box::new(ListItemHandler::default()),
        "sub" | "sup" => Box::new(IdentityHandler::new(commonmark)),
        // tables
        "table" => Box::new(TableHandler::new(commonmark, url.clone())),
        "iframe" => Box::new(IframeHandler),
        // everything else
        _ => Box::new(DummyHandler),
    }
}
/// Returns `false` for element nodes whose tag is listed in `SKIP_ELEMENTS`
/// and `true` for every other node, including all non-element nodes.
pub(crate) fn valid_block_element(node: &NodeData) -> bool {
    if let NodeData::Element { name, .. } = node {
        return !utils::inline_elements::SKIP_ELEMENTS.contains(&name.local);
    }
    true
}
/// Mutable state shared by the walker and the tag handlers while a document
/// is converted: the markdown produced so far plus positional bookkeeping.
#[derive(Debug, Default)]
pub struct StructuredPrinter {
    /// Names pushed for the nodes currently being walked, outermost first.
    pub parent_chain: Vec<String>,

    /// Tag names of already-walked child elements, keyed by parent-chain depth.
    pub siblings: HashMap<usize, Vec<String>>,

    /// Markdown text written so far.
    pub data: String,
}

impl StructuredPrinter {
    /// Appends a single line feed to the output.
    pub fn insert_newline(&mut self) {
        self.data.push('\n');
    }

    /// Appends `it` to the end of the output.
    pub fn append_str(&mut self, it: &str) {
        self.data += it;
    }

    /// Inserts `it` into the output at byte offset `pos`.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`String::insert_str`]: when `pos`
    /// is past the end of the output or not on a `char` boundary.
    pub fn insert_str(&mut self, pos: usize, it: &str) {
        String::insert_str(&mut self.data, pos, it);
    }
}
/// Factory for tag handlers, used to plug custom handling for a tag name into
/// the conversion. `get_handler` calls `instantiate` whenever an element whose
/// name is registered in the `custom` map is encountered.
pub trait TagHandlerFactory {
/// Creates a handler instance for one element.
fn instantiate(&self) -> Box<dyn TagHandler>;
}
/// Per-node callbacks invoked by `walk` while it traverses the DOM.
pub trait TagHandler {
/// Called by `walk` for the node `tag` before any of its children are walked.
fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter);
/// Called by `walk` after the node's children have been walked and the node
/// has been popped from `printer.parent_chain`.
fn after_handle(&mut self, printer: &mut StructuredPrinter);
/// When this returns `true`, `walk` does not descend into the node's children.
/// The default implementation returns `false`.
fn skip_descendants(&self) -> bool {
false
}
}