mod dom_walker;
pub mod element_handler;
mod html_escape;
pub(crate) mod node_util;
pub mod options;
pub(crate) mod text_util;
use std::rc::Rc;
use dom_walker::walk_node;
use element_handler::{ElementHandler, ElementHandlers};
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use html5ever::{Attribute, ParseOpts, parse_document};
pub use markup5ever_rcdom::Node;
use markup5ever_rcdom::RcDom;
use options::Options;
use crate::element_handler::Handlers;
/// Converts an HTML string to Markdown with a default-configured
/// [`HtmlToMarkdown`] converter.
///
/// # Errors
///
/// Returns whatever I/O error [`HtmlToMarkdown::convert`] reports.
pub fn convert(html: &str) -> Result<String, std::io::Error> {
    let converter = HtmlToMarkdown::new();
    converter.convert(html)
}
/// Borrowed description of a single element, as passed by value to element
/// handlers (see the handler closure signature used by
/// `HtmlToMarkdownBuilder::skip_tags`).
pub struct Element<'a> {
/// The DOM node this element refers to.
pub node: &'a Rc<Node>,
/// Tag name of the element.
// NOTE(review): casing/normalisation of the tag is decided by the caller
// that builds this struct — not visible here; confirm in `dom_walker`.
pub tag: &'a str,
/// Attributes attached to the element.
pub attrs: &'a [Attribute],
/// NOTE(review): presumably signals whether the surrounding content is
/// being translated to Markdown (as opposed to emitted verbatim) — the
/// flag is set outside this file; confirm against the walker.
pub markdown_translated: bool,
// NOTE(review): crate-internal bookkeeping; looks like a count of handlers
// already skipped for this element — confirm in `element_handler`.
pub(crate) skipped_handlers: usize,
}
/// HTML-to-Markdown converter.
///
/// Build one with [`HtmlToMarkdown::new`] for the defaults, or with
/// [`HtmlToMarkdown::builder`] to customise it.
pub struct HtmlToMarkdown {
// Registered element handlers (also carries the conversion options).
handlers: ElementHandlers,
// Forwarded to html5ever's `TreeBuilderOpts::scripting_enabled` when parsing.
scripting_enabled: bool,
}
impl Default for HtmlToMarkdown {
fn default() -> Self {
Self::new()
}
}
impl HtmlToMarkdown {
    /// Creates a converter with default [`Options`] and scripting enabled.
    pub fn new() -> Self {
        Self::from_params(ElementHandlers::new(Options::default()), true)
    }

    /// Assembles a converter from an already-configured handler set and
    /// scripting flag.
    pub(crate) fn from_params(handlers: ElementHandlers, scripting_enabled: bool) -> Self {
        Self {
            handlers,
            scripting_enabled,
        }
    }

    /// Returns a builder for configuring a converter before constructing it.
    pub fn builder() -> HtmlToMarkdownBuilder {
        HtmlToMarkdownBuilder::new()
    }

    /// Parses `html` as a full document and returns the document root node.
    ///
    /// The converter's `scripting_enabled` flag is passed to the html5ever
    /// tree builder; every other parse option keeps its default.
    ///
    /// # Errors
    ///
    /// Returns the I/O error reported while feeding the input to the parser.
    pub fn html_to_tree(&self, html: &str) -> std::io::Result<Rc<Node>> {
        let tree_builder = TreeBuilderOpts {
            scripting_enabled: self.scripting_enabled,
            ..Default::default()
        };
        let parse_opts = ParseOpts {
            tree_builder,
            ..Default::default()
        };

        let mut input = html.as_bytes();
        let dom = parse_document(RcDom::default(), parse_opts)
            .from_utf8()
            .read_from(&mut input)?;
        Ok(dom.document)
    }

    /// Renders a parsed DOM tree as Markdown.
    ///
    /// The walked output is stripped of leading and trailing newlines, then
    /// any content the handlers ask to append is concatenated (in handler
    /// order) and added with its own trailing newlines removed.
    pub fn tree_to_markdown(&self, tree: &Rc<Node>) -> String {
        let mut walked = String::new();
        walk_node(tree, &mut walked, &self.handlers, None, true, false);
        let mut markdown = walked.trim_matches('\n').to_string();

        // Collect the optional trailing content contributed by each handler.
        let mut trailer = String::new();
        for handler in &self.handlers.handlers {
            if let Some(extra) = handler.append() {
                trailer.push_str(&extra);
            }
        }

        markdown.push_str(trailer.trim_end_matches('\n'));
        markdown
    }

    /// Converts an HTML string to Markdown by parsing it and rendering the
    /// resulting tree.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`HtmlToMarkdown::html_to_tree`].
    pub fn convert(&self, html: &str) -> std::io::Result<String> {
        let tree = self.html_to_tree(html)?;
        Ok(self.tree_to_markdown(&tree))
    }
}
/// Builder for [`HtmlToMarkdown`].
///
/// Configure it with the chained setter methods, then call
/// [`HtmlToMarkdownBuilder::build`] to obtain the converter.
pub struct HtmlToMarkdownBuilder {
// Handler set (and options) that will be moved into the built converter.
handlers: ElementHandlers,
// Scripting flag that will be moved into the built converter.
scripting_enabled: bool,
}
impl Default for HtmlToMarkdownBuilder {
fn default() -> Self {
Self::new()
}
}
impl HtmlToMarkdownBuilder {
    /// Starts a builder with default [`Options`] and scripting enabled.
    pub fn new() -> Self {
        Self {
            handlers: ElementHandlers::new(Options::default()),
            scripting_enabled: true,
        }
    }

    /// Replaces the conversion options stored on the handler set.
    pub fn options(mut self, options: Options) -> Self {
        self.handlers.options = options;
        self
    }

    /// Registers a handler for `tags` that always returns `None`.
    pub fn skip_tags(self, tags: Vec<&str>) -> Self {
        // The closure ignores both of its arguments.
        self.add_handler(tags, |_: &dyn Handlers, _: Element| None)
    }

    /// Registers `handler` for each of the given `tags`.
    pub fn add_handler<Handler>(mut self, tags: Vec<&str>, handler: Handler) -> Self
    where
        Handler: ElementHandler + 'static,
    {
        self.handlers.add_handler(tags, handler);
        self
    }

    /// Sets the scripting flag that the built converter passes to the HTML
    /// parser.
    pub fn scripting_enabled(mut self, enabled: bool) -> Self {
        self.scripting_enabled = enabled;
        self
    }

    /// Consumes the builder and produces the configured [`HtmlToMarkdown`].
    pub fn build(self) -> HtmlToMarkdown {
        let Self {
            handlers,
            scripting_enabled,
        } = self;
        HtmlToMarkdown::from_params(handlers, scripting_enabled)
    }
}