mod dom_walker;
pub mod element_handler;
pub(crate) mod node_util;
pub mod options;
pub(crate) mod text_util;
use std::rc::Rc;
use dom_walker::walk_node;
use element_handler::{ElementHandler, ElementHandlers};
use html5ever::tendril::TendrilSink;
use html5ever::tree_builder::TreeBuilderOpts;
use html5ever::{parse_document, Attribute, ParseOpts};
use markup5ever_rcdom::{Node, RcDom};
use options::Options;
/// Converts an HTML string to Markdown with a default-configured converter.
///
/// Shorthand for building an [`HtmlToMarkdown`] via [`HtmlToMarkdown::new`]
/// and calling its `convert` method once.
///
/// # Errors
///
/// Propagates any `std::io::Error` returned by [`HtmlToMarkdown::convert`].
pub fn convert(html: &str) -> Result<String, std::io::Error> {
    let converter = HtmlToMarkdown::new();
    converter.convert(html)
}
/// Borrowed view of one HTML element, as received by element handlers
/// (for example the `|_: Element| None` closure registered by `skip_tags`).
///
/// Every field is a borrow with lifetime `'a`; the struct owns nothing.
pub struct Element<'a> {
/// The DOM node backing this element.
pub node: &'a Rc<Node>,
/// The element's tag name.
pub tag: &'a str,
/// The element's attributes as parsed by html5ever.
pub attrs: &'a [Attribute],
/// Text content associated with the element.
/// NOTE(review): presumably the already-converted Markdown of the element's
/// children -- confirm against `dom_walker`.
pub content: &'a str,
/// Conversion options.
/// NOTE(review): presumably the options of the converter driving the walk -- confirm.
pub options: &'a Options,
}
/// HTML-to-Markdown converter.
///
/// Create one with `HtmlToMarkdown::new` for the defaults, or configure one
/// through `HtmlToMarkdown::builder`, then call `convert`.
pub struct HtmlToMarkdown {
/// Options handed to the DOM walker on every conversion.
options: Options,
/// Element handlers handed to the DOM walker; their rules are also asked
/// for extra content to append after the walk.
handlers: ElementHandlers,
/// Value forwarded to html5ever's `TreeBuilderOpts::scripting_enabled`
/// when the input is parsed.
scripting_enabled: bool,
}
impl Default for HtmlToMarkdown {
fn default() -> Self {
Self::new()
}
}
impl HtmlToMarkdown {
pub fn new() -> Self {
Self {
options: Options::default(),
handlers: ElementHandlers::new(),
scripting_enabled: true,
}
}
pub(crate) fn from_params(
options: Options,
handlers: ElementHandlers,
scripting_enabled: bool,
) -> Self {
Self {
options,
handlers,
scripting_enabled,
}
}
pub fn builder() -> HtmlToMarkdownBuilder {
HtmlToMarkdownBuilder::new()
}
pub fn convert(&self, html: &str) -> std::io::Result<String> {
let dom = parse_document(
RcDom::default(),
ParseOpts {
tree_builder: TreeBuilderOpts {
scripting_enabled: self.scripting_enabled,
..Default::default()
},
..Default::default()
},
)
.from_utf8()
.read_from(&mut html.as_bytes())?;
let mut buffer: Vec<String> = Vec::new();
walk_node(
&dom.document,
None,
&mut buffer,
&self.handlers,
&self.options,
false,
true,
);
let mut content = buffer.join("").trim_matches(|ch| ch == '\n').to_string();
let mut append = String::new();
for rule in &self.handlers.rules {
let Some(append_content) = rule.handler.append() else {
continue;
};
append.push_str(&append_content);
}
content.push_str(append.trim_end_matches('\n'));
Ok(content)
}
}
/// Builder for `HtmlToMarkdown`.
///
/// Collects options, element handlers and the scripting flag, then hands
/// them to the converter in `build`.
pub struct HtmlToMarkdownBuilder {
/// Options the built converter will use.
options: Options,
/// Element handlers the built converter will use.
handlers: ElementHandlers,
/// Scripting flag the built converter will use when parsing.
scripting_enabled: bool,
}
impl Default for HtmlToMarkdownBuilder {
fn default() -> Self {
Self::new()
}
}
impl HtmlToMarkdownBuilder {
pub fn new() -> Self {
Self {
options: Options::default(),
handlers: ElementHandlers::new(),
scripting_enabled: true,
}
}
pub fn options(mut self, options: Options) -> Self {
self.options = options;
self
}
pub fn skip_tags(self, tags: Vec<&str>) -> Self {
self.add_handler(tags, |_: Element| None)
}
pub fn add_handler<Handler>(mut self, tags: Vec<&str>, handler: Handler) -> Self
where
Handler: ElementHandler + 'static,
{
self.handlers.add_handler(tags, handler);
self
}
pub fn scripting_enabled(mut self, enabled: bool) -> Self {
self.scripting_enabled = enabled;
self
}
pub fn build(self) -> HtmlToMarkdown {
HtmlToMarkdown::from_params(self.options, self.handlers, self.scripting_enabled)
}
}