1mod dom_walker;
2pub mod element_handler;
3pub(crate) mod node_util;
4pub mod options;
5pub(crate) mod text_util;
6
7use std::rc::Rc;
8
9use dom_walker::walk_node;
10use element_handler::{ElementHandler, ElementHandlers};
11use html5ever::tendril::TendrilSink;
12use html5ever::{parse_document, Attribute};
13use markup5ever_rcdom::{Node, RcDom};
14use options::Options;
15
16pub fn convert(html: &str) -> Result<String, std::io::Error> {
27 HtmlToMarkdown::new().convert(html)
28}
29
30pub struct Element<'a> {
32 pub node: &'a Rc<Node>,
34 pub tag: &'a str,
36 pub attrs: &'a [Attribute],
38 pub content: &'a str,
40 pub options: &'a Options,
42}
43
44pub struct HtmlToMarkdown {
63 options: Options,
64 handlers: ElementHandlers,
65}
66
67impl Default for HtmlToMarkdown {
68 fn default() -> Self {
69 Self::new()
70 }
71}
72
73impl HtmlToMarkdown {
74 pub fn new() -> Self {
76 Self {
77 options: Options::default(),
78 handlers: ElementHandlers::new(),
79 }
80 }
81
82 pub(crate) fn from_params(options: Options, handlers: ElementHandlers) -> Self {
83 Self { options, handlers }
84 }
85
86 pub fn builder() -> HtmlToMarkdownBuilder {
88 HtmlToMarkdownBuilder::new()
89 }
90
91 pub fn convert(&self, html: &str) -> std::io::Result<String> {
93 let dom = parse_document(RcDom::default(), Default::default())
94 .from_utf8()
95 .read_from(&mut html.as_bytes())?;
96
97 let mut buffer: Vec<String> = Vec::new();
98
99 walk_node(
100 &dom.document,
101 None,
102 &mut buffer,
103 &self.handlers,
104 &self.options,
105 false,
106 true,
107 );
108
109 let mut content = buffer.join("").trim_matches(|ch| ch == '\n').to_string();
110
111 let mut append = String::new();
112 for rule in &self.handlers.rules {
113 let Some(append_content) = rule.handler.append() else {
114 continue;
115 };
116 append.push_str(&append_content);
117 }
118
119 content.push_str(append.trim_end_matches('\n'));
120
121 Ok(content)
122 }
123}
124
125pub struct HtmlToMarkdownBuilder {
127 options: Options,
128 handlers: ElementHandlers,
129}
130
131impl Default for HtmlToMarkdownBuilder {
132 fn default() -> Self {
133 Self::new()
134 }
135}
136
137impl HtmlToMarkdownBuilder {
138 pub fn new() -> Self {
140 Self {
141 options: Options::default(),
142 handlers: ElementHandlers::new(),
143 }
144 }
145
146 pub fn options(mut self, options: Options) -> Self {
148 self.options = options;
149 self
150 }
151
152 pub fn skip_tags(self, tags: Vec<&str>) -> Self {
154 self.add_handler(tags, |_: Element| None)
155 }
156
157 pub fn add_handler<Handler>(mut self, tags: Vec<&str>, handler: Handler) -> Self
175 where
176 Handler: ElementHandler + 'static,
177 {
178 self.handlers.add_handler(tags, handler);
179 self
180 }
181
182 pub fn build(self) -> HtmlToMarkdown {
184 HtmlToMarkdown::from_params(self.options, self.handlers)
185 }
186}