htmd/
lib.rs

1mod dom_walker;
2pub mod element_handler;
3pub(crate) mod node_util;
4pub mod options;
5pub(crate) mod text_util;
6
7use std::rc::Rc;
8
9use dom_walker::walk_node;
10use element_handler::{ElementHandler, ElementHandlers};
11use html5ever::tendril::TendrilSink;
12use html5ever::{parse_document, Attribute};
13use markup5ever_rcdom::{Node, RcDom};
14use options::Options;
15
16/// Convert HTML to Markdown.
17///
18/// Example:
19///
20/// ```
21/// use htmd::convert;
22///
23/// let md = convert("<h1>Hello</h1>").unwrap();
24/// assert_eq!("# Hello", md);
25/// ```
26pub fn convert(html: &str) -> Result<String, std::io::Error> {
27    HtmlToMarkdown::new().convert(html)
28}
29
30/// The DOM element.
31pub struct Element<'a> {
32    /// The html5ever node of the element.
33    pub node: &'a Rc<Node>,
34    /// The tag name.
35    pub tag: &'a str,
36    /// The attribute list.
37    pub attrs: &'a [Attribute],
38    /// The content text, can be raw text or converted Markdown text.
39    pub content: &'a str,
40    /// Converter options.
41    pub options: &'a Options,
42}
43
44/// The html-to-markdown converter.
45///
46/// # Example
47/// ```
48/// use htmd::{Element, HtmlToMarkdown};
49///
50/// // One-liner
51/// let md = HtmlToMarkdown::new().convert("<h1>Hello</h1>").unwrap();
52/// assert_eq!("# Hello", md);
53///
54/// // Or use the builder pattern
55/// let converter = HtmlToMarkdown::builder()
56///     .skip_tags(vec!["img"])
57///     .build();
58/// let md = converter.convert("<img src=\"https://example.com\">").unwrap();
59/// // img is ignored
60/// assert_eq!("", md);
61/// ```
62pub struct HtmlToMarkdown {
63    options: Options,
64    handlers: ElementHandlers,
65}
66
67impl Default for HtmlToMarkdown {
68    fn default() -> Self {
69        Self::new()
70    }
71}
72
73impl HtmlToMarkdown {
74    /// Create a new converter.
75    pub fn new() -> Self {
76        Self {
77            options: Options::default(),
78            handlers: ElementHandlers::new(),
79        }
80    }
81
82    pub(crate) fn from_params(options: Options, handlers: ElementHandlers) -> Self {
83        Self { options, handlers }
84    }
85
86    /// Create a new [HtmlToMarkdownBuilder].
87    pub fn builder() -> HtmlToMarkdownBuilder {
88        HtmlToMarkdownBuilder::new()
89    }
90
91    /// Convert HTML to Markdown.
92    pub fn convert(&self, html: &str) -> std::io::Result<String> {
93        let dom = parse_document(RcDom::default(), Default::default())
94            .from_utf8()
95            .read_from(&mut html.as_bytes())?;
96
97        let mut buffer: Vec<String> = Vec::new();
98
99        walk_node(
100            &dom.document,
101            None,
102            &mut buffer,
103            &self.handlers,
104            &self.options,
105            false,
106            true,
107        );
108
109        let mut content = buffer.join("").trim_matches(|ch| ch == '\n').to_string();
110
111        let mut append = String::new();
112        for rule in &self.handlers.rules {
113            let Some(append_content) = rule.handler.append() else {
114                continue;
115            };
116            append.push_str(&append_content);
117        }
118
119        content.push_str(append.trim_end_matches('\n'));
120
121        Ok(content)
122    }
123}
124
125/// The [HtmlToMarkdown] builder for advanced configurations.
126pub struct HtmlToMarkdownBuilder {
127    options: Options,
128    handlers: ElementHandlers,
129}
130
131impl Default for HtmlToMarkdownBuilder {
132    fn default() -> Self {
133        Self::new()
134    }
135}
136
137impl HtmlToMarkdownBuilder {
138    /// Create a new builder.
139    pub fn new() -> Self {
140        Self {
141            options: Options::default(),
142            handlers: ElementHandlers::new(),
143        }
144    }
145
146    /// Set converting options.
147    pub fn options(mut self, options: Options) -> Self {
148        self.options = options;
149        self
150    }
151
152    /// Skip a group of tags when converting.
153    pub fn skip_tags(self, tags: Vec<&str>) -> Self {
154        self.add_handler(tags, |_: Element| None)
155    }
156
157    /// Apply a custom element handler for a group of tags.
158    ///
159    /// # Example
160    ///
161    /// ```
162    /// use htmd::{Element, HtmlToMarkdownBuilder};
163    ///
164    /// let mut handlers = HtmlToMarkdownBuilder::new()
165    ///    .add_handler(vec!["img"], |_: Element| {
166    ///        // Skip the img tag when converting.
167    ///        None
168    ///    })
169    ///    .add_handler(vec!["video"], |element: Element| {
170    ///        // Handle the video tag.
171    ///        todo!("Return some text to represent this video element.")
172    ///    });
173    /// ```
174    pub fn add_handler<Handler>(mut self, tags: Vec<&str>, handler: Handler) -> Self
175    where
176        Handler: ElementHandler + 'static,
177    {
178        self.handlers.add_handler(tags, handler);
179        self
180    }
181
182    /// Create a new [HtmlToMarkdown].
183    pub fn build(self) -> HtmlToMarkdown {
184        HtmlToMarkdown::from_params(self.options, self.handlers)
185    }
186}