docbox_core/processing/
html_to_text.rs

1use tl::{Node, Parser};
2
/// Tags that are considered blocks and should have a newline appended
/// after their content.
///
/// All names are lowercase. Besides the elements browsers lay out as
/// blocks, the list contains `br`, `tr`, `td` and `th` so that explicit
/// line breaks and table cells are separated in the text output.
const BLOCK_TAGS: &[&str] = &[
    // Generic containers and sectioning content
    "div",
    "p",
    "section",
    "article",
    "header",
    "footer",
    // Headings
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    // Lists
    "ul",
    "ol",
    "li",
    // Preformatted and quoted text
    "pre",
    "blockquote",
    // Tables
    "table",
    "tr",
    "td",
    "th",
    // Explicit line break
    "br",
    // Further block-level elements; without these the text of adjacent
    // elements would be glued together with no separator
    "main",
    "nav",
    "aside",
    "address",
    "hgroup",
    "hr",
    "dl",
    "dt",
    "dd",
    "figure",
    "figcaption",
    "form",
    "fieldset",
    "details",
    "dialog",
];
28
29/// Convert the provided `html` into a text representation maintaining
30/// the newlines that would be produced by block elements
31pub fn html_to_text(html: &str) -> anyhow::Result<String> {
32    let dom = tl::parse(html, tl::ParserOptions::default())?;
33
34    let parser = dom.parser();
35
36    let mut output = String::new();
37
38    for child in dom.children() {
39        let node = match child.get(parser) {
40            Some(value) => value,
41            None => continue,
42        };
43        extract_text(parser, node, &mut output);
44    }
45
46    let decoded = html_escape::decode_html_entities(&output);
47    Ok(decoded.to_string())
48}
49
50fn extract_text<'doc>(parser: &Parser<'doc>, node: &Node<'doc>, out: &mut String) {
51    match node {
52        Node::Raw(text) => {
53            out.push_str(text.as_utf8_str().as_ref());
54        }
55        Node::Tag(tag) => {
56            let tag_name = tag.name().as_utf8_str();
57            let is_block = BLOCK_TAGS.contains(&tag_name.as_ref());
58
59            let children = tag.children();
60            let children = children.top();
61
62            for child in children.as_slice() {
63                let child = match child.get(parser) {
64                    Some(value) => value,
65                    None => continue,
66                };
67                extract_text(parser, child, out);
68            }
69
70            if is_block {
71                out.push('\n');
72            }
73        }
74        _ => {}
75    }
76}
77
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts `html` and checks the produced text against `expected`,
    /// panicking when the conversion itself fails
    #[track_caller]
    fn assert_text(html: &str, expected: &str) {
        let actual = html_to_text(html).unwrap();
        assert_eq!(actual, expected);
    }

    // Entity decoding

    #[test]
    fn test_html_entities_named() {
        assert_text(
            "<p>Tom &amp; Jerry &lt;3 &quot;quotes&quot; &apos;single&apos;</p>",
            "Tom & Jerry <3 \"quotes\" 'single'\n",
        );
    }

    #[test]
    fn test_html_entities_numeric_decimal() {
        assert_text("<p>Smile &#128512; &#169; &#174;</p>", "Smile 😀 © ®\n");
    }

    #[test]
    fn test_html_entities_numeric_hex() {
        assert_text("<p>Heart &#x2764; &#x1F600;</p>", "Heart ❤ 😀\n");
    }

    #[test]
    fn test_mixed_html_entities() {
        assert_text(
            "<p>Mix &amp; match &#38; &#x26; &lt; &#60;</p>",
            "Mix & match & & < <\n",
        );
    }

    #[test]
    fn test_html_entities_in_nested_tags() {
        assert_text(
            "<div>Price: &dollar;100 <span>Tax: &#37;10</span></div>",
            "Price: $100 Tax: %10\n",
        );
    }

    // Paragraphs and line breaks

    #[test]
    fn test_simple_paragraph() {
        assert_text("<p>Hello, <strong>world</strong>!</p>", "Hello, world!\n");
    }

    #[test]
    fn test_simple_paragraph_with_br() {
        assert_text(
            "<p>Hello, <strong>world</strong>!</p><br>",
            "Hello, world!\n\n",
        );
    }

    // Block structure

    #[test]
    fn test_multiple_block_elements() {
        assert_text(
            "<h1>Title</h1><p>Paragraph 1.</p><p>Paragraph 2.</p>",
            "Title\nParagraph 1.\nParagraph 2.\n",
        );
    }

    #[test]
    fn test_nested_blocks() {
        assert_text(
            "<div><h1>Header</h1><p>Paragraph <em>with</em> emphasis.</p></div>",
            "Header\nParagraph with emphasis.\n\n",
        );
    }

    #[test]
    fn test_list_items() {
        assert_text(
            "<ul><li>Item 1</li><li>Item 2</li></ul>",
            "Item 1\nItem 2\n\n",
        );
    }

    #[test]
    fn test_mixed_inline_and_block() {
        assert_text(
            "<div>Block <span>inline</span> text.</div>",
            "Block inline text.\n",
        );
    }

    #[test]
    fn test_table_with_tr_td() {
        assert_text(
            "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table>",
            "Cell 1\nCell 2\n\nCell 3\nCell 4\n\n\n",
        );
    }

    #[test]
    fn test_header_footer() {
        assert_text(
            "<header>Header content</header><footer>Footer content</footer>",
            "Header content\nFooter content\n",
        );
    }

    #[test]
    fn test_blockquote() {
        assert_text(
            "<blockquote>This is a quote.</blockquote>",
            "This is a quote.\n",
        );
    }

    #[test]
    fn test_preformatted_text() {
        assert_text("<pre>Code\nblock</pre>", "Code\nblock\n");
    }

    #[test]
    fn test_heading_levels() {
        assert_text(
            "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>",
            "H1\nH2\nH3\nH4\nH5\nH6\n",
        );
    }

    #[test]
    fn test_section_article() {
        assert_text(
            "<section>Section content</section><article>Article content</article>",
            "Section content\nArticle content\n",
        );
    }

    #[test]
    fn test_empty_tags() {
        assert_text("<p></p><div></div><section></section>", "\n\n\n");
    }

    #[test]
    fn test_nested_blocks_with_inline() {
        assert_text(
            "<div><p>Paragraph with <strong>bold</strong> text.</p><footer>Footer <em>content</em>.</footer></div>",
            "Paragraph with bold text.\nFooter content.\n\n",
        );
    }

    #[test]
    fn test_br_inside_paragraph() {
        assert_text(
            "<p>Line 1<br>Line 2<br>Line 3</p>",
            "Line 1\nLine 2\nLine 3\n",
        );
    }
}