docbox_core/processing/
html_to_text.rs1use tl::{Node, Parser};
2
/// Names of the tags that [`extract_text`] treats as block-level: after the
/// contents of such a tag have been written, a single `'\n'` is appended to
/// the output.
///
/// `br` is part of the list so that an explicit line break also yields a
/// newline.
#[rustfmt::skip] // keep one category per line instead of one tag per line
const BLOCK_TAGS: &[&str] = &[
    // Generic containers and sectioning elements
    "div", "p", "section", "article", "header", "footer",
    // Headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    // Lists
    "ul", "ol", "li",
    // Preformatted and quoted text
    "pre", "blockquote",
    // Tables
    "table", "tr", "td", "th",
    // Explicit line break
    "br",
];
28
29pub fn html_to_text(html: &str) -> anyhow::Result<String> {
32 let dom = tl::parse(html, tl::ParserOptions::default())?;
33
34 let parser = dom.parser();
35
36 let mut output = String::new();
37
38 for child in dom.children() {
39 let node = match child.get(parser) {
40 Some(value) => value,
41 None => continue,
42 };
43 extract_text(parser, node, &mut output);
44 }
45
46 let decoded = html_escape::decode_html_entities(&output);
47 Ok(decoded.to_string())
48}
49
50fn extract_text<'doc>(parser: &Parser<'doc>, node: &Node<'doc>, out: &mut String) {
51 match node {
52 Node::Raw(text) => {
53 out.push_str(text.as_utf8_str().as_ref());
54 }
55 Node::Tag(tag) => {
56 let tag_name = tag.name().as_utf8_str();
57 let is_block = BLOCK_TAGS.contains(&tag_name.as_ref());
58
59 let children = tag.children();
60 let children = children.top();
61
62 for child in children.as_slice() {
63 let child = match child.get(parser) {
64 Some(value) => value,
65 None => continue,
66 };
67 extract_text(parser, child, out);
68 }
69
70 if is_block {
71 out.push('\n');
72 }
73 }
74 _ => {}
75 }
76}
77
/// Unit tests for [`html_to_text`].
///
/// The first group feeds HTML character references (named, decimal, hex) and
/// checks that they come out decoded; the second group checks where newlines
/// are emitted for block-level tags.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Entity decoding -------------------------------------------------
    // Inputs must contain the *encoded* references: a literal `<` or `"` in
    // the markup would be parsed as HTML syntax instead of text.

    #[test]
    fn test_html_entities_named() {
        let html = "<p>Tom &amp; Jerry &lt;3 &quot;quotes&quot; &apos;single&apos;</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Tom & Jerry <3 \"quotes\" 'single'\n");
    }

    #[test]
    fn test_html_entities_numeric_decimal() {
        let html = "<p>Smile &#128512; &#169; &#174;</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Smile 😀 © ®\n");
    }

    #[test]
    fn test_html_entities_numeric_hex() {
        let html = "<p>Heart &#x2764; &#x1F600;</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Heart ❤ 😀\n");
    }

    #[test]
    fn test_mixed_html_entities() {
        // Named, decimal and hexadecimal forms of the same characters.
        let html = "<p>Mix &amp; match &#38; &#x26; &lt; &#60;</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Mix & match & & < <\n");
    }

    #[test]
    fn test_html_entities_in_nested_tags() {
        let html = "<div>Price: &#36;100 <span>Tax: &#37;10</span></div>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Price: $100 Tax: %10\n");
    }

    // ---- Block / inline structure ----------------------------------------

    #[test]
    fn test_simple_paragraph() {
        let html = "<p>Hello, <strong>world</strong>!</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Hello, world!\n");
    }

    #[test]
    fn test_simple_paragraph_with_br() {
        let html = "<p>Hello, <strong>world</strong>!</p><br>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Hello, world!\n\n");
    }

    #[test]
    fn test_multiple_block_elements() {
        let html = "<h1>Title</h1><p>Paragraph 1.</p><p>Paragraph 2.</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Title\nParagraph 1.\nParagraph 2.\n");
    }

    #[test]
    fn test_nested_blocks() {
        // The inner <p> and the outer <div> each add their own newline.
        let html = "<div><h1>Header</h1><p>Paragraph <em>with</em> emphasis.</p></div>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Header\nParagraph with emphasis.\n\n");
    }

    #[test]
    fn test_list_items() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Item 1\nItem 2\n\n");
    }

    #[test]
    fn test_mixed_inline_and_block() {
        let html = "<div>Block <span>inline</span> text.</div>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Block inline text.\n");
    }

    #[test]
    fn test_table_with_tr_td() {
        // One newline per cell, one per row, one for the table itself.
        let html = "<table><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Cell 1\nCell 2\n\nCell 3\nCell 4\n\n\n");
    }

    #[test]
    fn test_header_footer() {
        let html = "<header>Header content</header><footer>Footer content</footer>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Header content\nFooter content\n");
    }

    #[test]
    fn test_blockquote() {
        let html = "<blockquote>This is a quote.</blockquote>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "This is a quote.\n");
    }

    #[test]
    fn test_preformatted_text() {
        // Newlines already present in the raw text are kept as-is.
        let html = "<pre>Code\nblock</pre>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Code\nblock\n");
    }

    #[test]
    fn test_heading_levels() {
        let html = "<h1>H1</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "H1\nH2\nH3\nH4\nH5\nH6\n");
    }

    #[test]
    fn test_section_article() {
        let html = "<section>Section content</section><article>Article content</article>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Section content\nArticle content\n");
    }

    #[test]
    fn test_empty_tags() {
        // Empty block tags still emit their trailing newline.
        let html = "<p></p><div></div><section></section>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "\n\n\n");
    }

    #[test]
    fn test_nested_blocks_with_inline() {
        let html = "<div><p>Paragraph with <strong>bold</strong> text.</p><footer>Footer <em>content</em>.</footer></div>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Paragraph with bold text.\nFooter content.\n\n");
    }

    #[test]
    fn test_br_inside_paragraph() {
        let html = "<p>Line 1<br>Line 2<br>Line 3</p>";
        let text = html_to_text(html).unwrap();
        assert_eq!(text, "Line 1\nLine 2\nLine 3\n");
    }
}