Skip to main content

ferrum_email_render/
text_extractor.rs

1//! Plain text extraction from a Node tree.
2//!
3//! Walks the component tree and extracts readable plain text,
4//! stripping all HTML markup and converting structural elements
5//! into text-friendly formatting.
6
7use ferrum_email_core::{Node, Tag};
8
9/// Extract plain text from a Node tree.
10///
11/// Converts the node tree into a human-readable plain text representation:
12/// - Text nodes are included as-is
13/// - Block-level elements (p, h1-h6, div, tr) get newlines around them
14/// - Links become `text (url)`
15/// - Horizontal rules become `---`
16/// - Images become `[alt text]`
17/// - Fragments and other nodes are recursively processed
18pub fn extract_text(node: &Node) -> String {
19    let mut output = String::new();
20    extract_node(node, &mut output);
21    // Clean up excessive whitespace
22    clean_text(&output)
23}
24
25fn extract_node(node: &Node, output: &mut String) {
26    match node {
27        Node::Text(text) => {
28            output.push_str(text);
29        }
30        Node::Element(element) => {
31            let tag = &element.tag;
32
33            // Handle special elements
34            match tag {
35                Tag::A => {
36                    // Extract link text and href
37                    let link_text = extract_children_text(&element.children);
38                    let href = element
39                        .attrs
40                        .iter()
41                        .find(|a| a.name == "href")
42                        .map(|a| a.value.as_str())
43                        .unwrap_or("");
44
45                    if !link_text.is_empty() && !href.is_empty() && link_text != href {
46                        output.push_str(&link_text);
47                        output.push_str(" (");
48                        output.push_str(href);
49                        output.push(')');
50                    } else if !link_text.is_empty() {
51                        output.push_str(&link_text);
52                    } else if !href.is_empty() {
53                        output.push_str(href);
54                    }
55                    return;
56                }
57                Tag::Img => {
58                    let alt = element
59                        .attrs
60                        .iter()
61                        .find(|a| a.name == "alt")
62                        .map(|a| a.value.as_str())
63                        .unwrap_or("");
64                    if !alt.is_empty() {
65                        output.push('[');
66                        output.push_str(alt);
67                        output.push(']');
68                    }
69                    return;
70                }
71                Tag::Hr => {
72                    output.push_str("\n---\n");
73                    return;
74                }
75                Tag::Br => {
76                    output.push('\n');
77                    return;
78                }
79                Tag::Head | Tag::Meta | Tag::Title => {
80                    // Skip head content in plain text
81                    return;
82                }
83                _ => {}
84            }
85
86            // Check if preview text (hidden div) — skip it
87            if is_hidden_element(element) {
88                return;
89            }
90
91            let is_block = is_block_element(tag);
92
93            if is_block {
94                output.push('\n');
95            }
96
97            for child in &element.children {
98                extract_node(child, output);
99            }
100
101            if is_block {
102                output.push('\n');
103            }
104        }
105        Node::Fragment(nodes) => {
106            for node in nodes {
107                extract_node(node, output);
108            }
109        }
110        Node::None => {}
111    }
112}
113
114fn extract_children_text(children: &[Node]) -> String {
115    let mut output = String::new();
116    for child in children {
117        extract_node(child, &mut output);
118    }
119    output.trim().to_string()
120}
121
122fn is_block_element(tag: &Tag) -> bool {
123    matches!(
124        tag,
125        Tag::P
126            | Tag::Div
127            | Tag::H1
128            | Tag::H2
129            | Tag::H3
130            | Tag::H4
131            | Tag::H5
132            | Tag::H6
133            | Tag::Tr
134            | Tag::Table
135            | Tag::Pre
136    )
137}
138
139fn is_hidden_element(element: &ferrum_email_core::Element) -> bool {
140    // Check for display:none in style
141    if let Some(ref display) = element.style.display
142        && *display == ferrum_email_core::Display::None
143    {
144        return true;
145    }
146    // Check for style attribute containing display:none
147    element
148        .attrs
149        .iter()
150        .any(|a| a.name == "style" && a.value.contains("display:none"))
151}
152
/// Normalise whitespace in extracted text.
///
/// - Every line is trimmed of leading and trailing whitespace.
/// - Runs of blank lines are capped at two consecutive blank lines.
/// - Leading and trailing whitespace of the whole text is removed, so the
///   result never starts or ends with a newline.
///
/// Line endings are normalised to `\n` as a side effect of re-joining the
/// lines produced by [`str::lines`].
fn clean_text(input: &str) -> String {
    /// Maximum number of consecutive blank lines kept in the output.
    const MAX_CONSECUTIVE_BLANK_LINES: usize = 2;

    let mut result = String::with_capacity(input.len());
    let mut blank_run = 0usize;

    for line in input.lines().map(str::trim) {
        if line.is_empty() {
            blank_run += 1;
            if blank_run <= MAX_CONSECUTIVE_BLANK_LINES {
                result.push('\n');
            }
        } else {
            blank_run = 0;
            // Every append ends in '\n', so the text line always starts on a
            // fresh line without any extra separator.
            result.push_str(line);
            result.push('\n');
        }
    }

    result.trim().to_string()
}
182
#[cfg(test)]
mod tests {
    use super::*;
    use ferrum_email_core::{Element, Node, Tag};

    /// A bare text node is returned unchanged.
    #[test]
    fn test_extract_text_from_text_node() {
        let rendered = extract_text(&Node::text("Hello, World!"));
        assert_eq!(rendered, "Hello, World!");
    }

    /// A link with distinct text and href renders as `text (url)`.
    #[test]
    fn test_extract_text_from_link() {
        let anchor = Element::new(Tag::A)
            .attr("href", "https://example.com")
            .child(Node::text("Click here"));
        let rendered = extract_text(&Node::Element(anchor));
        assert_eq!(rendered, "Click here (https://example.com)");
    }

    /// A horizontal rule renders as `---` with surrounding newlines trimmed.
    #[test]
    fn test_extract_text_from_hr() {
        let rule = Element::new(Tag::Hr);
        let rendered = extract_text(&Node::Element(rule));
        assert_eq!(rendered, "---");
    }

    /// An image renders as its alt text in square brackets.
    #[test]
    fn test_extract_text_from_image() {
        let image = Element::new(Tag::Img)
            .attr("alt", "Logo")
            .attr("src", "logo.png");
        let rendered = extract_text(&Node::Element(image));
        assert_eq!(rendered, "[Logo]");
    }
}