scrape_core/
serialize.rs

1//! HTML serialization utilities.
2//!
3//! This module provides functions for serializing DOM nodes back to HTML
4//! and extracting text content. These functions are used by all bindings
5//! (Python, Node.js, WASM) to implement `inner_html`, `outer_html`, and `text`
6//! properties.
7
8use crate::{
9    Document, NodeId, NodeKind, Tag,
10    utils::{escape_attr, escape_text, is_void_element},
11};
12
13/// Serializes a DOM node and its subtree to HTML.
14///
15/// This function recursively serializes an element, its attributes, and all
16/// descendant nodes to an HTML string. The output is appended to the provided
17/// buffer.
18///
19/// # Serialization Rules
20///
21/// - **Elements**: Serialized as `<name attrs>children</name>` or `<name attrs>` for void elements
22/// - **Text nodes**: Content is HTML-escaped using [`escape_text`]
23/// - **Comments**: Serialized as `<!--content-->`
24/// - **Attributes**: Values are HTML-escaped using [`escape_attr`]
25///
26/// # Examples
27///
28/// ```rust
29/// use scrape_core::{Soup, serialize::serialize_node};
30///
31/// let soup = Soup::parse("<div class=\"test\"><span>Hello</span></div>");
32/// let doc = soup.document();
33/// let div_id = soup.find("div").unwrap().unwrap().node_id();
34///
35/// let mut html = String::new();
36/// serialize_node(doc, div_id, &mut html);
37/// assert!(html.contains("<div"));
38/// assert!(html.contains("</div>"));
39/// ```
40pub fn serialize_node(doc: &Document, id: NodeId, buf: &mut String) {
41    let Some(node) = doc.get(id) else { return };
42
43    match &node.kind {
44        NodeKind::Element { name, attributes, .. } => {
45            buf.push('<');
46            buf.push_str(name);
47
48            for (attr_name, attr_value) in attributes {
49                buf.push(' ');
50                buf.push_str(attr_name);
51                buf.push_str("=\"");
52                buf.push_str(&escape_attr(attr_value));
53                buf.push('"');
54            }
55
56            buf.push('>');
57
58            if !is_void_element(name) {
59                for child_id in doc.children(id) {
60                    serialize_node(doc, child_id, buf);
61                }
62                buf.push_str("</");
63                buf.push_str(name);
64                buf.push('>');
65            }
66        }
67        NodeKind::Text { content } => {
68            buf.push_str(&escape_text(content));
69        }
70        NodeKind::Comment { content } => {
71            buf.push_str("<!--");
72            buf.push_str(content);
73            buf.push_str("-->");
74        }
75    }
76}
77
78/// Serializes only the children of a node to HTML (inner HTML).
79///
80/// This is equivalent to calling [`serialize_node`] on each child and
81/// concatenating the results.
82///
83/// # Examples
84///
85/// ```rust
86/// use scrape_core::{Soup, serialize::serialize_inner_html};
87///
88/// let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
89/// let doc = soup.document();
90/// let div_id = soup.find("div").unwrap().unwrap().node_id();
91///
92/// let mut html = String::new();
93/// serialize_inner_html(doc, div_id, &mut html);
94/// assert_eq!(html, "<span>A</span><span>B</span>");
95/// ```
96pub fn serialize_inner_html(doc: &Document, id: NodeId, buf: &mut String) {
97    for child_id in doc.children(id) {
98        serialize_node(doc, child_id, buf);
99    }
100}
101
102/// Collects text content from a node and its descendants.
103///
104/// This function recursively traverses the DOM subtree and concatenates
105/// all text node content into the provided buffer. Element and comment
106/// nodes are skipped (only their text children are included).
107///
108/// # Examples
109///
110/// ```rust
111/// use scrape_core::{Soup, serialize::collect_text};
112///
113/// let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
114/// let doc = soup.document();
115/// let div_id = soup.find("div").unwrap().unwrap().node_id();
116///
117/// let mut text = String::new();
118/// collect_text(doc, div_id, &mut text);
119/// assert_eq!(text, "Hello World!");
120/// ```
121pub fn collect_text(doc: &Document, id: NodeId, buf: &mut String) {
122    let Some(node) = doc.get(id) else { return };
123
124    match &node.kind {
125        NodeKind::Text { content } => buf.push_str(content),
126        NodeKind::Element { .. } => {
127            for child_id in doc.children(id) {
128                collect_text(doc, child_id, buf);
129            }
130        }
131        NodeKind::Comment { .. } => {}
132    }
133}
134
/// Common interface for anything that can be rendered back to HTML.
///
/// Implementors supply the three buffer-appending primitives (the `_into`
/// methods); the `String`-returning variants are provided on top of them.
/// [`Tag`] implements this trait so that the library and its bindings share
/// one serialization path.
///
/// # Design Rationale
///
/// Appending into a caller-owned buffer is the primitive operation because
/// it lets hot paths reuse one allocation. The provided methods trade that
/// for convenience by creating a fresh `String` on every call.
///
/// # Examples
///
/// ```rust
/// use scrape_core::{Soup, serialize::HtmlSerializer};
///
/// let soup = Soup::parse("<div><span>Hello</span></div>");
/// let div = soup.find("div").unwrap().unwrap();
///
/// // Convenience method (allocates)
/// let html = div.serialize_html();
/// assert!(html.contains("<span>"));
///
/// // Buffer method (no allocation if buffer has capacity)
/// let mut buf = String::with_capacity(100);
/// div.serialize_html_into(&mut buf);
/// assert_eq!(html, buf);
/// ```
pub trait HtmlSerializer {
    /// Returns the outer HTML: this node's own tags plus its whole subtree.
    ///
    /// Equivalent to calling [`Self::serialize_html_into`] on an empty string.
    #[must_use]
    fn serialize_html(&self) -> String {
        collect_into_string(|out| self.serialize_html_into(out))
    }

    /// Appends the outer HTML of this node to `buf`.
    fn serialize_html_into(&self, buf: &mut String);

    /// Returns the inner HTML: the serialized children without this node's
    /// own tags.
    ///
    /// Equivalent to calling [`Self::serialize_inner_into`] on an empty string.
    #[must_use]
    fn serialize_inner(&self) -> String {
        collect_into_string(|out| self.serialize_inner_into(out))
    }

    /// Appends the inner HTML of this node to `buf`.
    fn serialize_inner_into(&self, buf: &mut String);

    /// Returns the text content of this node and its descendants, with all
    /// markup stripped.
    ///
    /// Equivalent to calling [`Self::extract_text_into`] on an empty string.
    #[must_use]
    fn extract_text(&self) -> String {
        collect_into_string(|out| self.extract_text_into(out))
    }

    /// Appends the text content of this node and its descendants to `buf`.
    fn extract_text_into(&self, buf: &mut String);
}

/// Runs `fill` against a new, empty `String` and returns what it wrote.
fn collect_into_string(fill: impl FnOnce(&mut String)) -> String {
    let mut out = String::new();
    fill(&mut out);
    out
}
205
206impl HtmlSerializer for Tag<'_> {
207    #[inline]
208    fn serialize_html_into(&self, buf: &mut String) {
209        serialize_node(self.document(), self.node_id(), buf);
210    }
211
212    #[inline]
213    fn serialize_inner_into(&self, buf: &mut String) {
214        serialize_inner_html(self.document(), self.node_id(), buf);
215    }
216
217    #[inline]
218    fn extract_text_into(&self, buf: &mut String) {
219        collect_text(self.document(), self.node_id(), buf);
220    }
221}
222
/// Unit tests for the free serialization functions and the
/// [`HtmlSerializer`] implementation on [`Tag`].
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Soup, SoupConfig};

    /// A plain element round-trips to exactly the markup it was parsed from.
    #[test]
    fn test_serialize_node_element() {
        let soup = Soup::parse("<div>text</div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "<div>text</div>");
    }

    /// Every attribute is emitted as `name="value"`; attribute order is not asserted.
    #[test]
    fn test_serialize_node_with_attributes() {
        let soup = Soup::parse("<a href=\"/page\" class=\"link\">click</a>");
        let doc = soup.document();
        let a = soup.find("a").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, a.node_id(), &mut buf);
        assert!(buf.contains("href=\"/page\""));
        assert!(buf.contains("class=\"link\""));
        assert!(buf.contains(">click</a>"));
    }

    /// An ampersand in an attribute value is written in its escaped form.
    #[test]
    fn test_serialize_node_escapes_attr() {
        let soup = Soup::parse("<div data-value=\"a &amp; b\">text</div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, div.node_id(), &mut buf);
        assert!(buf.contains("data-value="));
        // Checking only for the attribute name would still pass if escaping
        // were dropped entirely, so pin the escaped ampersand as well.
        assert!(buf.contains("&amp;"));
    }

    /// Void elements get a start tag only, never an end tag.
    #[test]
    fn test_serialize_node_void_element() {
        let soup = Soup::parse("<div><br><hr></div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, div.node_id(), &mut buf);
        assert!(buf.contains("<br>"));
        assert!(buf.contains("<hr>"));
        assert!(!buf.contains("</br>"));
        assert!(!buf.contains("</hr>"));
    }

    /// Nested elements are serialized depth-first with matching end tags.
    #[test]
    fn test_serialize_node_nested() {
        let soup = Soup::parse("<div><span><b>deep</b></span></div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "<div><span><b>deep</b></span></div>");
    }

    /// Comments kept by the parser are written back as `<!--...-->`.
    #[test]
    fn test_serialize_node_comment() {
        let config = SoupConfig { include_comments: true, ..Default::default() };
        let soup = Soup::parse_with_config("<div>text<!-- comment -->more</div>", config);
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_node(doc, div.node_id(), &mut buf);
        assert!(buf.contains("<!-- comment -->"));
        assert!(buf.contains("text"));
        assert!(buf.contains("more"));
    }

    /// Inner HTML contains the children but not the container's own tags.
    #[test]
    fn test_serialize_inner_html() {
        let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        serialize_inner_html(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "<span>A</span><span>B</span>");
    }

    /// Text of a single text child is returned unchanged.
    #[test]
    fn test_collect_text_simple() {
        let soup = Soup::parse("<div>Hello World</div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        collect_text(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "Hello World");
    }

    /// Text from nested elements is concatenated in document order.
    #[test]
    fn test_collect_text_nested() {
        let soup = Soup::parse("<div>Hello <b>Bold</b> Text</div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        collect_text(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "Hello Bold Text");
    }

    /// Comment nodes present in the tree contribute no text.
    #[test]
    fn test_collect_text_skips_comments() {
        // Opt in to comment nodes explicitly, so the tree really contains a
        // comment for `collect_text` to skip rather than relying on the
        // parser having discarded it.
        let config = SoupConfig { include_comments: true, ..Default::default() };
        let soup = Soup::parse_with_config("<div>text<!-- comment -->more</div>", config);
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        collect_text(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "textmore");
    }

    /// An element without children yields no text.
    #[test]
    fn test_collect_text_empty() {
        let soup = Soup::parse("<div></div>");
        let doc = soup.document();
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::new();
        collect_text(doc, div.node_id(), &mut buf);
        assert_eq!(buf, "");
    }

    /// `serialize_html` on a tag returns its outer HTML.
    #[test]
    fn test_html_serializer_serialize_html() {
        let soup = Soup::parse("<div class=\"test\"><span>Hi</span></div>");
        let div = soup.find("div").unwrap().unwrap();

        let html = div.serialize_html();
        assert!(html.starts_with("<div"));
        assert!(html.ends_with("</div>"));
        assert!(html.contains("<span>Hi</span>"));
    }

    /// `serialize_inner` on a tag returns its inner HTML.
    #[test]
    fn test_html_serializer_serialize_inner() {
        let soup = Soup::parse("<div><span>A</span><span>B</span></div>");
        let div = soup.find("div").unwrap().unwrap();

        let inner = div.serialize_inner();
        assert_eq!(inner, "<span>A</span><span>B</span>");
    }

    /// `extract_text` on a tag returns the concatenated text content.
    #[test]
    fn test_html_serializer_extract_text() {
        let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
        let div = soup.find("div").unwrap().unwrap();

        let text = div.extract_text();
        assert_eq!(text, "Hello World!");
    }

    /// Serializing into a cleared buffer with enough capacity does not reallocate.
    #[test]
    fn test_html_serializer_buffer_reuse() {
        let soup = Soup::parse("<div>Test</div>");
        let div = soup.find("div").unwrap().unwrap();

        let mut buf = String::with_capacity(100);
        div.serialize_html_into(&mut buf);
        let cap1 = buf.capacity();

        buf.clear();
        div.serialize_html_into(&mut buf);
        let cap2 = buf.capacity();

        assert_eq!(cap1, cap2); // No reallocation
    }
}