Skip to main content

fhp_tree/
lib.rs

1//! Arena-based DOM tree with cache-line aligned nodes.
2//!
3//! This crate builds on `fhp-tokenizer` to construct an in-memory DOM tree
4//! using an arena allocator. Each node occupies exactly 64 bytes (one cache
5//! line) for optimal traversal performance.
6//!
7//! # Quick Start
8//!
9//! ```
10//! use fhp_tree::parse;
11//!
12//! let doc = parse("<div><p>Hello</p></div>").unwrap();
13//! let root = doc.root();
14//! assert!(root.children().count() > 0);
15//! ```
16
/// Arena allocator for [`Node`](crate::node::Node)s, text, and attributes.
pub mod arena;
/// Async HTML parser (requires `async-tokio` feature).
#[cfg(feature = "async-tokio")]
pub mod async_parser;
/// Tree builder — converts a [`Token`](fhp_tokenizer::token::Token) stream into a DOM tree.
pub mod builder;
/// Cache-line aligned [`Node`](crate::node::Node) layout.
pub mod node;
/// Streaming and incremental parsing — [`StreamParser`](crate::streaming::StreamParser) and [`EarlyStopParser`](crate::streaming::EarlyStopParser).
pub mod streaming;
/// Traversal iterators, allocation-free apart from breadth-first traversal,
/// which keeps its queue in a [`VecDeque`](std::collections::VecDeque).
pub mod traverse;
30
31use fhp_core::tag::Tag;
32
33use arena::{Arena, Attribute};
34use builder::TreeBuilder;
35use node::{NodeFlags, NodeId};
36use traverse::{Ancestors, BreadthFirst, Children, DepthFirst, Siblings};
37
/// Error type for HTML parsing.
///
/// Returned by [`parse`], [`parse_owned`], and [`parse_bytes`]. The
/// `#[from]` conversions let encoding and I/O failures be propagated with `?`.
#[derive(Debug, thiserror::Error)]
pub enum HtmlError {
    /// Input was too large to parse.
    ///
    /// Produced when the input length in bytes exceeds the crate's maximum
    /// input size.
    #[error("input too large: {size} bytes (max {max})")]
    InputTooLarge {
        /// Actual input size, in bytes.
        size: usize,
        /// Maximum allowed size, in bytes.
        max: usize,
    },

    /// Encoding detection or conversion failed.
    ///
    /// Wraps the underlying [`fhp_core::error::EncodingError`].
    #[error("encoding error: {0}")]
    Encoding(#[from] fhp_core::error::EncodingError),

    /// I/O error during streaming or async parsing.
    ///
    /// Wraps the underlying [`std::io::Error`].
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),
}
58
/// Upper bound on accepted input, in bytes: 256 MiB (`256 << 20`).
///
/// Inputs strictly longer than this are rejected with
/// [`HtmlError::InputTooLarge`].
pub(crate) const MAX_INPUT_SIZE: usize = 256 << 20;
61
62/// Parse an HTML string into a [`Document`].
63///
64/// Runs the tokenizer and tree builder in sequence.
65///
66/// # Errors
67///
68/// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
69///
70/// # Example
71///
72/// ```
73/// use fhp_tree::parse;
74///
75/// let doc = parse("<div><p>Hello &amp; world</p></div>").unwrap();
76/// let root = doc.root();
77/// assert!(root.children().count() > 0);
78/// ```
79pub fn parse(input: &str) -> Result<Document, HtmlError> {
80    if input.len() > MAX_INPUT_SIZE {
81        return Err(HtmlError::InputTooLarge {
82            size: input.len(),
83            max: MAX_INPUT_SIZE,
84        });
85    }
86
87    let mut builder = TreeBuilder::with_capacity_hint(input.len());
88    builder.set_source(input);
89    fhp_tokenizer::tokenize_into(input, &mut builder);
90    let (arena, root) = builder.finish();
91
92    Ok(Document { arena, root })
93}
94
95/// Parse an owned `String` into a [`Document`], transferring the allocation.
96///
97/// Unlike [`parse`], this avoids copying the input bytes into the arena's
98/// source buffer — the `String`'s allocation is transferred directly.
99/// Use this when the caller already owns the input (e.g., from an HTTP
100/// response body).
101///
102/// # Errors
103///
104/// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
105///
106/// # Example
107///
108/// ```
109/// use fhp_tree::parse_owned;
110///
111/// let html = String::from("<div><p>Hello</p></div>");
112/// let doc = parse_owned(html).unwrap();
113/// assert_eq!(doc.root().text_content(), "Hello");
114/// ```
115pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
116    if input.len() > MAX_INPUT_SIZE {
117        return Err(HtmlError::InputTooLarge {
118            size: input.len(),
119            max: MAX_INPUT_SIZE,
120        });
121    }
122
123    let mut builder = TreeBuilder::with_capacity_hint(input.len());
124    // Set source pointer tracking without copying data.
125    builder.set_source_ptr(&input);
126    fhp_tokenizer::tokenize_into(&input, &mut builder);
127    let (mut arena, root) = builder.finish();
128    // Transfer the String's allocation directly — no memcpy.
129    arena.set_source_owned(input);
130
131    Ok(Document { arena, root })
132}
133
134/// Parse raw bytes into a [`Document`], auto-detecting the encoding.
135///
136/// The encoding detection pipeline:
137/// 1. BOM detection (UTF-8, UTF-16 LE/BE)
138/// 2. `<meta charset="...">` prescan (first 1 KB)
139/// 3. `<meta http-equiv="Content-Type" content="...charset=...">` prescan
140/// 4. Fallback to UTF-8
141///
142/// # Errors
143///
144/// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB, or
145/// [`HtmlError::Encoding`] if the detected encoding cannot decode the input.
146///
147/// # Example
148///
149/// ```
150/// use fhp_tree::parse_bytes;
151///
152/// let doc = parse_bytes(b"<div>Hello</div>").unwrap();
153/// let root = doc.root();
154/// assert_eq!(root.text_content(), "Hello");
155/// ```
156pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
157    if input.len() > MAX_INPUT_SIZE {
158        return Err(HtmlError::InputTooLarge {
159            size: input.len(),
160            max: MAX_INPUT_SIZE,
161        });
162    }
163
164    let (text, _encoding) = fhp_encoding::decode_or_detect(input)?;
165    parse(&text)
166}
167
/// A parsed HTML document backed by an arena.
///
/// Provides access to the root node and convenience methods for querying
/// the DOM tree. The document owns its [`Arena`]; nodes are handed out as
/// borrowed [`NodeRef`] views tied to the document's lifetime.
pub struct Document {
    // Storage the document's nodes live in.
    arena: Arena,
    // Id of the root node inside `arena`.
    root: NodeId,
}
176
177impl Document {
178    /// Get a reference to the root node.
179    pub fn root(&self) -> NodeRef<'_> {
180        NodeRef {
181            arena: &self.arena,
182            id: self.root,
183        }
184    }
185
186    /// Get a node by its id.
187    ///
188    /// # Panics
189    ///
190    /// Panics if `id` is out of bounds.
191    pub fn get(&self, id: NodeId) -> NodeRef<'_> {
192        NodeRef {
193            arena: &self.arena,
194            id,
195        }
196    }
197
198    /// Get the underlying arena (for advanced usage).
199    pub fn arena(&self) -> &Arena {
200        &self.arena
201    }
202
203    /// Serialize the entire document back to an HTML string.
204    ///
205    /// This produces the outer HTML of the root node, which includes all
206    /// children with proper entity escaping for text and attribute values.
207    ///
208    /// # Example
209    ///
210    /// ```
211    /// use fhp_tree::parse;
212    ///
213    /// let doc = parse("<p>Hello &amp; world</p>").unwrap();
214    /// let html = doc.to_html();
215    /// assert!(html.contains("&amp;"));
216    /// ```
217    pub fn to_html(&self) -> String {
218        self.root().outer_html()
219    }
220
221    /// Total number of nodes in the document.
222    pub fn node_count(&self) -> usize {
223        self.arena.len()
224    }
225
226    /// Root node id.
227    pub fn root_id(&self) -> NodeId {
228        self.root
229    }
230}
231
/// A borrowed reference to a node inside the document.
///
/// Provides convenience methods for querying node properties,
/// traversing the tree, and extracting content.
///
/// The value is just an arena reference plus a node id, so it is `Copy`
/// and can be passed around by value.
#[derive(Clone, Copy)]
pub struct NodeRef<'a> {
    // Arena that holds the referenced node.
    arena: &'a Arena,
    // Id of the referenced node within `arena`.
    id: NodeId,
}
241
242impl<'a> NodeRef<'a> {
243    /// The node id.
244    pub fn id(&self) -> NodeId {
245        self.id
246    }
247
248    /// The tag type of this node.
249    pub fn tag(&self) -> Tag {
250        self.arena.get(self.id).tag
251    }
252
253    /// The nesting depth.
254    pub fn depth(&self) -> u16 {
255        self.arena.get(self.id).depth
256    }
257
258    /// Whether this is a text node.
259    pub fn is_text(&self) -> bool {
260        self.arena.get(self.id).flags.has(NodeFlags::IS_TEXT)
261    }
262
263    /// Whether this is a comment node.
264    pub fn is_comment(&self) -> bool {
265        self.arena.get(self.id).flags.has(NodeFlags::IS_COMMENT)
266    }
267
268    /// Whether this is a doctype node.
269    pub fn is_doctype(&self) -> bool {
270        self.arena.get(self.id).flags.has(NodeFlags::IS_DOCTYPE)
271    }
272
273    /// Whether this is a void element.
274    pub fn is_void(&self) -> bool {
275        self.arena.get(self.id).flags.has(NodeFlags::IS_VOID)
276    }
277
278    /// Whether this node has any children.
279    pub fn has_children(&self) -> bool {
280        !self.arena.get(self.id).first_child.is_null()
281    }
282
283    /// Direct text content of this node (not recursive).
284    ///
285    /// For element nodes, returns `""`. For text nodes, returns the text.
286    pub fn text(&self) -> &'a str {
287        let node = self.arena.get(self.id);
288        if node.flags.has(NodeFlags::IS_TEXT)
289            || node.flags.has(NodeFlags::IS_COMMENT)
290            || node.flags.has(NodeFlags::IS_DOCTYPE)
291        {
292            self.arena.text(self.id)
293        } else {
294            ""
295        }
296    }
297
298    /// Recursively collect all text content from this node and its
299    /// descendants.
300    pub fn text_content(&self) -> String {
301        let node = self.arena.get(self.id);
302        // Fast path for text nodes.
303        if node.flags.has(NodeFlags::IS_TEXT) {
304            return self.arena.text(self.id).to_string();
305        }
306        // Heuristic: estimate based on text slab size, capped at 4KB.
307        let hint = (self.arena.text_slab.len() / 4).min(4096);
308        let mut result = String::with_capacity(hint);
309        self.collect_text(&mut result);
310        result
311    }
312
313    /// Recursive text collection helper.
314    fn collect_text(&self, out: &mut String) {
315        let node = self.arena.get(self.id);
316        if node.flags.has(NodeFlags::IS_TEXT) {
317            out.push_str(self.arena.text(self.id));
318            return;
319        }
320        let mut child = node.first_child;
321        while !child.is_null() {
322            NodeRef {
323                arena: self.arena,
324                id: child,
325            }
326            .collect_text(out);
327            child = self.arena.get(child).next_sibling;
328        }
329    }
330
331    /// Reconstruct the inner HTML of this node.
332    pub fn inner_html(&self) -> String {
333        let mut result = String::new();
334        let node = self.arena.get(self.id);
335        let mut child = node.first_child;
336        while !child.is_null() {
337            NodeRef {
338                arena: self.arena,
339                id: child,
340            }
341            .write_outer_html(&mut result);
342            child = self.arena.get(child).next_sibling;
343        }
344        result
345    }
346
347    /// Reconstruct the outer HTML of this node (including the tag itself).
348    pub fn outer_html(&self) -> String {
349        let mut result = String::new();
350        self.write_outer_html(&mut result);
351        result
352    }
353
354    /// Write outer HTML to a string buffer.
355    fn write_outer_html(&self, out: &mut String) {
356        let node = self.arena.get(self.id);
357
358        if node.flags.has(NodeFlags::IS_TEXT) {
359            let text = self.arena.text(self.id);
360            // Raw text elements (script/style) must not be escaped per HTML spec.
361            let parent_id = node.parent;
362            let is_raw_text = !parent_id.is_null() && self.arena.get(parent_id).tag.is_raw_text();
363            if is_raw_text {
364                out.push_str(text);
365            } else {
366                fhp_core::entity::escape_text(text, out);
367            }
368            return;
369        }
370
371        if node.flags.has(NodeFlags::IS_COMMENT) {
372            out.push_str("<!--");
373            out.push_str(self.arena.text(self.id));
374            out.push_str("-->");
375            return;
376        }
377
378        if node.flags.has(NodeFlags::IS_DOCTYPE) {
379            out.push_str("<!DOCTYPE ");
380            out.push_str(self.arena.text(self.id));
381            out.push('>');
382            return;
383        }
384
385        let tag_name = node
386            .tag
387            .as_str()
388            .or_else(|| self.arena.unknown_tag_name(self.id));
389        // Skip only the synthetic root node tag created by the builder.
390        let is_root_wrapper = node.depth == 0 && node.parent.is_null();
391
392        if !is_root_wrapper {
393            if let Some(name) = tag_name {
394                out.push('<');
395                out.push_str(name);
396
397                // Write attributes.
398                let attrs = self.arena.attrs(self.id);
399                for attr in attrs {
400                    out.push(' ');
401                    out.push_str(self.arena.attr_name(attr));
402                    if let Some(val) = self.arena.attr_value(attr) {
403                        out.push_str("=\"");
404                        fhp_core::entity::escape_attr(val, out);
405                        out.push('"');
406                    }
407                }
408
409                if node.flags.has(NodeFlags::IS_VOID) {
410                    out.push('>');
411                    return;
412                }
413                out.push('>');
414            }
415        }
416
417        // Write children.
418        let mut child = node.first_child;
419        while !child.is_null() {
420            NodeRef {
421                arena: self.arena,
422                id: child,
423            }
424            .write_outer_html(out);
425            child = self.arena.get(child).next_sibling;
426        }
427
428        // Close tag.
429        if !is_root_wrapper {
430            if let Some(name) = tag_name {
431                out.push_str("</");
432                out.push_str(name);
433                out.push('>');
434            }
435        }
436    }
437
438    /// Get the value of an attribute by name.
439    pub fn attr(&self, name: &str) -> Option<&'a str> {
440        self.arena
441            .attrs(self.id)
442            .iter()
443            .find(|a| self.arena.attr_name(a).eq_ignore_ascii_case(name))
444            .and_then(|a| self.arena.attr_value(a))
445    }
446
447    /// Check if the node has a given CSS class.
448    ///
449    /// Splits the `class` attribute on whitespace and checks if any
450    /// segment matches.
451    pub fn has_class(&self, class_name: &str) -> bool {
452        if let Some(classes) = self.attr("class") {
453            classes.split_whitespace().any(|c| c == class_name)
454        } else {
455            false
456        }
457    }
458
459    /// Get all attributes.
460    pub fn attrs(&self) -> &'a [Attribute] {
461        self.arena.attrs(self.id)
462    }
463
464    /// Iterate over direct children.
465    pub fn children(&self) -> Children<'a> {
466        Children::new(self.arena, self.id)
467    }
468
469    /// Get the parent node, if any.
470    pub fn parent(&self) -> Option<NodeRef<'a>> {
471        let parent = self.arena.get(self.id).parent;
472        if parent.is_null() {
473            None
474        } else {
475            Some(NodeRef {
476                arena: self.arena,
477                id: parent,
478            })
479        }
480    }
481
482    /// Get the first child, if any.
483    pub fn first_child(&self) -> Option<NodeRef<'a>> {
484        let fc = self.arena.get(self.id).first_child;
485        if fc.is_null() {
486            None
487        } else {
488            Some(NodeRef {
489                arena: self.arena,
490                id: fc,
491            })
492        }
493    }
494
495    /// Get the next sibling, if any.
496    pub fn next_sibling(&self) -> Option<NodeRef<'a>> {
497        let ns = self.arena.get(self.id).next_sibling;
498        if ns.is_null() {
499            None
500        } else {
501            Some(NodeRef {
502                arena: self.arena,
503                id: ns,
504            })
505        }
506    }
507
508    /// Get the previous sibling, if any.
509    pub fn prev_sibling(&self) -> Option<NodeRef<'a>> {
510        let ps = self.arena.get(self.id).prev_sibling;
511        if ps.is_null() {
512            None
513        } else {
514            Some(NodeRef {
515                arena: self.arena,
516                id: ps,
517            })
518        }
519    }
520
521    /// Iterate over ancestors (parent chain, not including self).
522    pub fn ancestors(&self) -> Ancestors<'a> {
523        Ancestors::new(self.arena, self.id)
524    }
525
526    /// Iterate over next siblings (not including self).
527    pub fn siblings(&self) -> Siblings<'a> {
528        Siblings::new(self.arena, self.id)
529    }
530
531    /// Pre-order depth-first traversal of the subtree rooted at this node.
532    pub fn descendants(&self) -> DepthFirst<'a> {
533        DepthFirst::new(self.arena, self.id)
534    }
535
536    /// Breadth-first traversal of the subtree rooted at this node.
537    pub fn descendants_bfs(&self) -> BreadthFirst<'a> {
538        BreadthFirst::new(self.arena, self.id)
539    }
540}
541
542impl<'a> core::fmt::Debug for NodeRef<'a> {
543    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
544        let node = self.arena.get(self.id);
545        if node.flags.has(NodeFlags::IS_TEXT) {
546            write!(f, "Text({:?})", self.text())
547        } else if node.flags.has(NodeFlags::IS_COMMENT) {
548            write!(f, "Comment({:?})", self.text())
549        } else {
550            write!(f, "<{}>", node.tag)
551        }
552    }
553}
554
/// Unit tests for parsing, navigation, byte decoding, and serialization.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_simple() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        assert!(doc.node_count() > 0);
        let root = doc.root();
        assert!(root.has_children());
    }

    #[test]
    fn parse_text_content() {
        let doc = parse("<div><span>Hello</span> <span>World</span></div>").unwrap();
        let root = doc.root();
        let text = root.text_content();
        assert!(text.contains("Hello"), "text: {text}");
        assert!(text.contains("World"), "text: {text}");
    }

    #[test]
    fn parse_attr() {
        let doc = parse("<a href=\"https://example.com\" class=\"link primary\">text</a>").unwrap();
        let root = doc.root();
        // Root -> a
        let a = root.first_child().expect("should have child");
        assert_eq!(a.tag(), Tag::A);
        assert_eq!(a.attr("href"), Some("https://example.com"));
        assert!(a.has_class("link"));
        assert!(a.has_class("primary"));
        assert!(!a.has_class("secondary"));
    }

    #[test]
    fn parse_attr_case_insensitive_name_lookup() {
        let doc = parse("<a HREF=\"https://example.com\" CLASS=\"link primary\">text</a>").unwrap();
        let root = doc.root();
        let a = root.first_child().expect("should have child");

        assert_eq!(a.attr("href"), Some("https://example.com"));
        assert_eq!(a.attr("HREF"), Some("https://example.com"));
        assert!(a.has_class("link"));
        assert!(a.has_class("primary"));
    }

    #[test]
    fn parse_inner_html() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        assert_eq!(div.tag(), Tag::Div);
        let inner = div.inner_html();
        assert!(inner.contains("<p>"), "inner: {inner}");
        assert!(inner.contains("Hello"), "inner: {inner}");
        assert!(inner.contains("</p>"), "inner: {inner}");
    }

    #[test]
    fn parse_outer_html() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        let outer = div.outer_html();
        assert!(outer.starts_with("<div>"), "outer: {outer}");
        assert!(outer.ends_with("</div>"), "outer: {outer}");
    }

    #[test]
    fn parse_void_elements() {
        let doc = parse("<div><br><hr></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        let children: Vec<_> = div.children().collect();
        assert_eq!(children.len(), 2);

        let br_ref = doc.get(children[0]);
        assert_eq!(br_ref.tag(), Tag::Br);
        assert!(br_ref.is_void());

        let hr_ref = doc.get(children[1]);
        assert_eq!(hr_ref.tag(), Tag::Hr);
        assert!(hr_ref.is_void());
    }

    #[test]
    fn parse_depth_first() {
        let doc = parse("<div><span>a</span><p>b</p></div>").unwrap();
        let root = doc.root();
        let tags: Vec<_> = root
            .descendants()
            .map(|id| doc.get(id))
            .filter(|n| !n.is_text())
            .map(|n| n.tag())
            .collect();
        // root(Unknown), div, span, p
        assert!(tags.contains(&Tag::Div));
        assert!(tags.contains(&Tag::Span));
        assert!(tags.contains(&Tag::P));
    }

    #[test]
    fn parse_ancestors() {
        let doc = parse("<div><span><a>link</a></span></div>").unwrap();
        let root = doc.root();

        // Navigate: root -> div -> span -> a -> text
        let div = root.first_child().unwrap();
        let span = div.first_child().unwrap();
        let a = span.first_child().unwrap();

        let ancestor_tags: Vec<_> = a.ancestors().map(|id| doc.get(id).tag()).collect();
        assert_eq!(ancestor_tags, vec![Tag::Span, Tag::Div, Tag::Unknown]);
    }

    #[test]
    fn parse_siblings() {
        let doc = parse("<ul><li>1</li><li>2</li><li>3</li></ul>").unwrap();
        let root = doc.root();
        let ul = root.first_child().unwrap();
        let li1 = ul.first_child().unwrap();

        let sibling_count = li1.siblings().count();
        assert_eq!(sibling_count, 2);
    }

    #[test]
    fn empty_input() {
        let doc = parse("").unwrap();
        assert!(!doc.root().has_children());
    }

    #[test]
    fn text_only() {
        let doc = parse("just text").unwrap();
        assert_eq!(doc.root().text_content(), "just text");
    }

    #[test]
    fn broken_html_unclosed() {
        let doc = parse("<div><p>unclosed").unwrap();
        let root = doc.root();
        assert!(root.has_children());
        assert_eq!(root.text_content(), "unclosed");
    }

    #[test]
    fn broken_html_extra_close() {
        let doc = parse("</div><p>ok</p>").unwrap();
        let root = doc.root();
        assert_eq!(root.text_content(), "ok");
    }

    #[test]
    fn implicit_close_p_p() {
        let doc = parse("<p>first<p>second").unwrap();
        let root = doc.root();
        let children: Vec<_> = root.children().collect();
        // Both <p> should be direct children of root.
        let p_count = children
            .iter()
            .filter(|&c| doc.get(*c).tag() == Tag::P)
            .count();
        assert_eq!(p_count, 2, "both <p> should be root children");
    }

    #[test]
    fn node_64_bytes_alignment() {
        assert_eq!(std::mem::size_of::<node::Node>(), 64);
        assert_eq!(std::mem::align_of::<node::Node>(), 64);
    }

    #[test]
    fn input_too_large() {
        // The size check runs before any byte is read, so a zero-filled
        // buffer one byte over the limit is enough to hit the error path.
        let oversized = vec![0u8; MAX_INPUT_SIZE + 1];
        let err = parse_bytes(&oversized)
            .err()
            .expect("oversized input must be rejected");
        assert!(
            matches!(
                err,
                HtmlError::InputTooLarge { size, max }
                    if size == MAX_INPUT_SIZE + 1 && max == MAX_INPUT_SIZE
            ),
            "unexpected error: {err}"
        );

        // Inputs under the limit are still accepted.
        assert!(parse("").is_ok());
    }

    #[test]
    fn comment_and_doctype() {
        let doc = parse("<!DOCTYPE html><!-- comment --><div>ok</div>").unwrap();
        let root = doc.root();
        let mut has_comment = false;
        let mut has_doctype = false;
        for child_id in root.children() {
            let child = doc.get(child_id);
            if child.is_comment() {
                has_comment = true;
            }
            if child.is_doctype() {
                has_doctype = true;
            }
        }
        assert!(has_doctype, "should have doctype");
        assert!(has_comment, "should have comment");
    }

    #[test]
    fn void_outer_html() {
        let doc = parse("<br>").unwrap();
        let root = doc.root();
        let br = root.first_child().unwrap();
        let html = br.outer_html();
        assert_eq!(html, "<br>", "outer: {html}");
    }

    #[test]
    fn unknown_tag_outer_html_preserved() {
        let doc = parse("<my-widget><x-item>ok</x-item></my-widget>").unwrap();
        let root = doc.root();
        let outer = root.inner_html();
        assert_eq!(outer, "<my-widget><x-item>ok</x-item></my-widget>");
    }

    // ---- parse_bytes tests ----

    #[test]
    fn parse_bytes_utf8() {
        let doc = parse_bytes(b"<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_bytes_utf8_bom() {
        let html = b"\xEF\xBB\xBF<div><p>BOM test</p></div>";
        let doc = parse_bytes(html).unwrap();
        assert!(doc.root().text_content().contains("BOM test"));
    }

    #[test]
    fn parse_bytes_windows_1254_meta() {
        // Turkish ü=0xFC in windows-1254.
        let html = b"<meta charset=\"windows-1254\"><p>Merhaba d\xFCnya</p>";
        let doc = parse_bytes(html).unwrap();
        let text = doc.root().text_content();
        assert!(text.contains("dünya"), "text: {text}");
    }

    #[test]
    fn parse_bytes_utf16le_bom() {
        let mut bytes = vec![0xFF, 0xFE]; // BOM
        for &ch in b"<p>UTF16</p>" {
            bytes.push(ch);
            bytes.push(0x00);
        }
        let doc = parse_bytes(&bytes).unwrap();
        let text = doc.root().text_content();
        assert!(text.contains("UTF16"), "text: {text}");
    }

    #[test]
    fn parse_bytes_empty() {
        let doc = parse_bytes(b"").unwrap();
        assert!(!doc.root().has_children());
    }

    // ---- serialization / escaping tests ----

    #[test]
    fn text_escaping_in_inner_html() {
        // Input uses entities; parser decodes them; serializer must re-encode.
        let doc = parse("<p>1 &lt; 2 &amp; 3 &gt; 0</p>").unwrap();
        let p = doc.root().first_child().unwrap();
        assert_eq!(p.text_content(), "1 < 2 & 3 > 0");
        let inner = p.inner_html();
        assert_eq!(inner, "1 &lt; 2 &amp; 3 &gt; 0");
    }

    #[test]
    fn attr_escaping_in_outer_html() {
        let doc = parse("<a href=\"x&y\">link</a>").unwrap();
        let a = doc.root().first_child().unwrap();
        let outer = a.outer_html();
        assert!(
            outer.contains("x&amp;y"),
            "attribute value should be escaped: {outer}"
        );
    }

    #[test]
    fn script_raw_text_not_escaped() {
        let doc = parse("<script>if (a < b && c > d) {}</script>").unwrap();
        let script = doc.root().first_child().unwrap();
        let inner = script.inner_html();
        assert_eq!(inner, "if (a < b && c > d) {}");
    }

    #[test]
    fn style_raw_text_not_escaped() {
        let doc = parse("<style>a > b { color: red; }</style>").unwrap();
        let style = doc.root().first_child().unwrap();
        let inner = style.inner_html();
        assert_eq!(inner, "a > b { color: red; }");
    }

    #[test]
    fn void_elements_no_closing_slash() {
        let doc = parse("<div><br><img src=\"x.png\"><hr></div>").unwrap();
        let div = doc.root().first_child().unwrap();
        let inner = div.inner_html();
        assert!(inner.contains("<br>"), "br: {inner}");
        assert!(inner.contains("<img "), "img: {inner}");
        assert!(inner.contains("<hr>"), "hr: {inner}");
        assert!(!inner.contains("/>"), "should not contain />: {inner}");
    }

    #[test]
    fn comment_not_escaped() {
        let doc = parse("<!-- <b>not bold</b> & stuff -->").unwrap();
        let html = doc.to_html();
        assert!(
            html.contains("<!-- <b>not bold</b> & stuff -->"),
            "comment should be verbatim: {html}"
        );
    }

    #[test]
    fn doctype_not_escaped() {
        let doc = parse("<!DOCTYPE html><p>ok</p>").unwrap();
        let html = doc.to_html();
        assert!(
            html.contains("<!DOCTYPE html>"),
            "doctype should be verbatim: {html}"
        );
    }

    #[test]
    fn document_to_html() {
        let doc = parse("<!DOCTYPE html><html><body><p>Hello</p></body></html>").unwrap();
        let html = doc.to_html();
        assert!(html.contains("<!DOCTYPE html>"), "html: {html}");
        assert!(html.contains("<p>Hello</p>"), "html: {html}");
    }

    #[test]
    fn round_trip_structure() {
        let input = "<div><p>Hello</p><span>World</span></div>";
        let doc1 = parse(input).unwrap();
        let html = doc1.to_html();
        let doc2 = parse(&html).unwrap();
        assert_eq!(doc1.root().text_content(), doc2.root().text_content());
        assert_eq!(doc1.node_count(), doc2.node_count());
    }

    #[test]
    fn round_trip_with_special_chars() {
        let input = "<p>1 &lt; 2 &amp; 3 &gt; 0</p>";
        let doc1 = parse(input).unwrap();
        assert_eq!(doc1.root().text_content(), "1 < 2 & 3 > 0");

        let html = doc1.to_html();
        let doc2 = parse(&html).unwrap();
        assert_eq!(doc2.root().text_content(), "1 < 2 & 3 > 0");
    }

    #[test]
    fn unknown_tag_preserved_in_to_html() {
        let doc = parse("<my-widget>content</my-widget>").unwrap();
        let html = doc.to_html();
        assert!(html.contains("<my-widget>"), "html: {html}");
        assert!(html.contains("</my-widget>"), "html: {html}");
    }
}