Skip to main content

fhp_tree/
builder.rs

1//! Tree construction from a token stream.
2//!
3//! [`TreeBuilder`](crate::builder::TreeBuilder) consumes [`Token`](fhp_tokenizer::token::Token)s and builds an arena-based DOM tree.
4//! It handles implicit close rules, void elements, and common malformed-HTML
5//! recovery strategies.
6
7use fhp_core::tag::Tag;
8use fhp_tokenizer::token::Token;
9
10use crate::arena::Arena;
11use crate::node::NodeId;
12
13/// Maximum nesting depth to prevent stack overflow on pathological input.
14const MAX_DEPTH: u16 = 512;
15
16/// Implicit close lookup table.
17///
18/// `IMPLICIT_CLOSE[open_tag][new_tag]` is `true` when the arrival of
19/// `new_tag` should implicitly close the currently-open `open_tag`.
20///
21/// Only a subset of tags participate in implicit closing. Tags are mapped
22/// to compact indices 0..=7 via [`implicit_close_index`].
23const IMPLICIT_CLOSE_SIZE: usize = 8;
24static IMPLICIT_CLOSE: [[bool; IMPLICIT_CLOSE_SIZE]; IMPLICIT_CLOSE_SIZE] = {
25    let mut table = [[false; IMPLICIT_CLOSE_SIZE]; IMPLICIT_CLOSE_SIZE];
26
27    // <p> is closed by another block-level element.
28    let p = 0usize;
29    let li = 1;
30    let td = 2;
31    let th = 3;
32    let tr = 4;
33    let thead = 5;
34    let tbody = 6;
35    let option = 7;
36
37    // <p> closed by <p>, <div>, <h1>-<h6>, <ul>, <ol>, <table>, <pre>, etc.
38    // Simplified: <p> closed by <p>
39    table[p][p] = true;
40
41    // <li> closed by <li>
42    table[li][li] = true;
43
44    // <td> closed by <td> or <th>
45    table[td][td] = true;
46    table[td][th] = true;
47
48    // <th> closed by <td> or <th>
49    table[th][td] = true;
50    table[th][th] = true;
51
52    // <tr> closed by <tr>
53    table[tr][tr] = true;
54
55    // <thead> closed by <tbody>
56    table[thead][tbody] = true;
57
58    // <tbody> closed by <tbody>
59    table[tbody][tbody] = true;
60
61    // <option> closed by <option>
62    table[option][option] = true;
63
64    table
65};
66
67/// Map a tag to its implicit close table index, or `None` if it doesn't
68/// participate in implicit closing.
69fn implicit_close_index(tag: Tag) -> Option<usize> {
70    match tag {
71        Tag::P => Some(0),
72        Tag::Li => Some(1),
73        Tag::Td => Some(2),
74        Tag::Th => Some(3),
75        Tag::Tr => Some(4),
76        Tag::Thead => Some(5),
77        Tag::Tbody => Some(6),
78        // Option tag not in Tag enum — skip
79        _ => None,
80    }
81}
82
83/// Check whether `new_tag` should implicitly close `open_tag`.
84#[inline]
85fn should_implicit_close(open_tag: Tag, new_tag: Tag) -> bool {
86    if let (Some(open_idx), Some(new_idx)) = (
87        implicit_close_index(open_tag),
88        implicit_close_index(new_tag),
89    ) {
90        IMPLICIT_CLOSE[open_idx][new_idx]
91    } else {
92        false
93    }
94}
95
96/// Builds a DOM tree from a token stream.
97///
98/// Maintains an open-elements stack and processes each token to either
99/// push new nodes, pop closed elements, or append text/comment/doctype nodes.
100pub struct TreeBuilder {
101    /// The arena that owns all nodes.
102    pub(crate) arena: Arena,
103    /// Stack of open element node ids with cached tags and element child count.
104    open_elements: Vec<(NodeId, Tag, u16)>,
105    /// The synthetic root node (document root).
106    root: NodeId,
107    /// Base address of the original input (for source-backed text).
108    source_base: usize,
109    /// Length of the original input in bytes.
110    source_len: usize,
111}
112
113impl TreeBuilder {
114    /// Create a new tree builder with default capacity.
115    pub fn new() -> Self {
116        Self::with_capacity_hint(0)
117    }
118
119    /// Create a tree builder with capacity tuned to the expected input size.
120    ///
121    /// Uses heuristics to estimate node, text, and attribute counts from the
122    /// input byte length, reducing reallocations for large documents.
123    pub fn with_capacity_hint(input_len: usize) -> Self {
124        let node_cap = (input_len / 32).max(256);
125        // Source-backed text uses offsets, not slab — smaller alloc suffices.
126        let text_cap = (input_len / 16).max(4096);
127        let attr_cap = (input_len / 128).max(64);
128        let mut arena = Arena::with_capacity(node_cap, text_cap, attr_cap);
129        // Create a synthetic document root.
130        let root = arena.new_element(Tag::Unknown, 0);
131        let mut open_elements = Vec::with_capacity(32);
132        open_elements.push((root, Tag::Unknown, 0));
133        Self {
134            arena,
135            open_elements,
136            root,
137            source_base: 0,
138            source_len: 0,
139        }
140    }
141
142    /// Enable the inline tag index for O(1) tag lookups after parsing.
143    ///
144    /// When enabled, each element's tag is indexed during tree construction,
145    /// eliminating the need for a separate DFS pass in
146    /// [`DocumentIndex::build`](crate::arena::Arena::tag_index).
147    pub fn enable_tag_index(&mut self) {
148        self.arena.enable_tag_index();
149    }
150
151    /// Enable source-backed text nodes.
152    ///
153    /// Stores an owned copy of `input` in the arena. Text nodes whose content
154    /// is borrowed from `input` (entity-free) will reference the source
155    /// instead of copying to the text slab.
156    pub fn set_source(&mut self, input: &str) {
157        self.source_base = input.as_ptr() as usize;
158        self.source_len = input.len();
159        self.arena.set_source(input);
160    }
161
162    /// Set source pointer tracking without copying data to the arena.
163    ///
164    /// Use this with [`Arena::set_source_owned`] after tokenization to
165    /// avoid a redundant memcpy when the caller owns the input `String`.
166    pub fn set_source_ptr(&mut self, input: &str) {
167        self.source_base = input.as_ptr() as usize;
168        self.source_len = input.len();
169    }
170
171    /// Process a single token and insert it into the tree.
172    ///
173    /// Returns the [`NodeId`] of the newly created node, or `None` if the
174    /// token did not produce a node (e.g. a close tag or depth limit hit).
175    #[inline]
176    pub fn process(&mut self, token: &Token<'_>) -> Option<NodeId> {
177        match token {
178            Token::OpenTag {
179                tag,
180                name,
181                attributes,
182                self_closing,
183                ..
184            } => self.handle_open_tag(*tag, name.as_ref(), attributes, *self_closing),
185            Token::CloseTag { tag, name } => {
186                self.handle_close_tag(*tag, name.as_ref());
187                None
188            }
189            Token::Text { content } => self.handle_text(content.as_ref()),
190            Token::Comment { content } => self.handle_comment(content.as_ref()),
191            Token::Doctype { content } => self.handle_doctype(content.as_ref()),
192            Token::CData { content } => {
193                // Treat CDATA as text.
194                self.handle_text(content.as_ref())
195            }
196        }
197    }
198
199    /// Finish building and return the root node id and the arena.
200    pub fn finish(self) -> (Arena, NodeId) {
201        // Any unclosed elements are implicitly closed (they stay in the tree).
202        (self.arena, self.root)
203    }
204
205    /// Current parent node (top of open_elements stack).
206    #[inline]
207    fn current_parent(&self) -> NodeId {
208        self.open_elements
209            .last()
210            .map(|&(id, _, _)| id)
211            .unwrap_or(self.root)
212    }
213
214    /// Current depth.
215    #[inline]
216    fn current_depth(&self) -> u16 {
217        (self.open_elements.len() as u16).min(MAX_DEPTH)
218    }
219
220    /// Handle an open tag token.
221    fn handle_open_tag(
222        &mut self,
223        tag: Tag,
224        name: &str,
225        attributes: &[fhp_tokenizer::token::Attribute<'_>],
226        self_closing: bool,
227    ) -> Option<NodeId> {
228        // Apply implicit close rules.
229        self.apply_implicit_close(tag);
230
231        // Enforce depth limit.
232        if self.current_depth() >= MAX_DEPTH {
233            return None;
234        }
235
236        let depth = self.current_depth();
237        let parent = self.current_parent();
238        let node = self.arena.new_element(tag, depth);
239        if tag == Tag::Unknown {
240            self.arena.set_unknown_tag_name(node, name);
241        }
242
243        // Set attributes.
244        if !attributes.is_empty() {
245            self.arena.set_attrs(node, attributes);
246        }
247
248        // Append to parent and set element index from parent's counter.
249        self.arena.append_child(parent, node);
250        if let Some(parent_entry) = self.open_elements.last_mut() {
251            parent_entry.2 += 1;
252            self.arena.set_element_index(node, parent_entry.2);
253        }
254
255        // Void elements and self-closing tags don't go on the stack.
256        if tag.is_void() || self_closing {
257            if self_closing {
258                self.arena.set_self_closing(node);
259            }
260            // Void elements are always marked self-closing in the tree.
261            if tag.is_void() {
262                self.arena.set_self_closing(node);
263            }
264        } else {
265            self.open_elements.push((node, tag, 0));
266        }
267
268        Some(node)
269    }
270
271    /// Handle a close tag token.
272    fn handle_close_tag(&mut self, tag: Tag, name: &str) {
273        // Ignore close tags for void elements.
274        if tag.is_void() {
275            return;
276        }
277
278        // Find the matching open element on the stack.
279        // Walk backwards to find the nearest match — read tag from cached tuple.
280        let mut match_idx = None;
281        for i in (1..self.open_elements.len()).rev() {
282            let (open_id, open_tag, _) = self.open_elements[i];
283            if open_tag == tag {
284                if tag == Tag::Unknown {
285                    let open_name = self.arena.unknown_tag_name(open_id).unwrap_or("");
286                    if !open_name.eq_ignore_ascii_case(name) {
287                        continue;
288                    }
289                }
290                match_idx = Some(i);
291                break;
292            }
293        }
294
295        if let Some(idx) = match_idx {
296            // Pop everything from idx onwards (implicitly closing).
297            self.open_elements.truncate(idx);
298        }
299        // If no match found, ignore the close tag (broken HTML recovery).
300    }
301
302    /// Handle text content (already entity-decoded by the tokenizer).
303    ///
304    /// If the text is borrowed from the original source (pointer falls within
305    /// the source range), creates a source-backed text node to avoid copying.
306    fn handle_text(&mut self, content: &str) -> Option<NodeId> {
307        if content.is_empty() {
308            return None;
309        }
310        let depth = self.current_depth();
311        let parent = self.current_parent();
312        let node = self.try_source_ref(depth, content);
313        self.arena.append_child(parent, node);
314        Some(node)
315    }
316
317    /// Handle raw text from TreeSink (not entity-decoded).
318    ///
319    /// Performs entity decoding if the `entity-decode` feature is enabled,
320    /// then creates a source-backed or slab text node.
321    fn handle_raw_text(&mut self, raw: &str) -> Option<NodeId> {
322        if raw.is_empty() {
323            return None;
324        }
325        let depth = self.current_depth();
326        let parent = self.current_parent();
327
328        #[cfg(feature = "entity-decode")]
329        let node = {
330            let decoded = fhp_tokenizer::entity::decode_entities(raw);
331            match decoded {
332                std::borrow::Cow::Borrowed(s) => self.try_source_ref(depth, s),
333                std::borrow::Cow::Owned(s) => self.arena.new_text(depth, &s),
334            }
335        };
336
337        #[cfg(not(feature = "entity-decode"))]
338        let node = self.try_source_ref(depth, raw);
339
340        self.arena.append_child(parent, node);
341        Some(node)
342    }
343
344    /// Create a text node, using a source-backed ref if the pointer is within
345    /// the original source range.
346    #[inline]
347    fn try_source_ref(&mut self, depth: u16, content: &str) -> NodeId {
348        if self.source_len > 0 {
349            let ptr = content.as_ptr() as usize;
350            if ptr >= self.source_base && ptr + content.len() <= self.source_base + self.source_len
351            {
352                let offset = ptr - self.source_base;
353                return self
354                    .arena
355                    .new_text_ref(depth, offset as u32, content.len() as u32);
356            }
357        }
358        self.arena.new_text(depth, content)
359    }
360
361    /// Handle a comment.
362    fn handle_comment(&mut self, content: &str) -> Option<NodeId> {
363        let depth = self.current_depth();
364        let parent = self.current_parent();
365        let node = self.arena.new_comment(depth, content);
366        self.arena.append_child(parent, node);
367        Some(node)
368    }
369
370    /// Handle a doctype declaration.
371    fn handle_doctype(&mut self, content: &str) -> Option<NodeId> {
372        let depth = self.current_depth();
373        let parent = self.current_parent();
374        let node = self.arena.new_doctype(depth, content);
375        self.arena.append_child(parent, node);
376        Some(node)
377    }
378
379    /// Apply implicit close rules based on the new tag.
380    fn apply_implicit_close(&mut self, new_tag: Tag) {
381        // Check if the current open element should be implicitly closed.
382        // Tag is read from the cached tuple — no arena access needed.
383        while self.open_elements.len() > 1 {
384            let (_, current_tag, _) = *self.open_elements.last().unwrap();
385
386            if should_implicit_close(current_tag, new_tag) {
387                self.open_elements.pop();
388            } else {
389                break;
390            }
391        }
392    }
393}
394
395impl fhp_tokenizer::TreeSink for TreeBuilder {
396    fn open_tag(&mut self, tag: Tag, name: &str, attr_raw: &str, self_closing: bool) {
397        self.apply_implicit_close(tag);
398
399        if self.current_depth() >= MAX_DEPTH {
400            return;
401        }
402
403        let depth = self.current_depth();
404        let parent = self.current_parent();
405        let node = self.arena.new_element(tag, depth);
406        if tag == Tag::Unknown {
407            self.arena.set_unknown_tag_name(node, name);
408        }
409
410        // Parse attributes directly into the slab (no intermediate Vec).
411        self.arena.set_attrs_from_raw(node, attr_raw);
412
413        self.arena.append_child(parent, node);
414        if let Some(parent_entry) = self.open_elements.last_mut() {
415            parent_entry.2 += 1;
416            self.arena.set_element_index(node, parent_entry.2);
417        }
418
419        if tag.is_void() || self_closing {
420            if self_closing {
421                self.arena.set_self_closing(node);
422            }
423            if tag.is_void() {
424                self.arena.set_self_closing(node);
425            }
426        } else {
427            self.open_elements.push((node, tag, 0));
428        }
429    }
430
431    fn close_tag(&mut self, tag: Tag, name: &str) {
432        self.handle_close_tag(tag, name);
433    }
434
435    fn text(&mut self, raw: &str) {
436        self.handle_raw_text(raw);
437    }
438
439    fn comment(&mut self, content: &str) {
440        self.handle_comment(content);
441    }
442
443    fn doctype(&mut self, content: &str) {
444        self.handle_doctype(content);
445    }
446
447    fn cdata(&mut self, content: &str) {
448        // CDATA treated as text.
449        self.handle_raw_text(content);
450    }
451}
452
453impl Default for TreeBuilder {
454    fn default() -> Self {
455        Self::new()
456    }
457}
458
459#[cfg(test)]
460mod tests {
461    use super::*;
462    use crate::node::NodeFlags;
463    use std::borrow::Cow;
464
465    fn make_open(tag: Tag) -> Token<'static> {
466        Token::OpenTag {
467            tag,
468            name: Cow::Borrowed(tag.as_str().unwrap_or("unknown")),
469            attributes: vec![],
470            self_closing: false,
471        }
472    }
473
474    fn make_close(tag: Tag) -> Token<'static> {
475        Token::CloseTag {
476            tag,
477            name: Cow::Borrowed(tag.as_str().unwrap_or("unknown")),
478        }
479    }
480
481    fn make_text(content: &'static str) -> Token<'static> {
482        Token::Text {
483            content: Cow::Borrowed(content),
484        }
485    }
486
487    #[test]
488    fn simple_tree() {
489        let mut builder = TreeBuilder::new();
490        builder.process(&make_open(Tag::Div));
491        builder.process(&make_text("hello"));
492        builder.process(&make_close(Tag::Div));
493
494        let (arena, root) = builder.finish();
495
496        // Root should have one child (div).
497        let div = arena.get(root).first_child;
498        assert!(!div.is_null());
499        assert_eq!(arena.get(div).tag, Tag::Div);
500
501        // Div should have one child (text).
502        let text = arena.get(div).first_child;
503        assert!(!text.is_null());
504        assert!(arena.get(text).flags.has(NodeFlags::IS_TEXT));
505        assert_eq!(arena.text(text), "hello");
506    }
507
508    #[test]
509    fn void_element_not_pushed() {
510        let mut builder = TreeBuilder::new();
511        builder.process(&make_open(Tag::Div));
512        builder.process(&Token::OpenTag {
513            tag: Tag::Br,
514            name: Cow::Borrowed("br"),
515            attributes: vec![],
516            self_closing: false,
517        });
518        builder.process(&make_text("after br"));
519        builder.process(&make_close(Tag::Div));
520
521        let (arena, root) = builder.finish();
522        let div = arena.get(root).first_child;
523        let br = arena.get(div).first_child;
524        assert_eq!(arena.get(br).tag, Tag::Br);
525
526        // Text should be a sibling of br, not a child.
527        let text = arena.get(br).next_sibling;
528        assert!(!text.is_null());
529        assert_eq!(arena.text(text), "after br");
530    }
531
532    #[test]
533    fn implicit_close_p() {
534        let mut builder = TreeBuilder::new();
535        builder.process(&make_open(Tag::P));
536        builder.process(&make_text("first"));
537        builder.process(&make_open(Tag::P));
538        builder.process(&make_text("second"));
539        builder.process(&make_close(Tag::P));
540
541        let (arena, root) = builder.finish();
542
543        // Both <p> elements should be children of root.
544        let p1 = arena.get(root).first_child;
545        assert_eq!(arena.get(p1).tag, Tag::P);
546
547        let p2 = arena.get(p1).next_sibling;
548        assert!(!p2.is_null());
549        assert_eq!(arena.get(p2).tag, Tag::P);
550
551        // p1 has "first", p2 has "second".
552        assert_eq!(arena.text(arena.get(p1).first_child), "first");
553        assert_eq!(arena.text(arena.get(p2).first_child), "second");
554    }
555
556    #[test]
557    fn mismatched_close_finds_nearest() {
558        // <div><span></div> — should close both span and div.
559        let mut builder = TreeBuilder::new();
560        builder.process(&make_open(Tag::Div));
561        builder.process(&make_open(Tag::Span));
562        builder.process(&make_text("hi"));
563        builder.process(&make_close(Tag::Div));
564
565        let (arena, root) = builder.finish();
566        let div = arena.get(root).first_child;
567        assert_eq!(arena.get(div).tag, Tag::Div);
568    }
569
570    #[test]
571    fn extra_close_tag_ignored() {
572        let mut builder = TreeBuilder::new();
573        builder.process(&make_close(Tag::Div)); // No matching open — ignored.
574        builder.process(&make_open(Tag::P));
575        builder.process(&make_text("ok"));
576        builder.process(&make_close(Tag::P));
577
578        let (arena, root) = builder.finish();
579        let p = arena.get(root).first_child;
580        assert_eq!(arena.get(p).tag, Tag::P);
581    }
582
583    #[test]
584    fn unknown_close_matches_by_name() {
585        let mut builder = TreeBuilder::new();
586        builder.process(&Token::OpenTag {
587            tag: Tag::Unknown,
588            name: Cow::Borrowed("my-widget"),
589            attributes: vec![],
590            self_closing: false,
591        });
592        builder.process(&Token::OpenTag {
593            tag: Tag::Unknown,
594            name: Cow::Borrowed("x-item"),
595            attributes: vec![],
596            self_closing: false,
597        });
598        builder.process(&Token::CloseTag {
599            tag: Tag::Unknown,
600            name: Cow::Borrowed("my-widget"),
601        });
602
603        let (arena, root) = builder.finish();
604        let my_widget = arena.get(root).first_child;
605        let x_item = arena.get(my_widget).first_child;
606        assert_eq!(arena.unknown_tag_name(my_widget), Some("my-widget"));
607        assert_eq!(arena.unknown_tag_name(x_item), Some("x-item"));
608    }
609
610    #[test]
611    fn should_implicit_close_rules() {
612        assert!(should_implicit_close(Tag::P, Tag::P));
613        assert!(should_implicit_close(Tag::Li, Tag::Li));
614        assert!(should_implicit_close(Tag::Td, Tag::Td));
615        assert!(should_implicit_close(Tag::Td, Tag::Th));
616        assert!(should_implicit_close(Tag::Th, Tag::Td));
617        assert!(should_implicit_close(Tag::Tr, Tag::Tr));
618
619        assert!(!should_implicit_close(Tag::Div, Tag::Div));
620        assert!(!should_implicit_close(Tag::Span, Tag::Span));
621        assert!(!should_implicit_close(Tag::P, Tag::Span));
622    }
623}