Skip to main content

fhp_tree/
arena.rs

1//! Arena allocator for DOM nodes, text, and attributes.
2//!
3//! All nodes live in a single contiguous `Vec<Node>`, giving cache-friendly
4//! traversal. Text content and attributes are stored in separate slabs,
5//! referenced by offset+length from each [`Node`](crate::node::Node).
6
7use fhp_core::hash::{class_bloom_bit, selector_hash};
8use fhp_core::tag::Tag;
9
10use crate::node::{Node, NodeFlags, NodeId};
11
12/// A compact attribute stored in the attribute slab.
13///
14/// Names and values are stored as offsets into `Arena::attr_str_slab` rather
15/// than separate heap allocations. Use [`Arena::attr_name`] and
16/// [`Arena::attr_value`] to access the strings.
17#[derive(Clone, Debug)]
18pub struct Attribute {
19    name_offset: u32,
20    name_len: u16,
21    value_offset: u32,
22    value_len: u16,
23}
24
25/// Number of tag index buckets (Tag is `repr(u8)`, 256 possible values).
26const TAG_INDEX_SIZE: usize = 256;
27
28/// Arena-based storage for all DOM nodes, text content, and attributes.
29///
30/// Nodes are stored in a contiguous `Vec<Node>` for cache-line-friendly access.
31/// Text and attributes are stored in separate slabs and referenced by
32/// offset+length from each node.
33pub struct Arena {
34    /// All nodes in insertion order.
35    pub(crate) nodes: Vec<Node>,
36    /// All text content concatenated (for entity-decoded or owned text).
37    pub(crate) text_slab: Vec<u8>,
38    /// All attributes in insertion order.
39    pub(crate) attr_slab: Vec<Attribute>,
40    /// All attribute name and value bytes concatenated.
41    pub(crate) attr_str_slab: Vec<u8>,
42    /// Owned copy of the original input source.
43    ///
44    /// Text nodes that reference entity-free (borrowed) regions of the input
45    /// store offsets into this buffer via [`NodeFlags::IS_TEXT_FROM_SOURCE`].
46    /// Empty for streaming parsers.
47    pub(crate) source: Vec<u8>,
48    /// Pre-built tag → NodeId index, populated during tree construction.
49    ///
50    /// Indexed by `Tag as u8`. Each bucket contains NodeIds of elements with
51    /// that tag, in document order. Built inline during `open_tag` to avoid
52    /// a separate DFS pass.
53    pub(crate) tag_index: Option<Box<[Vec<NodeId>; TAG_INDEX_SIZE]>>,
54}
55
56impl Arena {
57    /// Create a new empty arena.
58    pub fn new() -> Self {
59        Self {
60            nodes: Vec::new(),
61            text_slab: Vec::new(),
62            attr_slab: Vec::new(),
63            attr_str_slab: Vec::new(),
64            source: Vec::new(),
65            tag_index: None,
66        }
67    }
68
69    /// Create a new arena with pre-allocated capacity.
70    pub fn with_capacity(node_cap: usize, text_cap: usize, attr_cap: usize) -> Self {
71        Self {
72            nodes: Vec::with_capacity(node_cap),
73            text_slab: Vec::with_capacity(text_cap),
74            attr_slab: Vec::with_capacity(attr_cap),
75            attr_str_slab: Vec::with_capacity(attr_cap * 32),
76            source: Vec::new(),
77            tag_index: None,
78        }
79    }
80
81    /// Enable the inline tag index.
82    ///
83    /// Once enabled, every [`Arena::new_element`] call appends the node id to
84    /// the corresponding tag bucket. Consumers can retrieve the index via
85    /// [`Arena::tag_index`].
86    pub fn enable_tag_index(&mut self) {
87        if self.tag_index.is_none() {
88            // Use a boxed array to avoid 256 * 24 = 6 KB on the stack.
89            self.tag_index = Some(Box::new(std::array::from_fn(|_| Vec::new())));
90        }
91    }
92
93    /// Get the pre-built tag index, if it was enabled during construction.
94    pub fn tag_index(&self) -> Option<&[Vec<NodeId>; TAG_INDEX_SIZE]> {
95        self.tag_index.as_deref()
96    }
97
98    /// Allocate a new element node and return its id.
99    pub fn new_element(&mut self, tag: Tag, depth: u16) -> NodeId {
100        let id = NodeId(self.nodes.len() as u32);
101        self.nodes.push(Node::new_element(tag, depth));
102        // Populate inline tag index if enabled.
103        if let Some(ref mut idx) = self.tag_index {
104            idx[tag as u8 as usize].push(id);
105        }
106        id
107    }
108
109    /// Store the original tag name for an unknown/custom element.
110    pub fn set_unknown_tag_name(&mut self, node: NodeId, tag_name: &str) {
111        if tag_name.is_empty() || self.nodes[node.index()].tag != Tag::Unknown {
112            return;
113        }
114        let offset = self.text_slab.len() as u32;
115        let len = tag_name.len() as u32;
116        self.text_slab.extend_from_slice(tag_name.as_bytes());
117        let n = &mut self.nodes[node.index()];
118        n.text_offset = offset;
119        n.text_len = len;
120    }
121
122    /// Allocate a new text node, storing content in the text slab.
123    pub fn new_text(&mut self, depth: u16, text: &str) -> NodeId {
124        let offset = self.text_slab.len() as u32;
125        let len = text.len() as u32;
126        self.text_slab.extend_from_slice(text.as_bytes());
127        let id = NodeId(self.nodes.len() as u32);
128        self.nodes.push(Node::new_text(depth, offset, len));
129        id
130    }
131
132    /// Allocate a text node that references a region of the original source.
133    ///
134    /// Instead of copying content to the text slab, this stores a
135    /// `(source_offset, len)` pair and sets [`NodeFlags::IS_TEXT_FROM_SOURCE`].
136    /// The source must have been set via [`Arena::set_source`] before calling
137    /// this method.
138    pub fn new_text_ref(&mut self, depth: u16, source_offset: u32, len: u32) -> NodeId {
139        let id = NodeId(self.nodes.len() as u32);
140        let mut node = Node::new_text(depth, source_offset, len);
141        node.flags.set(NodeFlags::IS_TEXT_FROM_SOURCE);
142        self.nodes.push(node);
143        id
144    }
145
146    /// Store an owned copy of the input source for source-backed text nodes.
147    pub fn set_source(&mut self, input: &str) {
148        self.source = input.as_bytes().to_vec();
149    }
150
151    /// Transfer an already-owned `String` as the source buffer (zero copy).
152    ///
153    /// When the caller owns the input `String` (e.g., from an HTTP response),
154    /// this avoids the memcpy that [`set_source`](Arena::set_source) performs.
155    pub fn set_source_owned(&mut self, source: String) {
156        self.source = source.into_bytes();
157    }
158
159    /// Allocate a new comment node, storing content in the text slab.
160    pub fn new_comment(&mut self, depth: u16, text: &str) -> NodeId {
161        let offset = self.text_slab.len() as u32;
162        let len = text.len() as u32;
163        self.text_slab.extend_from_slice(text.as_bytes());
164        let id = NodeId(self.nodes.len() as u32);
165        self.nodes.push(Node::new_comment(depth, offset, len));
166        id
167    }
168
169    /// Allocate a new doctype node, storing content in the text slab.
170    pub fn new_doctype(&mut self, depth: u16, text: &str) -> NodeId {
171        let offset = self.text_slab.len() as u32;
172        let len = text.len() as u32;
173        self.text_slab.extend_from_slice(text.as_bytes());
174        let id = NodeId(self.nodes.len() as u32);
175        self.nodes.push(Node::new_doctype(depth, offset, len));
176        id
177    }
178
179    /// Set attributes for a node from tokenizer attributes.
180    pub fn set_attrs(&mut self, node: NodeId, attrs: &[fhp_tokenizer::token::Attribute<'_>]) {
181        if attrs.is_empty() {
182            return;
183        }
184        let offset = self.attr_slab.len() as u32;
185        let count = attrs.len().min(255) as u8;
186
187        for attr in &attrs[..count as usize] {
188            let name_offset = self.attr_str_slab.len() as u32;
189            self.attr_str_slab.extend_from_slice(attr.name.as_bytes());
190            let name_len = attr.name.len() as u16;
191
192            let (value_offset, value_len) = if let Some(ref v) = attr.value {
193                let vo = self.attr_str_slab.len() as u32;
194                self.attr_str_slab.extend_from_slice(v.as_bytes());
195                (vo, v.len() as u16)
196            } else {
197                (0, 0)
198            };
199
200            self.attr_slab.push(Attribute {
201                name_offset,
202                name_len,
203                value_offset,
204                value_len,
205            });
206        }
207
208        let n = &mut self.nodes[node.index()];
209        n.attr_offset = offset;
210        n.attr_count = count;
211        n.flags.set(NodeFlags::HAS_ATTRS);
212
213        // Compute class_hash and id_hash from the just-added attributes.
214        self.compute_node_hashes(node, offset, count);
215    }
216
217    /// Store raw attribute bytes for lazy parsing.
218    ///
219    /// Instead of parsing attributes immediately, stores the raw attribute
220    /// region in `attr_str_slab` and records offset/len on the node. Attributes
221    /// are parsed on first access via [`Arena::attrs`] or
222    /// [`Arena::ensure_attrs_parsed`].
223    pub fn set_attrs_raw_lazy(&mut self, node: NodeId, attr_raw: &str) {
224        let trimmed = attr_raw.as_bytes();
225        // Quick scan: skip if all whitespace.
226        let has_content = trimmed.iter().any(|&b| !is_attr_whitespace(b));
227        if !has_content {
228            return;
229        }
230        let offset = self.attr_str_slab.len() as u32;
231        self.attr_str_slab.extend_from_slice(trimmed);
232        let n = &mut self.nodes[node.index()];
233        n.attr_raw_offset = offset;
234        n.attr_raw_len = trimmed.len().min(u16::MAX as usize) as u16;
235        n.flags.set(NodeFlags::HAS_ATTRS);
236    }
237
238    /// Parse attributes directly from a raw attribute region into the slab.
239    ///
240    /// Skips all intermediate `Vec<Attribute>` allocation — names and values
241    /// are written directly to `attr_str_slab` and compact `Attribute` structs
242    /// are pushed to `attr_slab` in a single pass.
243    pub fn set_attrs_from_raw(&mut self, node: NodeId, attr_raw: &str) {
244        let bytes = attr_raw.as_bytes();
245        let end = bytes.len();
246        if end == 0 {
247            return;
248        }
249
250        let slab_offset = self.attr_slab.len() as u32;
251        let mut count: u8 = 0;
252        let mut pos = 0;
253
254        loop {
255            // Skip whitespace using fast byte scan.
256            pos += bytes[pos..end]
257                .iter()
258                .position(|&b| !is_attr_whitespace(b))
259                .unwrap_or(end - pos);
260            if pos >= end || count == 255 {
261                break;
262            }
263
264            // Attribute name.
265            let name_start = pos;
266            while pos < end && !is_attr_name_end(bytes[pos]) {
267                pos += 1;
268            }
269            if name_start == pos {
270                // Not a valid name char — skip it.
271                pos += 1;
272                continue;
273            }
274
275            let name_slab_offset = self.attr_str_slab.len() as u32;
276            self.attr_str_slab
277                .extend_from_slice(&bytes[name_start..pos]);
278            let name_len = (pos - name_start) as u16;
279
280            // Skip whitespace using fast byte scan.
281            pos += bytes[pos..end]
282                .iter()
283                .position(|&b| !is_attr_whitespace(b))
284                .unwrap_or(end - pos);
285
286            // Check for `=`.
287            if pos < end && bytes[pos] == b'=' {
288                pos += 1;
289
290                // Skip whitespace using fast byte scan.
291                pos += bytes[pos..end]
292                    .iter()
293                    .position(|&b| !is_attr_whitespace(b))
294                    .unwrap_or(end - pos);
295
296                // Parse value.
297                if pos < end && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
298                    // Quoted value — use memchr for SIMD-accelerated scan.
299                    let quote = bytes[pos];
300                    pos += 1;
301                    let val_start = pos;
302                    if let Some(found) = memchr::memchr(quote, &bytes[pos..end]) {
303                        pos += found;
304                    } else {
305                        pos = end;
306                    }
307                    let val_end = pos;
308                    if pos < end {
309                        pos += 1; // skip closing quote
310                    }
311                    let raw_value = &attr_raw[val_start..val_end];
312                    let (value_offset, value_len) = self.push_attr_value(raw_value);
313                    self.attr_slab.push(Attribute {
314                        name_offset: name_slab_offset,
315                        name_len,
316                        value_offset,
317                        value_len,
318                    });
319                } else {
320                    // Unquoted value.
321                    let val_start = pos;
322                    while pos < end && !is_attr_whitespace(bytes[pos]) && bytes[pos] != b'>' {
323                        pos += 1;
324                    }
325                    let raw_value = &attr_raw[val_start..pos];
326                    let (value_offset, value_len) = self.push_attr_value(raw_value);
327                    self.attr_slab.push(Attribute {
328                        name_offset: name_slab_offset,
329                        name_len,
330                        value_offset,
331                        value_len,
332                    });
333                }
334            } else {
335                // Boolean attribute (no value).
336                self.attr_slab.push(Attribute {
337                    name_offset: name_slab_offset,
338                    name_len,
339                    value_offset: 0,
340                    value_len: 0,
341                });
342            }
343
344            count += 1;
345        }
346
347        if count > 0 {
348            let n = &mut self.nodes[node.index()];
349            n.attr_offset = slab_offset;
350            n.attr_count = count;
351            n.flags.set(NodeFlags::HAS_ATTRS);
352
353            // Compute class_hash (bloom) and id_hash (exact) from parsed attrs.
354            self.compute_node_hashes(node, slab_offset, count);
355        }
356    }
357
358    /// Compute class bloom hash and id hash from a node's just-parsed attributes.
359    ///
360    /// Scans the attribute slab for `class` and `id` attributes and stores
361    /// the computed hashes on the node for fast selector rejection.
362    fn compute_node_hashes(&mut self, node: NodeId, slab_offset: u32, count: u8) {
363        let mut class_hash: u64 = 0;
364        let mut id_hash: u32 = 0;
365        let start = slab_offset as usize;
366        let end = start + count as usize;
367
368        for i in start..end {
369            let attr = &self.attr_slab[i];
370            let name_start = attr.name_offset as usize;
371            let name_end = name_start + attr.name_len as usize;
372            let name_bytes = &self.attr_str_slab[name_start..name_end];
373
374            if name_bytes.eq_ignore_ascii_case(b"class") && attr.value_len > 0 {
375                let val_start = attr.value_offset as usize;
376                let val_end = val_start + attr.value_len as usize;
377                let val = &self.attr_str_slab[val_start..val_end];
378                // Build bloom: OR in a bit for each whitespace-separated token.
379                let mut pos = 0;
380                while pos < val.len() {
381                    // Skip whitespace.
382                    while pos < val.len() && val[pos].is_ascii_whitespace() {
383                        pos += 1;
384                    }
385                    let token_start = pos;
386                    while pos < val.len() && !val[pos].is_ascii_whitespace() {
387                        pos += 1;
388                    }
389                    if pos > token_start {
390                        class_hash |= class_bloom_bit(&val[token_start..pos]);
391                    }
392                }
393            } else if name_bytes.eq_ignore_ascii_case(b"id") && attr.value_len > 0 {
394                let val_start = attr.value_offset as usize;
395                let val_end = val_start + attr.value_len as usize;
396                id_hash = selector_hash(&self.attr_str_slab[val_start..val_end]);
397            }
398        }
399
400        let n = &mut self.nodes[node.index()];
401        n.class_hash = class_hash;
402        n.id_hash = id_hash;
403    }
404
405    /// Write an attribute value to the string slab, with optional entity decoding.
406    #[cfg(feature = "entity-decode")]
407    fn push_attr_value(&mut self, raw_value: &str) -> (u32, u16) {
408        let offset = self.attr_str_slab.len() as u32;
409        let decoded = fhp_tokenizer::entity::decode_entities(raw_value);
410        self.attr_str_slab.extend_from_slice(decoded.as_bytes());
411        (offset, decoded.len() as u16)
412    }
413
414    /// Write an attribute value to the string slab (no entity decoding).
415    #[cfg(not(feature = "entity-decode"))]
416    fn push_attr_value(&mut self, raw_value: &str) -> (u32, u16) {
417        let offset = self.attr_str_slab.len() as u32;
418        self.attr_str_slab.extend_from_slice(raw_value.as_bytes());
419        (offset, raw_value.len() as u16)
420    }
421
422    /// Set the 1-based element sibling index for a node.
423    ///
424    /// Called by [`TreeBuilder`] after appending an element child.
425    #[inline]
426    pub fn set_element_index(&mut self, node: NodeId, index: u16) {
427        self.nodes[node.index()].element_index = index;
428    }
429
430    /// Set the self-closing flag on a node.
431    pub fn set_self_closing(&mut self, node: NodeId) {
432        self.nodes[node.index()]
433            .flags
434            .set(NodeFlags::IS_SELF_CLOSING);
435    }
436
437    /// Append `child` as the last child of `parent`.
438    ///
439    /// Updates all tree links: parent, first_child, last_child, prev_sibling,
440    /// next_sibling. Uses unchecked indexing since NodeIds are always valid
441    /// indices created by this arena.
442    pub fn append_child(&mut self, parent: NodeId, child: NodeId) {
443        let nodes = self.nodes.as_mut_ptr();
444        // SAFETY: parent and child indices were created by this arena via
445        // new_element/new_text/etc., so they are always in bounds.
446        unsafe {
447            let p = &mut *nodes.add(parent.index());
448            let c = &mut *nodes.add(child.index());
449            let last = p.last_child;
450            c.parent = parent;
451            if last.is_null() {
452                p.first_child = child;
453            } else {
454                (*nodes.add(last.index())).next_sibling = child;
455                c.prev_sibling = last;
456            }
457            p.last_child = child;
458            p.flags.set(NodeFlags::HAS_CHILDREN);
459        }
460    }
461
462    /// Get the name of an attribute.
463    #[inline]
464    pub fn attr_name(&self, attr: &Attribute) -> &str {
465        let start = attr.name_offset as usize;
466        let end = start + attr.name_len as usize;
467        // SAFETY: attr names are sourced from tokenizer `&str` slices (valid UTF-8).
468        unsafe { std::str::from_utf8_unchecked(&self.attr_str_slab[start..end]) }
469    }
470
471    /// Get the value of an attribute, or `None` for boolean attributes.
472    #[inline]
473    pub fn attr_value(&self, attr: &Attribute) -> Option<&str> {
474        if attr.value_len == 0 {
475            return None;
476        }
477        let start = attr.value_offset as usize;
478        let end = start + attr.value_len as usize;
479        // SAFETY: attr values are sourced from tokenizer `&str` slices (valid UTF-8).
480        Some(unsafe { std::str::from_utf8_unchecked(&self.attr_str_slab[start..end]) })
481    }
482
483    /// Get the attributes for a node (read-only, requires prior parse).
484    ///
485    /// If the node has lazy (unparsed) attributes, returns an empty slice.
486    /// Call [`Arena::ensure_attrs_parsed`] first to guarantee parsing.
487    #[inline]
488    pub fn attrs(&self, node: NodeId) -> &[Attribute] {
489        let n = &self.nodes[node.index()];
490        if n.attr_count == 0 {
491            return &[];
492        }
493        let start = n.attr_offset as usize;
494        let end = start + n.attr_count as usize;
495        &self.attr_slab[start..end]
496    }
497
498    /// Returns `true` if the node has lazy (unparsed) attributes.
499    #[inline]
500    pub fn has_lazy_attrs(&self, node: NodeId) -> bool {
501        let n = &self.nodes[node.index()];
502        n.attr_count == 0 && n.attr_raw_len > 0
503    }
504
505    /// Ensure the node's attributes are parsed into the slab.
506    ///
507    /// If the node has lazy attributes (stored as raw bytes), parses them
508    /// now and updates the node's attr_offset/attr_count. No-op if already
509    /// parsed or if the node has no attributes.
510    pub fn ensure_attrs_parsed(&mut self, node: NodeId) {
511        let n = &self.nodes[node.index()];
512        if n.attr_count > 0 || n.attr_raw_len == 0 {
513            return;
514        }
515        let raw_offset = n.attr_raw_offset as usize;
516        let raw_len = n.attr_raw_len as usize;
517        // SAFETY: attr_str_slab bytes were originally sourced from &str (valid UTF-8).
518        let attr_raw = unsafe {
519            std::str::from_utf8_unchecked(&self.attr_str_slab[raw_offset..raw_offset + raw_len])
520        }
521        .to_owned();
522        self.parse_raw_attrs_into_slab(node, &attr_raw);
523    }
524
525    /// Get the attributes for a node, parsing lazily if needed.
526    ///
527    /// Convenience method that combines [`Arena::ensure_attrs_parsed`] and
528    /// [`Arena::attrs`].
529    pub fn attrs_mut(&mut self, node: NodeId) -> &[Attribute] {
530        self.ensure_attrs_parsed(node);
531        let n = &self.nodes[node.index()];
532        if n.attr_count == 0 {
533            return &[];
534        }
535        let start = n.attr_offset as usize;
536        let end = start + n.attr_count as usize;
537        &self.attr_slab[start..end]
538    }
539
540    /// Parse raw attribute bytes into the slab (internal helper for lazy parsing).
541    fn parse_raw_attrs_into_slab(&mut self, node: NodeId, attr_raw: &str) {
542        let bytes = attr_raw.as_bytes();
543        let end = bytes.len();
544        if end == 0 {
545            return;
546        }
547
548        let slab_offset = self.attr_slab.len() as u32;
549        let mut count: u8 = 0;
550        let mut pos = 0;
551
552        loop {
553            pos += bytes[pos..end]
554                .iter()
555                .position(|&b| !is_attr_whitespace(b))
556                .unwrap_or(end - pos);
557            if pos >= end || count == 255 {
558                break;
559            }
560
561            let name_start = pos;
562            while pos < end && !is_attr_name_end(bytes[pos]) {
563                pos += 1;
564            }
565            if name_start == pos {
566                pos += 1;
567                continue;
568            }
569
570            let name_slab_offset = self.attr_str_slab.len() as u32;
571            self.attr_str_slab
572                .extend_from_slice(&bytes[name_start..pos]);
573            let name_len = (pos - name_start) as u16;
574
575            pos += bytes[pos..end]
576                .iter()
577                .position(|&b| !is_attr_whitespace(b))
578                .unwrap_or(end - pos);
579
580            if pos < end && bytes[pos] == b'=' {
581                pos += 1;
582
583                pos += bytes[pos..end]
584                    .iter()
585                    .position(|&b| !is_attr_whitespace(b))
586                    .unwrap_or(end - pos);
587
588                if pos < end && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
589                    let quote = bytes[pos];
590                    pos += 1;
591                    let val_start = pos;
592                    if let Some(found) = memchr::memchr(quote, &bytes[pos..end]) {
593                        pos += found;
594                    } else {
595                        pos = end;
596                    }
597                    let val_end = pos;
598                    if pos < end {
599                        pos += 1;
600                    }
601                    let raw_value = &attr_raw[val_start..val_end];
602                    let (value_offset, value_len) = self.push_attr_value(raw_value);
603                    self.attr_slab.push(Attribute {
604                        name_offset: name_slab_offset,
605                        name_len,
606                        value_offset,
607                        value_len,
608                    });
609                } else {
610                    let val_start = pos;
611                    while pos < end && !is_attr_whitespace(bytes[pos]) && bytes[pos] != b'>' {
612                        pos += 1;
613                    }
614                    let raw_value = &attr_raw[val_start..pos];
615                    let (value_offset, value_len) = self.push_attr_value(raw_value);
616                    self.attr_slab.push(Attribute {
617                        name_offset: name_slab_offset,
618                        name_len,
619                        value_offset,
620                        value_len,
621                    });
622                }
623            } else {
624                self.attr_slab.push(Attribute {
625                    name_offset: name_slab_offset,
626                    name_len,
627                    value_offset: 0,
628                    value_len: 0,
629                });
630            }
631
632            count += 1;
633        }
634
635        if count > 0 {
636            let n = &mut self.nodes[node.index()];
637            n.attr_offset = slab_offset;
638            n.attr_count = count;
639
640            // Compute hashes for lazy-parsed attributes.
641            self.compute_node_hashes(node, slab_offset, count);
642        }
643    }
644
645    /// Get the text content for a node (direct text, not recursive).
646    #[inline]
647    pub fn text(&self, node: NodeId) -> &str {
648        let n = &self.nodes[node.index()];
649        if n.text_len == 0 {
650            return "";
651        }
652        let start = n.text_offset as usize;
653        let end = start + n.text_len as usize;
654        if n.flags.has(NodeFlags::IS_TEXT) && n.flags.has(NodeFlags::IS_TEXT_FROM_SOURCE) {
655            // SAFETY: source is a copy of the input &str (valid UTF-8).
656            unsafe { std::str::from_utf8_unchecked(&self.source[start..end]) }
657        } else {
658            // SAFETY: text slab is always valid UTF-8 (we only write str bytes).
659            unsafe { std::str::from_utf8_unchecked(&self.text_slab[start..end]) }
660        }
661    }
662
663    /// Get the preserved name for an unknown/custom element.
664    #[inline]
665    pub fn unknown_tag_name(&self, node: NodeId) -> Option<&str> {
666        let n = &self.nodes[node.index()];
667        if n.tag != Tag::Unknown || n.text_len == 0 {
668            return None;
669        }
670        let start = n.text_offset as usize;
671        let end = start + n.text_len as usize;
672        // SAFETY: the tag name is sourced from tokenizer `&str` slices.
673        Some(unsafe { std::str::from_utf8_unchecked(&self.text_slab[start..end]) })
674    }
675
676    /// Get a reference to a node by id.
677    #[inline]
678    pub fn get(&self, id: NodeId) -> &Node {
679        &self.nodes[id.index()]
680    }
681
682    /// Get a mutable reference to a node by id.
683    #[inline]
684    pub fn get_mut(&mut self, id: NodeId) -> &mut Node {
685        &mut self.nodes[id.index()]
686    }
687
688    /// Parse all lazy (unparsed) attributes in a single batch pass.
689    ///
690    /// Called after tree building completes to resolve all deferred attribute
691    /// parsing before handing the arena to consumers. This gives better cache
692    /// locality than interleaving attr parsing with node creation.
693    pub fn ensure_all_attrs_parsed(&mut self) {
694        for i in 0..self.nodes.len() {
695            let n = &self.nodes[i];
696            if n.attr_count == 0 && n.attr_raw_len > 0 {
697                let raw_offset = n.attr_raw_offset as usize;
698                let raw_len = n.attr_raw_len as usize;
699                // SAFETY: attr_str_slab bytes were sourced from &str (valid UTF-8).
700                let attr_raw = unsafe {
701                    std::str::from_utf8_unchecked(
702                        &self.attr_str_slab[raw_offset..raw_offset + raw_len],
703                    )
704                }
705                .to_owned();
706                let node_id = NodeId(i as u32);
707                self.parse_raw_attrs_into_slab(node_id, &attr_raw);
708            }
709        }
710    }
711
712    /// Total number of nodes in the arena.
713    #[inline]
714    pub fn len(&self) -> usize {
715        self.nodes.len()
716    }
717
718    /// Returns `true` if the arena contains no nodes.
719    #[inline]
720    pub fn is_empty(&self) -> bool {
721        self.nodes.is_empty()
722    }
723}
724
725impl Default for Arena {
726    fn default() -> Self {
727        Self::new()
728    }
729}
730
731/// Check if a byte is ASCII whitespace (for attribute parsing).
732#[inline(always)]
733fn is_attr_whitespace(b: u8) -> bool {
734    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
735}
736
737/// Check if a byte terminates an attribute name.
738#[inline(always)]
739fn is_attr_name_end(b: u8) -> bool {
740    matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'=' | b'/' | b'>')
741}
742
743#[cfg(test)]
744mod tests {
745    use super::*;
746    use std::borrow::Cow;
747
748    #[test]
749    fn new_element_and_get() {
750        let mut arena = Arena::new();
751        let id = arena.new_element(Tag::Div, 0);
752        assert_eq!(id, NodeId(0));
753        assert_eq!(arena.get(id).tag, Tag::Div);
754        assert_eq!(arena.get(id).depth, 0);
755    }
756
757    #[test]
758    fn new_text_and_read() {
759        let mut arena = Arena::new();
760        let id = arena.new_text(1, "hello world");
761        assert!(arena.get(id).flags.has(NodeFlags::IS_TEXT));
762        assert_eq!(arena.text(id), "hello world");
763    }
764
765    #[test]
766    fn append_child_single() {
767        let mut arena = Arena::new();
768        let parent = arena.new_element(Tag::Div, 0);
769        let child = arena.new_element(Tag::Span, 1);
770        arena.append_child(parent, child);
771
772        assert_eq!(arena.get(parent).first_child, child);
773        assert_eq!(arena.get(parent).last_child, child);
774        assert_eq!(arena.get(child).parent, parent);
775        assert!(arena.get(child).next_sibling.is_null());
776        assert!(arena.get(child).prev_sibling.is_null());
777    }
778
779    #[test]
780    fn append_child_multiple() {
781        let mut arena = Arena::new();
782        let parent = arena.new_element(Tag::Div, 0);
783        let c1 = arena.new_element(Tag::Span, 1);
784        let c2 = arena.new_element(Tag::P, 1);
785        let c3 = arena.new_element(Tag::A, 1);
786
787        arena.append_child(parent, c1);
788        arena.append_child(parent, c2);
789        arena.append_child(parent, c3);
790
791        assert_eq!(arena.get(parent).first_child, c1);
792        assert_eq!(arena.get(parent).last_child, c3);
793
794        assert_eq!(arena.get(c1).next_sibling, c2);
795        assert!(arena.get(c1).prev_sibling.is_null());
796
797        assert_eq!(arena.get(c2).prev_sibling, c1);
798        assert_eq!(arena.get(c2).next_sibling, c3);
799
800        assert_eq!(arena.get(c3).prev_sibling, c2);
801        assert!(arena.get(c3).next_sibling.is_null());
802    }
803
804    #[test]
805    fn attrs_roundtrip() {
806        use fhp_tokenizer::token::Attribute as TokAttr;
807
808        let mut arena = Arena::new();
809        let id = arena.new_element(Tag::A, 0);
810
811        let tok_attrs = vec![
812            TokAttr {
813                name: Cow::Borrowed("href"),
814                value: Some(Cow::Borrowed("https://example.com")),
815            },
816            TokAttr {
817                name: Cow::Borrowed("class"),
818                value: Some(Cow::Borrowed("link")),
819            },
820        ];
821        arena.set_attrs(id, &tok_attrs);
822
823        let attrs = arena.attrs(id);
824        assert_eq!(attrs.len(), 2);
825        assert_eq!(arena.attr_name(&attrs[0]), "href");
826        assert_eq!(arena.attr_value(&attrs[0]), Some("https://example.com"));
827        assert_eq!(arena.attr_name(&attrs[1]), "class");
828        assert_eq!(arena.attr_value(&attrs[1]), Some("link"));
829    }
830
831    #[test]
832    fn empty_attrs() {
833        let mut arena = Arena::new();
834        let id = arena.new_element(Tag::Div, 0);
835        assert!(arena.attrs(id).is_empty());
836    }
837
838    #[test]
839    fn arena_len() {
840        let mut arena = Arena::new();
841        assert!(arena.is_empty());
842        arena.new_element(Tag::Div, 0);
843        arena.new_text(1, "hi");
844        assert_eq!(arena.len(), 2);
845    }
846}