1use fhp_core::hash::{class_bloom_bit, selector_hash};
8use fhp_core::tag::Tag;
9
10use crate::node::{Node, NodeFlags, NodeId};
11
12#[derive(Clone, Debug)]
18pub struct Attribute {
19 name_offset: u32,
20 name_len: u16,
21 value_offset: u32,
22 value_len: u16,
23}
24
25const TAG_INDEX_SIZE: usize = 256;
27
28pub struct Arena {
34 pub(crate) nodes: Vec<Node>,
36 pub(crate) text_slab: Vec<u8>,
38 pub(crate) attr_slab: Vec<Attribute>,
40 pub(crate) attr_str_slab: Vec<u8>,
42 pub(crate) source: Vec<u8>,
48 pub(crate) tag_index: Option<Box<[Vec<NodeId>; TAG_INDEX_SIZE]>>,
54}
55
56impl Arena {
57 pub fn new() -> Self {
59 Self {
60 nodes: Vec::new(),
61 text_slab: Vec::new(),
62 attr_slab: Vec::new(),
63 attr_str_slab: Vec::new(),
64 source: Vec::new(),
65 tag_index: None,
66 }
67 }
68
69 pub fn with_capacity(node_cap: usize, text_cap: usize, attr_cap: usize) -> Self {
71 Self {
72 nodes: Vec::with_capacity(node_cap),
73 text_slab: Vec::with_capacity(text_cap),
74 attr_slab: Vec::with_capacity(attr_cap),
75 attr_str_slab: Vec::with_capacity(attr_cap * 32),
76 source: Vec::new(),
77 tag_index: None,
78 }
79 }
80
81 pub fn enable_tag_index(&mut self) {
87 if self.tag_index.is_none() {
88 self.tag_index = Some(Box::new(std::array::from_fn(|_| Vec::new())));
90 }
91 }
92
93 pub fn tag_index(&self) -> Option<&[Vec<NodeId>; TAG_INDEX_SIZE]> {
95 self.tag_index.as_deref()
96 }
97
98 pub fn new_element(&mut self, tag: Tag, depth: u16) -> NodeId {
100 let id = NodeId(self.nodes.len() as u32);
101 self.nodes.push(Node::new_element(tag, depth));
102 if let Some(ref mut idx) = self.tag_index {
104 idx[tag as u8 as usize].push(id);
105 }
106 id
107 }
108
109 pub fn set_unknown_tag_name(&mut self, node: NodeId, tag_name: &str) {
111 if tag_name.is_empty() || self.nodes[node.index()].tag != Tag::Unknown {
112 return;
113 }
114 let offset = self.text_slab.len() as u32;
115 let len = tag_name.len() as u32;
116 self.text_slab.extend_from_slice(tag_name.as_bytes());
117 let n = &mut self.nodes[node.index()];
118 n.text_offset = offset;
119 n.text_len = len;
120 }
121
122 pub fn new_text(&mut self, depth: u16, text: &str) -> NodeId {
124 let offset = self.text_slab.len() as u32;
125 let len = text.len() as u32;
126 self.text_slab.extend_from_slice(text.as_bytes());
127 let id = NodeId(self.nodes.len() as u32);
128 self.nodes.push(Node::new_text(depth, offset, len));
129 id
130 }
131
132 pub fn new_text_ref(&mut self, depth: u16, source_offset: u32, len: u32) -> NodeId {
139 let id = NodeId(self.nodes.len() as u32);
140 let mut node = Node::new_text(depth, source_offset, len);
141 node.flags.set(NodeFlags::IS_TEXT_FROM_SOURCE);
142 self.nodes.push(node);
143 id
144 }
145
146 pub fn set_source(&mut self, input: &str) {
148 self.source = input.as_bytes().to_vec();
149 }
150
151 pub fn set_source_owned(&mut self, source: String) {
156 self.source = source.into_bytes();
157 }
158
159 pub fn new_comment(&mut self, depth: u16, text: &str) -> NodeId {
161 let offset = self.text_slab.len() as u32;
162 let len = text.len() as u32;
163 self.text_slab.extend_from_slice(text.as_bytes());
164 let id = NodeId(self.nodes.len() as u32);
165 self.nodes.push(Node::new_comment(depth, offset, len));
166 id
167 }
168
169 pub fn new_doctype(&mut self, depth: u16, text: &str) -> NodeId {
171 let offset = self.text_slab.len() as u32;
172 let len = text.len() as u32;
173 self.text_slab.extend_from_slice(text.as_bytes());
174 let id = NodeId(self.nodes.len() as u32);
175 self.nodes.push(Node::new_doctype(depth, offset, len));
176 id
177 }
178
179 pub fn set_attrs(&mut self, node: NodeId, attrs: &[fhp_tokenizer::token::Attribute<'_>]) {
181 if attrs.is_empty() {
182 return;
183 }
184 let offset = self.attr_slab.len() as u32;
185 let count = attrs.len().min(255) as u8;
186
187 for attr in &attrs[..count as usize] {
188 let name_offset = self.attr_str_slab.len() as u32;
189 self.attr_str_slab.extend_from_slice(attr.name.as_bytes());
190 let name_len = attr.name.len() as u16;
191
192 let (value_offset, value_len) = if let Some(ref v) = attr.value {
193 let vo = self.attr_str_slab.len() as u32;
194 self.attr_str_slab.extend_from_slice(v.as_bytes());
195 (vo, v.len() as u16)
196 } else {
197 (0, 0)
198 };
199
200 self.attr_slab.push(Attribute {
201 name_offset,
202 name_len,
203 value_offset,
204 value_len,
205 });
206 }
207
208 let n = &mut self.nodes[node.index()];
209 n.attr_offset = offset;
210 n.attr_count = count;
211 n.flags.set(NodeFlags::HAS_ATTRS);
212
213 self.compute_node_hashes(node, offset, count);
215 }
216
217 pub fn set_attrs_raw_lazy(&mut self, node: NodeId, attr_raw: &str) {
224 let trimmed = attr_raw.as_bytes();
225 let has_content = trimmed.iter().any(|&b| !is_attr_whitespace(b));
227 if !has_content {
228 return;
229 }
230 let offset = self.attr_str_slab.len() as u32;
231 self.attr_str_slab.extend_from_slice(trimmed);
232 let n = &mut self.nodes[node.index()];
233 n.attr_raw_offset = offset;
234 n.attr_raw_len = trimmed.len().min(u16::MAX as usize) as u16;
235 n.flags.set(NodeFlags::HAS_ATTRS);
236 }
237
238 pub fn set_attrs_from_raw(&mut self, node: NodeId, attr_raw: &str) {
244 let bytes = attr_raw.as_bytes();
245 let end = bytes.len();
246 if end == 0 {
247 return;
248 }
249
250 let slab_offset = self.attr_slab.len() as u32;
251 let mut count: u8 = 0;
252 let mut pos = 0;
253
254 loop {
255 pos += bytes[pos..end]
257 .iter()
258 .position(|&b| !is_attr_whitespace(b))
259 .unwrap_or(end - pos);
260 if pos >= end || count == 255 {
261 break;
262 }
263
264 let name_start = pos;
266 while pos < end && !is_attr_name_end(bytes[pos]) {
267 pos += 1;
268 }
269 if name_start == pos {
270 pos += 1;
272 continue;
273 }
274
275 let name_slab_offset = self.attr_str_slab.len() as u32;
276 self.attr_str_slab
277 .extend_from_slice(&bytes[name_start..pos]);
278 let name_len = (pos - name_start) as u16;
279
280 pos += bytes[pos..end]
282 .iter()
283 .position(|&b| !is_attr_whitespace(b))
284 .unwrap_or(end - pos);
285
286 if pos < end && bytes[pos] == b'=' {
288 pos += 1;
289
290 pos += bytes[pos..end]
292 .iter()
293 .position(|&b| !is_attr_whitespace(b))
294 .unwrap_or(end - pos);
295
296 if pos < end && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
298 let quote = bytes[pos];
300 pos += 1;
301 let val_start = pos;
302 if let Some(found) = memchr::memchr(quote, &bytes[pos..end]) {
303 pos += found;
304 } else {
305 pos = end;
306 }
307 let val_end = pos;
308 if pos < end {
309 pos += 1; }
311 let raw_value = &attr_raw[val_start..val_end];
312 let (value_offset, value_len) = self.push_attr_value(raw_value);
313 self.attr_slab.push(Attribute {
314 name_offset: name_slab_offset,
315 name_len,
316 value_offset,
317 value_len,
318 });
319 } else {
320 let val_start = pos;
322 while pos < end && !is_attr_whitespace(bytes[pos]) && bytes[pos] != b'>' {
323 pos += 1;
324 }
325 let raw_value = &attr_raw[val_start..pos];
326 let (value_offset, value_len) = self.push_attr_value(raw_value);
327 self.attr_slab.push(Attribute {
328 name_offset: name_slab_offset,
329 name_len,
330 value_offset,
331 value_len,
332 });
333 }
334 } else {
335 self.attr_slab.push(Attribute {
337 name_offset: name_slab_offset,
338 name_len,
339 value_offset: 0,
340 value_len: 0,
341 });
342 }
343
344 count += 1;
345 }
346
347 if count > 0 {
348 let n = &mut self.nodes[node.index()];
349 n.attr_offset = slab_offset;
350 n.attr_count = count;
351 n.flags.set(NodeFlags::HAS_ATTRS);
352
353 self.compute_node_hashes(node, slab_offset, count);
355 }
356 }
357
358 fn compute_node_hashes(&mut self, node: NodeId, slab_offset: u32, count: u8) {
363 let mut class_hash: u64 = 0;
364 let mut id_hash: u32 = 0;
365 let start = slab_offset as usize;
366 let end = start + count as usize;
367
368 for i in start..end {
369 let attr = &self.attr_slab[i];
370 let name_start = attr.name_offset as usize;
371 let name_end = name_start + attr.name_len as usize;
372 let name_bytes = &self.attr_str_slab[name_start..name_end];
373
374 if name_bytes.eq_ignore_ascii_case(b"class") && attr.value_len > 0 {
375 let val_start = attr.value_offset as usize;
376 let val_end = val_start + attr.value_len as usize;
377 let val = &self.attr_str_slab[val_start..val_end];
378 let mut pos = 0;
380 while pos < val.len() {
381 while pos < val.len() && val[pos].is_ascii_whitespace() {
383 pos += 1;
384 }
385 let token_start = pos;
386 while pos < val.len() && !val[pos].is_ascii_whitespace() {
387 pos += 1;
388 }
389 if pos > token_start {
390 class_hash |= class_bloom_bit(&val[token_start..pos]);
391 }
392 }
393 } else if name_bytes.eq_ignore_ascii_case(b"id") && attr.value_len > 0 {
394 let val_start = attr.value_offset as usize;
395 let val_end = val_start + attr.value_len as usize;
396 id_hash = selector_hash(&self.attr_str_slab[val_start..val_end]);
397 }
398 }
399
400 let n = &mut self.nodes[node.index()];
401 n.class_hash = class_hash;
402 n.id_hash = id_hash;
403 }
404
405 #[cfg(feature = "entity-decode")]
407 fn push_attr_value(&mut self, raw_value: &str) -> (u32, u16) {
408 let offset = self.attr_str_slab.len() as u32;
409 let decoded = fhp_tokenizer::entity::decode_entities(raw_value);
410 self.attr_str_slab.extend_from_slice(decoded.as_bytes());
411 (offset, decoded.len() as u16)
412 }
413
414 #[cfg(not(feature = "entity-decode"))]
416 fn push_attr_value(&mut self, raw_value: &str) -> (u32, u16) {
417 let offset = self.attr_str_slab.len() as u32;
418 self.attr_str_slab.extend_from_slice(raw_value.as_bytes());
419 (offset, raw_value.len() as u16)
420 }
421
422 #[inline]
426 pub fn set_element_index(&mut self, node: NodeId, index: u16) {
427 self.nodes[node.index()].element_index = index;
428 }
429
430 pub fn set_self_closing(&mut self, node: NodeId) {
432 self.nodes[node.index()]
433 .flags
434 .set(NodeFlags::IS_SELF_CLOSING);
435 }
436
437 pub fn append_child(&mut self, parent: NodeId, child: NodeId) {
443 let nodes = self.nodes.as_mut_ptr();
444 unsafe {
447 let p = &mut *nodes.add(parent.index());
448 let c = &mut *nodes.add(child.index());
449 let last = p.last_child;
450 c.parent = parent;
451 if last.is_null() {
452 p.first_child = child;
453 } else {
454 (*nodes.add(last.index())).next_sibling = child;
455 c.prev_sibling = last;
456 }
457 p.last_child = child;
458 p.flags.set(NodeFlags::HAS_CHILDREN);
459 }
460 }
461
462 #[inline]
464 pub fn attr_name(&self, attr: &Attribute) -> &str {
465 let start = attr.name_offset as usize;
466 let end = start + attr.name_len as usize;
467 unsafe { std::str::from_utf8_unchecked(&self.attr_str_slab[start..end]) }
469 }
470
471 #[inline]
473 pub fn attr_value(&self, attr: &Attribute) -> Option<&str> {
474 if attr.value_len == 0 {
475 return None;
476 }
477 let start = attr.value_offset as usize;
478 let end = start + attr.value_len as usize;
479 Some(unsafe { std::str::from_utf8_unchecked(&self.attr_str_slab[start..end]) })
481 }
482
483 #[inline]
488 pub fn attrs(&self, node: NodeId) -> &[Attribute] {
489 let n = &self.nodes[node.index()];
490 if n.attr_count == 0 {
491 return &[];
492 }
493 let start = n.attr_offset as usize;
494 let end = start + n.attr_count as usize;
495 &self.attr_slab[start..end]
496 }
497
498 #[inline]
500 pub fn has_lazy_attrs(&self, node: NodeId) -> bool {
501 let n = &self.nodes[node.index()];
502 n.attr_count == 0 && n.attr_raw_len > 0
503 }
504
505 pub fn ensure_attrs_parsed(&mut self, node: NodeId) {
511 let n = &self.nodes[node.index()];
512 if n.attr_count > 0 || n.attr_raw_len == 0 {
513 return;
514 }
515 let raw_offset = n.attr_raw_offset as usize;
516 let raw_len = n.attr_raw_len as usize;
517 let attr_raw = unsafe {
519 std::str::from_utf8_unchecked(&self.attr_str_slab[raw_offset..raw_offset + raw_len])
520 }
521 .to_owned();
522 self.parse_raw_attrs_into_slab(node, &attr_raw);
523 }
524
525 pub fn attrs_mut(&mut self, node: NodeId) -> &[Attribute] {
530 self.ensure_attrs_parsed(node);
531 let n = &self.nodes[node.index()];
532 if n.attr_count == 0 {
533 return &[];
534 }
535 let start = n.attr_offset as usize;
536 let end = start + n.attr_count as usize;
537 &self.attr_slab[start..end]
538 }
539
540 fn parse_raw_attrs_into_slab(&mut self, node: NodeId, attr_raw: &str) {
542 let bytes = attr_raw.as_bytes();
543 let end = bytes.len();
544 if end == 0 {
545 return;
546 }
547
548 let slab_offset = self.attr_slab.len() as u32;
549 let mut count: u8 = 0;
550 let mut pos = 0;
551
552 loop {
553 pos += bytes[pos..end]
554 .iter()
555 .position(|&b| !is_attr_whitespace(b))
556 .unwrap_or(end - pos);
557 if pos >= end || count == 255 {
558 break;
559 }
560
561 let name_start = pos;
562 while pos < end && !is_attr_name_end(bytes[pos]) {
563 pos += 1;
564 }
565 if name_start == pos {
566 pos += 1;
567 continue;
568 }
569
570 let name_slab_offset = self.attr_str_slab.len() as u32;
571 self.attr_str_slab
572 .extend_from_slice(&bytes[name_start..pos]);
573 let name_len = (pos - name_start) as u16;
574
575 pos += bytes[pos..end]
576 .iter()
577 .position(|&b| !is_attr_whitespace(b))
578 .unwrap_or(end - pos);
579
580 if pos < end && bytes[pos] == b'=' {
581 pos += 1;
582
583 pos += bytes[pos..end]
584 .iter()
585 .position(|&b| !is_attr_whitespace(b))
586 .unwrap_or(end - pos);
587
588 if pos < end && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
589 let quote = bytes[pos];
590 pos += 1;
591 let val_start = pos;
592 if let Some(found) = memchr::memchr(quote, &bytes[pos..end]) {
593 pos += found;
594 } else {
595 pos = end;
596 }
597 let val_end = pos;
598 if pos < end {
599 pos += 1;
600 }
601 let raw_value = &attr_raw[val_start..val_end];
602 let (value_offset, value_len) = self.push_attr_value(raw_value);
603 self.attr_slab.push(Attribute {
604 name_offset: name_slab_offset,
605 name_len,
606 value_offset,
607 value_len,
608 });
609 } else {
610 let val_start = pos;
611 while pos < end && !is_attr_whitespace(bytes[pos]) && bytes[pos] != b'>' {
612 pos += 1;
613 }
614 let raw_value = &attr_raw[val_start..pos];
615 let (value_offset, value_len) = self.push_attr_value(raw_value);
616 self.attr_slab.push(Attribute {
617 name_offset: name_slab_offset,
618 name_len,
619 value_offset,
620 value_len,
621 });
622 }
623 } else {
624 self.attr_slab.push(Attribute {
625 name_offset: name_slab_offset,
626 name_len,
627 value_offset: 0,
628 value_len: 0,
629 });
630 }
631
632 count += 1;
633 }
634
635 if count > 0 {
636 let n = &mut self.nodes[node.index()];
637 n.attr_offset = slab_offset;
638 n.attr_count = count;
639
640 self.compute_node_hashes(node, slab_offset, count);
642 }
643 }
644
645 #[inline]
647 pub fn text(&self, node: NodeId) -> &str {
648 let n = &self.nodes[node.index()];
649 if n.text_len == 0 {
650 return "";
651 }
652 let start = n.text_offset as usize;
653 let end = start + n.text_len as usize;
654 if n.flags.has(NodeFlags::IS_TEXT) && n.flags.has(NodeFlags::IS_TEXT_FROM_SOURCE) {
655 unsafe { std::str::from_utf8_unchecked(&self.source[start..end]) }
657 } else {
658 unsafe { std::str::from_utf8_unchecked(&self.text_slab[start..end]) }
660 }
661 }
662
663 #[inline]
665 pub fn unknown_tag_name(&self, node: NodeId) -> Option<&str> {
666 let n = &self.nodes[node.index()];
667 if n.tag != Tag::Unknown || n.text_len == 0 {
668 return None;
669 }
670 let start = n.text_offset as usize;
671 let end = start + n.text_len as usize;
672 Some(unsafe { std::str::from_utf8_unchecked(&self.text_slab[start..end]) })
674 }
675
676 #[inline]
678 pub fn get(&self, id: NodeId) -> &Node {
679 &self.nodes[id.index()]
680 }
681
682 #[inline]
684 pub fn get_mut(&mut self, id: NodeId) -> &mut Node {
685 &mut self.nodes[id.index()]
686 }
687
688 pub fn ensure_all_attrs_parsed(&mut self) {
694 for i in 0..self.nodes.len() {
695 let n = &self.nodes[i];
696 if n.attr_count == 0 && n.attr_raw_len > 0 {
697 let raw_offset = n.attr_raw_offset as usize;
698 let raw_len = n.attr_raw_len as usize;
699 let attr_raw = unsafe {
701 std::str::from_utf8_unchecked(
702 &self.attr_str_slab[raw_offset..raw_offset + raw_len],
703 )
704 }
705 .to_owned();
706 let node_id = NodeId(i as u32);
707 self.parse_raw_attrs_into_slab(node_id, &attr_raw);
708 }
709 }
710 }
711
712 #[inline]
714 pub fn len(&self) -> usize {
715 self.nodes.len()
716 }
717
718 #[inline]
720 pub fn is_empty(&self) -> bool {
721 self.nodes.is_empty()
722 }
723}
724
725impl Default for Arena {
726 fn default() -> Self {
727 Self::new()
728 }
729}
730
731#[inline(always)]
733fn is_attr_whitespace(b: u8) -> bool {
734 matches!(b, b' ' | b'\t' | b'\n' | b'\r')
735}
736
737#[inline(always)]
739fn is_attr_name_end(b: u8) -> bool {
740 matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'=' | b'/' | b'>')
741}
742
743#[cfg(test)]
744mod tests {
745 use super::*;
746 use std::borrow::Cow;
747
748 #[test]
749 fn new_element_and_get() {
750 let mut arena = Arena::new();
751 let id = arena.new_element(Tag::Div, 0);
752 assert_eq!(id, NodeId(0));
753 assert_eq!(arena.get(id).tag, Tag::Div);
754 assert_eq!(arena.get(id).depth, 0);
755 }
756
757 #[test]
758 fn new_text_and_read() {
759 let mut arena = Arena::new();
760 let id = arena.new_text(1, "hello world");
761 assert!(arena.get(id).flags.has(NodeFlags::IS_TEXT));
762 assert_eq!(arena.text(id), "hello world");
763 }
764
765 #[test]
766 fn append_child_single() {
767 let mut arena = Arena::new();
768 let parent = arena.new_element(Tag::Div, 0);
769 let child = arena.new_element(Tag::Span, 1);
770 arena.append_child(parent, child);
771
772 assert_eq!(arena.get(parent).first_child, child);
773 assert_eq!(arena.get(parent).last_child, child);
774 assert_eq!(arena.get(child).parent, parent);
775 assert!(arena.get(child).next_sibling.is_null());
776 assert!(arena.get(child).prev_sibling.is_null());
777 }
778
779 #[test]
780 fn append_child_multiple() {
781 let mut arena = Arena::new();
782 let parent = arena.new_element(Tag::Div, 0);
783 let c1 = arena.new_element(Tag::Span, 1);
784 let c2 = arena.new_element(Tag::P, 1);
785 let c3 = arena.new_element(Tag::A, 1);
786
787 arena.append_child(parent, c1);
788 arena.append_child(parent, c2);
789 arena.append_child(parent, c3);
790
791 assert_eq!(arena.get(parent).first_child, c1);
792 assert_eq!(arena.get(parent).last_child, c3);
793
794 assert_eq!(arena.get(c1).next_sibling, c2);
795 assert!(arena.get(c1).prev_sibling.is_null());
796
797 assert_eq!(arena.get(c2).prev_sibling, c1);
798 assert_eq!(arena.get(c2).next_sibling, c3);
799
800 assert_eq!(arena.get(c3).prev_sibling, c2);
801 assert!(arena.get(c3).next_sibling.is_null());
802 }
803
804 #[test]
805 fn attrs_roundtrip() {
806 use fhp_tokenizer::token::Attribute as TokAttr;
807
808 let mut arena = Arena::new();
809 let id = arena.new_element(Tag::A, 0);
810
811 let tok_attrs = vec![
812 TokAttr {
813 name: Cow::Borrowed("href"),
814 value: Some(Cow::Borrowed("https://example.com")),
815 },
816 TokAttr {
817 name: Cow::Borrowed("class"),
818 value: Some(Cow::Borrowed("link")),
819 },
820 ];
821 arena.set_attrs(id, &tok_attrs);
822
823 let attrs = arena.attrs(id);
824 assert_eq!(attrs.len(), 2);
825 assert_eq!(arena.attr_name(&attrs[0]), "href");
826 assert_eq!(arena.attr_value(&attrs[0]), Some("https://example.com"));
827 assert_eq!(arena.attr_name(&attrs[1]), "class");
828 assert_eq!(arena.attr_value(&attrs[1]), Some("link"));
829 }
830
831 #[test]
832 fn empty_attrs() {
833 let mut arena = Arena::new();
834 let id = arena.new_element(Tag::Div, 0);
835 assert!(arena.attrs(id).is_empty());
836 }
837
838 #[test]
839 fn arena_len() {
840 let mut arena = Arena::new();
841 assert!(arena.is_empty());
842 arena.new_element(Tag::Div, 0);
843 arena.new_text(1, "hi");
844 assert_eq!(arena.len(), 2);
845 }
846}