1use fhp_core::tag::Tag;
8use fhp_tokenizer::token::Token;
9
10use crate::arena::Arena;
11use crate::node::NodeId;
12
13const MAX_DEPTH: u16 = 512;
15
16const IMPLICIT_CLOSE_SIZE: usize = 8;
24static IMPLICIT_CLOSE: [[bool; IMPLICIT_CLOSE_SIZE]; IMPLICIT_CLOSE_SIZE] = {
25 let mut table = [[false; IMPLICIT_CLOSE_SIZE]; IMPLICIT_CLOSE_SIZE];
26
27 let p = 0usize;
29 let li = 1;
30 let td = 2;
31 let th = 3;
32 let tr = 4;
33 let thead = 5;
34 let tbody = 6;
35 let option = 7;
36
37 table[p][p] = true;
40
41 table[li][li] = true;
43
44 table[td][td] = true;
46 table[td][th] = true;
47
48 table[th][td] = true;
50 table[th][th] = true;
51
52 table[tr][tr] = true;
54
55 table[thead][tbody] = true;
57
58 table[tbody][tbody] = true;
60
61 table[option][option] = true;
63
64 table
65};
66
67fn implicit_close_index(tag: Tag) -> Option<usize> {
70 match tag {
71 Tag::P => Some(0),
72 Tag::Li => Some(1),
73 Tag::Td => Some(2),
74 Tag::Th => Some(3),
75 Tag::Tr => Some(4),
76 Tag::Thead => Some(5),
77 Tag::Tbody => Some(6),
78 _ => None,
80 }
81}
82
83#[inline]
85fn should_implicit_close(open_tag: Tag, new_tag: Tag) -> bool {
86 if let (Some(open_idx), Some(new_idx)) = (
87 implicit_close_index(open_tag),
88 implicit_close_index(new_tag),
89 ) {
90 IMPLICIT_CLOSE[open_idx][new_idx]
91 } else {
92 false
93 }
94}
95
96pub struct TreeBuilder {
101 pub(crate) arena: Arena,
103 open_elements: Vec<(NodeId, Tag, u16)>,
105 root: NodeId,
107 source_base: usize,
109 source_len: usize,
111}
112
113impl TreeBuilder {
114 pub fn new() -> Self {
116 Self::with_capacity_hint(0)
117 }
118
119 pub fn with_capacity_hint(input_len: usize) -> Self {
124 let node_cap = (input_len / 32).max(256);
125 let text_cap = (input_len / 16).max(4096);
127 let attr_cap = (input_len / 128).max(64);
128 let mut arena = Arena::with_capacity(node_cap, text_cap, attr_cap);
129 let root = arena.new_element(Tag::Unknown, 0);
131 let mut open_elements = Vec::with_capacity(32);
132 open_elements.push((root, Tag::Unknown, 0));
133 Self {
134 arena,
135 open_elements,
136 root,
137 source_base: 0,
138 source_len: 0,
139 }
140 }
141
142 pub fn enable_tag_index(&mut self) {
148 self.arena.enable_tag_index();
149 }
150
151 pub fn set_source(&mut self, input: &str) {
157 self.source_base = input.as_ptr() as usize;
158 self.source_len = input.len();
159 self.arena.set_source(input);
160 }
161
162 pub fn set_source_ptr(&mut self, input: &str) {
167 self.source_base = input.as_ptr() as usize;
168 self.source_len = input.len();
169 }
170
171 #[inline]
176 pub fn process(&mut self, token: &Token<'_>) -> Option<NodeId> {
177 match token {
178 Token::OpenTag {
179 tag,
180 name,
181 attributes,
182 self_closing,
183 ..
184 } => self.handle_open_tag(*tag, name.as_ref(), attributes, *self_closing),
185 Token::CloseTag { tag, name } => {
186 self.handle_close_tag(*tag, name.as_ref());
187 None
188 }
189 Token::Text { content } => self.handle_text(content.as_ref()),
190 Token::Comment { content } => self.handle_comment(content.as_ref()),
191 Token::Doctype { content } => self.handle_doctype(content.as_ref()),
192 Token::CData { content } => {
193 self.handle_text(content.as_ref())
195 }
196 }
197 }
198
199 pub fn finish(self) -> (Arena, NodeId) {
201 (self.arena, self.root)
203 }
204
205 #[inline]
207 fn current_parent(&self) -> NodeId {
208 self.open_elements
209 .last()
210 .map(|&(id, _, _)| id)
211 .unwrap_or(self.root)
212 }
213
214 #[inline]
216 fn current_depth(&self) -> u16 {
217 (self.open_elements.len() as u16).min(MAX_DEPTH)
218 }
219
220 fn handle_open_tag(
222 &mut self,
223 tag: Tag,
224 name: &str,
225 attributes: &[fhp_tokenizer::token::Attribute<'_>],
226 self_closing: bool,
227 ) -> Option<NodeId> {
228 self.apply_implicit_close(tag);
230
231 if self.current_depth() >= MAX_DEPTH {
233 return None;
234 }
235
236 let depth = self.current_depth();
237 let parent = self.current_parent();
238 let node = self.arena.new_element(tag, depth);
239 if tag == Tag::Unknown {
240 self.arena.set_unknown_tag_name(node, name);
241 }
242
243 if !attributes.is_empty() {
245 self.arena.set_attrs(node, attributes);
246 }
247
248 self.arena.append_child(parent, node);
250 if let Some(parent_entry) = self.open_elements.last_mut() {
251 parent_entry.2 += 1;
252 self.arena.set_element_index(node, parent_entry.2);
253 }
254
255 if tag.is_void() || self_closing {
257 if self_closing {
258 self.arena.set_self_closing(node);
259 }
260 if tag.is_void() {
262 self.arena.set_self_closing(node);
263 }
264 } else {
265 self.open_elements.push((node, tag, 0));
266 }
267
268 Some(node)
269 }
270
271 fn handle_close_tag(&mut self, tag: Tag, name: &str) {
273 if tag.is_void() {
275 return;
276 }
277
278 let mut match_idx = None;
281 for i in (1..self.open_elements.len()).rev() {
282 let (open_id, open_tag, _) = self.open_elements[i];
283 if open_tag == tag {
284 if tag == Tag::Unknown {
285 let open_name = self.arena.unknown_tag_name(open_id).unwrap_or("");
286 if !open_name.eq_ignore_ascii_case(name) {
287 continue;
288 }
289 }
290 match_idx = Some(i);
291 break;
292 }
293 }
294
295 if let Some(idx) = match_idx {
296 self.open_elements.truncate(idx);
298 }
299 }
301
302 fn handle_text(&mut self, content: &str) -> Option<NodeId> {
307 if content.is_empty() {
308 return None;
309 }
310 let depth = self.current_depth();
311 let parent = self.current_parent();
312 let node = self.try_source_ref(depth, content);
313 self.arena.append_child(parent, node);
314 Some(node)
315 }
316
317 fn handle_raw_text(&mut self, raw: &str) -> Option<NodeId> {
322 if raw.is_empty() {
323 return None;
324 }
325 let depth = self.current_depth();
326 let parent = self.current_parent();
327
328 #[cfg(feature = "entity-decode")]
329 let node = {
330 let decoded = fhp_tokenizer::entity::decode_entities(raw);
331 match decoded {
332 std::borrow::Cow::Borrowed(s) => self.try_source_ref(depth, s),
333 std::borrow::Cow::Owned(s) => self.arena.new_text(depth, &s),
334 }
335 };
336
337 #[cfg(not(feature = "entity-decode"))]
338 let node = self.try_source_ref(depth, raw);
339
340 self.arena.append_child(parent, node);
341 Some(node)
342 }
343
344 #[inline]
347 fn try_source_ref(&mut self, depth: u16, content: &str) -> NodeId {
348 if self.source_len > 0 {
349 let ptr = content.as_ptr() as usize;
350 if ptr >= self.source_base && ptr + content.len() <= self.source_base + self.source_len
351 {
352 let offset = ptr - self.source_base;
353 return self
354 .arena
355 .new_text_ref(depth, offset as u32, content.len() as u32);
356 }
357 }
358 self.arena.new_text(depth, content)
359 }
360
361 fn handle_comment(&mut self, content: &str) -> Option<NodeId> {
363 let depth = self.current_depth();
364 let parent = self.current_parent();
365 let node = self.arena.new_comment(depth, content);
366 self.arena.append_child(parent, node);
367 Some(node)
368 }
369
370 fn handle_doctype(&mut self, content: &str) -> Option<NodeId> {
372 let depth = self.current_depth();
373 let parent = self.current_parent();
374 let node = self.arena.new_doctype(depth, content);
375 self.arena.append_child(parent, node);
376 Some(node)
377 }
378
379 fn apply_implicit_close(&mut self, new_tag: Tag) {
381 while self.open_elements.len() > 1 {
384 let (_, current_tag, _) = *self.open_elements.last().unwrap();
385
386 if should_implicit_close(current_tag, new_tag) {
387 self.open_elements.pop();
388 } else {
389 break;
390 }
391 }
392 }
393}
394
395impl fhp_tokenizer::TreeSink for TreeBuilder {
396 fn open_tag(&mut self, tag: Tag, name: &str, attr_raw: &str, self_closing: bool) {
397 self.apply_implicit_close(tag);
398
399 if self.current_depth() >= MAX_DEPTH {
400 return;
401 }
402
403 let depth = self.current_depth();
404 let parent = self.current_parent();
405 let node = self.arena.new_element(tag, depth);
406 if tag == Tag::Unknown {
407 self.arena.set_unknown_tag_name(node, name);
408 }
409
410 self.arena.set_attrs_from_raw(node, attr_raw);
412
413 self.arena.append_child(parent, node);
414 if let Some(parent_entry) = self.open_elements.last_mut() {
415 parent_entry.2 += 1;
416 self.arena.set_element_index(node, parent_entry.2);
417 }
418
419 if tag.is_void() || self_closing {
420 if self_closing {
421 self.arena.set_self_closing(node);
422 }
423 if tag.is_void() {
424 self.arena.set_self_closing(node);
425 }
426 } else {
427 self.open_elements.push((node, tag, 0));
428 }
429 }
430
431 fn close_tag(&mut self, tag: Tag, name: &str) {
432 self.handle_close_tag(tag, name);
433 }
434
435 fn text(&mut self, raw: &str) {
436 self.handle_raw_text(raw);
437 }
438
439 fn comment(&mut self, content: &str) {
440 self.handle_comment(content);
441 }
442
443 fn doctype(&mut self, content: &str) {
444 self.handle_doctype(content);
445 }
446
447 fn cdata(&mut self, content: &str) {
448 self.handle_raw_text(content);
450 }
451}
452
453impl Default for TreeBuilder {
454 fn default() -> Self {
455 Self::new()
456 }
457}
458
459#[cfg(test)]
460mod tests {
461 use super::*;
462 use crate::node::NodeFlags;
463 use std::borrow::Cow;
464
465 fn make_open(tag: Tag) -> Token<'static> {
466 Token::OpenTag {
467 tag,
468 name: Cow::Borrowed(tag.as_str().unwrap_or("unknown")),
469 attributes: vec![],
470 self_closing: false,
471 }
472 }
473
474 fn make_close(tag: Tag) -> Token<'static> {
475 Token::CloseTag {
476 tag,
477 name: Cow::Borrowed(tag.as_str().unwrap_or("unknown")),
478 }
479 }
480
481 fn make_text(content: &'static str) -> Token<'static> {
482 Token::Text {
483 content: Cow::Borrowed(content),
484 }
485 }
486
487 #[test]
488 fn simple_tree() {
489 let mut builder = TreeBuilder::new();
490 builder.process(&make_open(Tag::Div));
491 builder.process(&make_text("hello"));
492 builder.process(&make_close(Tag::Div));
493
494 let (arena, root) = builder.finish();
495
496 let div = arena.get(root).first_child;
498 assert!(!div.is_null());
499 assert_eq!(arena.get(div).tag, Tag::Div);
500
501 let text = arena.get(div).first_child;
503 assert!(!text.is_null());
504 assert!(arena.get(text).flags.has(NodeFlags::IS_TEXT));
505 assert_eq!(arena.text(text), "hello");
506 }
507
508 #[test]
509 fn void_element_not_pushed() {
510 let mut builder = TreeBuilder::new();
511 builder.process(&make_open(Tag::Div));
512 builder.process(&Token::OpenTag {
513 tag: Tag::Br,
514 name: Cow::Borrowed("br"),
515 attributes: vec![],
516 self_closing: false,
517 });
518 builder.process(&make_text("after br"));
519 builder.process(&make_close(Tag::Div));
520
521 let (arena, root) = builder.finish();
522 let div = arena.get(root).first_child;
523 let br = arena.get(div).first_child;
524 assert_eq!(arena.get(br).tag, Tag::Br);
525
526 let text = arena.get(br).next_sibling;
528 assert!(!text.is_null());
529 assert_eq!(arena.text(text), "after br");
530 }
531
532 #[test]
533 fn implicit_close_p() {
534 let mut builder = TreeBuilder::new();
535 builder.process(&make_open(Tag::P));
536 builder.process(&make_text("first"));
537 builder.process(&make_open(Tag::P));
538 builder.process(&make_text("second"));
539 builder.process(&make_close(Tag::P));
540
541 let (arena, root) = builder.finish();
542
543 let p1 = arena.get(root).first_child;
545 assert_eq!(arena.get(p1).tag, Tag::P);
546
547 let p2 = arena.get(p1).next_sibling;
548 assert!(!p2.is_null());
549 assert_eq!(arena.get(p2).tag, Tag::P);
550
551 assert_eq!(arena.text(arena.get(p1).first_child), "first");
553 assert_eq!(arena.text(arena.get(p2).first_child), "second");
554 }
555
556 #[test]
557 fn mismatched_close_finds_nearest() {
558 let mut builder = TreeBuilder::new();
560 builder.process(&make_open(Tag::Div));
561 builder.process(&make_open(Tag::Span));
562 builder.process(&make_text("hi"));
563 builder.process(&make_close(Tag::Div));
564
565 let (arena, root) = builder.finish();
566 let div = arena.get(root).first_child;
567 assert_eq!(arena.get(div).tag, Tag::Div);
568 }
569
570 #[test]
571 fn extra_close_tag_ignored() {
572 let mut builder = TreeBuilder::new();
573 builder.process(&make_close(Tag::Div)); builder.process(&make_open(Tag::P));
575 builder.process(&make_text("ok"));
576 builder.process(&make_close(Tag::P));
577
578 let (arena, root) = builder.finish();
579 let p = arena.get(root).first_child;
580 assert_eq!(arena.get(p).tag, Tag::P);
581 }
582
583 #[test]
584 fn unknown_close_matches_by_name() {
585 let mut builder = TreeBuilder::new();
586 builder.process(&Token::OpenTag {
587 tag: Tag::Unknown,
588 name: Cow::Borrowed("my-widget"),
589 attributes: vec![],
590 self_closing: false,
591 });
592 builder.process(&Token::OpenTag {
593 tag: Tag::Unknown,
594 name: Cow::Borrowed("x-item"),
595 attributes: vec![],
596 self_closing: false,
597 });
598 builder.process(&Token::CloseTag {
599 tag: Tag::Unknown,
600 name: Cow::Borrowed("my-widget"),
601 });
602
603 let (arena, root) = builder.finish();
604 let my_widget = arena.get(root).first_child;
605 let x_item = arena.get(my_widget).first_child;
606 assert_eq!(arena.unknown_tag_name(my_widget), Some("my-widget"));
607 assert_eq!(arena.unknown_tag_name(x_item), Some("x-item"));
608 }
609
610 #[test]
611 fn should_implicit_close_rules() {
612 assert!(should_implicit_close(Tag::P, Tag::P));
613 assert!(should_implicit_close(Tag::Li, Tag::Li));
614 assert!(should_implicit_close(Tag::Td, Tag::Td));
615 assert!(should_implicit_close(Tag::Td, Tag::Th));
616 assert!(should_implicit_close(Tag::Th, Tag::Td));
617 assert!(should_implicit_close(Tag::Tr, Tag::Tr));
618
619 assert!(!should_implicit_close(Tag::Div, Tag::Div));
620 assert!(!should_implicit_close(Tag::Span, Tag::Span));
621 assert!(!should_implicit_close(Tag::P, Tag::Span));
622 }
623}