1pub mod arena;
19#[cfg(feature = "async-tokio")]
21pub mod async_parser;
22pub mod builder;
24pub mod node;
26pub mod streaming;
28pub mod traverse;
30
31use fhp_core::tag::Tag;
32
33use arena::{Arena, Attribute};
34use builder::TreeBuilder;
35use node::{NodeFlags, NodeId};
36use traverse::{Ancestors, BreadthFirst, Children, DepthFirst, Siblings};
37
38#[derive(Debug, thiserror::Error)]
40pub enum HtmlError {
41 #[error("input too large: {size} bytes (max {max})")]
43 InputTooLarge {
44 size: usize,
46 max: usize,
48 },
49
50 #[error("encoding error: {0}")]
52 Encoding(#[from] fhp_core::error::EncodingError),
53
54 #[error("I/O error: {0}")]
56 Io(#[from] std::io::Error),
57}
58
/// Largest input length, in bytes, accepted by the parse functions (256 MiB).
pub(crate) const MAX_INPUT_SIZE: usize = 256 << 20;
61
62pub fn parse(input: &str) -> Result<Document, HtmlError> {
80 if input.len() > MAX_INPUT_SIZE {
81 return Err(HtmlError::InputTooLarge {
82 size: input.len(),
83 max: MAX_INPUT_SIZE,
84 });
85 }
86
87 let mut builder = TreeBuilder::with_capacity_hint(input.len());
88 builder.set_source(input);
89 fhp_tokenizer::tokenize_into(input, &mut builder);
90 let (arena, root) = builder.finish();
91
92 Ok(Document { arena, root })
93}
94
95pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
116 if input.len() > MAX_INPUT_SIZE {
117 return Err(HtmlError::InputTooLarge {
118 size: input.len(),
119 max: MAX_INPUT_SIZE,
120 });
121 }
122
123 let mut builder = TreeBuilder::with_capacity_hint(input.len());
124 builder.set_source_ptr(&input);
126 fhp_tokenizer::tokenize_into(&input, &mut builder);
127 let (mut arena, root) = builder.finish();
128 arena.set_source_owned(input);
130
131 Ok(Document { arena, root })
132}
133
134pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
157 if input.len() > MAX_INPUT_SIZE {
158 return Err(HtmlError::InputTooLarge {
159 size: input.len(),
160 max: MAX_INPUT_SIZE,
161 });
162 }
163
164 let (text, _encoding) = fhp_encoding::decode_or_detect(input)?;
165 parse(&text)
166}
167
168pub struct Document {
173 arena: Arena,
174 root: NodeId,
175}
176
177impl Document {
178 pub fn root(&self) -> NodeRef<'_> {
180 NodeRef {
181 arena: &self.arena,
182 id: self.root,
183 }
184 }
185
186 pub fn get(&self, id: NodeId) -> NodeRef<'_> {
192 NodeRef {
193 arena: &self.arena,
194 id,
195 }
196 }
197
198 pub fn arena(&self) -> &Arena {
200 &self.arena
201 }
202
203 pub fn to_html(&self) -> String {
218 self.root().outer_html()
219 }
220
221 pub fn node_count(&self) -> usize {
223 self.arena.len()
224 }
225
226 pub fn root_id(&self) -> NodeId {
228 self.root
229 }
230}
231
232#[derive(Clone, Copy)]
237pub struct NodeRef<'a> {
238 arena: &'a Arena,
239 id: NodeId,
240}
241
242impl<'a> NodeRef<'a> {
243 pub fn id(&self) -> NodeId {
245 self.id
246 }
247
248 pub fn tag(&self) -> Tag {
250 self.arena.get(self.id).tag
251 }
252
253 pub fn depth(&self) -> u16 {
255 self.arena.get(self.id).depth
256 }
257
258 pub fn is_text(&self) -> bool {
260 self.arena.get(self.id).flags.has(NodeFlags::IS_TEXT)
261 }
262
263 pub fn is_comment(&self) -> bool {
265 self.arena.get(self.id).flags.has(NodeFlags::IS_COMMENT)
266 }
267
268 pub fn is_doctype(&self) -> bool {
270 self.arena.get(self.id).flags.has(NodeFlags::IS_DOCTYPE)
271 }
272
273 pub fn is_void(&self) -> bool {
275 self.arena.get(self.id).flags.has(NodeFlags::IS_VOID)
276 }
277
278 pub fn has_children(&self) -> bool {
280 !self.arena.get(self.id).first_child.is_null()
281 }
282
283 pub fn text(&self) -> &'a str {
287 let node = self.arena.get(self.id);
288 if node.flags.has(NodeFlags::IS_TEXT)
289 || node.flags.has(NodeFlags::IS_COMMENT)
290 || node.flags.has(NodeFlags::IS_DOCTYPE)
291 {
292 self.arena.text(self.id)
293 } else {
294 ""
295 }
296 }
297
298 pub fn text_content(&self) -> String {
301 let node = self.arena.get(self.id);
302 if node.flags.has(NodeFlags::IS_TEXT) {
304 return self.arena.text(self.id).to_string();
305 }
306 let hint = (self.arena.text_slab.len() / 4).min(4096);
308 let mut result = String::with_capacity(hint);
309 self.collect_text(&mut result);
310 result
311 }
312
313 fn collect_text(&self, out: &mut String) {
315 let node = self.arena.get(self.id);
316 if node.flags.has(NodeFlags::IS_TEXT) {
317 out.push_str(self.arena.text(self.id));
318 return;
319 }
320 let mut child = node.first_child;
321 while !child.is_null() {
322 NodeRef {
323 arena: self.arena,
324 id: child,
325 }
326 .collect_text(out);
327 child = self.arena.get(child).next_sibling;
328 }
329 }
330
331 pub fn inner_html(&self) -> String {
333 let mut result = String::new();
334 let node = self.arena.get(self.id);
335 let mut child = node.first_child;
336 while !child.is_null() {
337 NodeRef {
338 arena: self.arena,
339 id: child,
340 }
341 .write_outer_html(&mut result);
342 child = self.arena.get(child).next_sibling;
343 }
344 result
345 }
346
347 pub fn outer_html(&self) -> String {
349 let mut result = String::new();
350 self.write_outer_html(&mut result);
351 result
352 }
353
354 fn write_outer_html(&self, out: &mut String) {
356 let node = self.arena.get(self.id);
357
358 if node.flags.has(NodeFlags::IS_TEXT) {
359 let text = self.arena.text(self.id);
360 let parent_id = node.parent;
362 let is_raw_text = !parent_id.is_null() && self.arena.get(parent_id).tag.is_raw_text();
363 if is_raw_text {
364 out.push_str(text);
365 } else {
366 fhp_core::entity::escape_text(text, out);
367 }
368 return;
369 }
370
371 if node.flags.has(NodeFlags::IS_COMMENT) {
372 out.push_str("<!--");
373 out.push_str(self.arena.text(self.id));
374 out.push_str("-->");
375 return;
376 }
377
378 if node.flags.has(NodeFlags::IS_DOCTYPE) {
379 out.push_str("<!DOCTYPE ");
380 out.push_str(self.arena.text(self.id));
381 out.push('>');
382 return;
383 }
384
385 let tag_name = node
386 .tag
387 .as_str()
388 .or_else(|| self.arena.unknown_tag_name(self.id));
389 let is_root_wrapper = node.depth == 0 && node.parent.is_null();
391
392 if !is_root_wrapper {
393 if let Some(name) = tag_name {
394 out.push('<');
395 out.push_str(name);
396
397 let attrs = self.arena.attrs(self.id);
399 for attr in attrs {
400 out.push(' ');
401 out.push_str(self.arena.attr_name(attr));
402 if let Some(val) = self.arena.attr_value(attr) {
403 out.push_str("=\"");
404 fhp_core::entity::escape_attr(val, out);
405 out.push('"');
406 }
407 }
408
409 if node.flags.has(NodeFlags::IS_VOID) {
410 out.push('>');
411 return;
412 }
413 out.push('>');
414 }
415 }
416
417 let mut child = node.first_child;
419 while !child.is_null() {
420 NodeRef {
421 arena: self.arena,
422 id: child,
423 }
424 .write_outer_html(out);
425 child = self.arena.get(child).next_sibling;
426 }
427
428 if !is_root_wrapper {
430 if let Some(name) = tag_name {
431 out.push_str("</");
432 out.push_str(name);
433 out.push('>');
434 }
435 }
436 }
437
438 pub fn attr(&self, name: &str) -> Option<&'a str> {
440 self.arena
441 .attrs(self.id)
442 .iter()
443 .find(|a| self.arena.attr_name(a).eq_ignore_ascii_case(name))
444 .and_then(|a| self.arena.attr_value(a))
445 }
446
447 pub fn has_class(&self, class_name: &str) -> bool {
452 if let Some(classes) = self.attr("class") {
453 classes.split_whitespace().any(|c| c == class_name)
454 } else {
455 false
456 }
457 }
458
459 pub fn attrs(&self) -> &'a [Attribute] {
461 self.arena.attrs(self.id)
462 }
463
464 pub fn children(&self) -> Children<'a> {
466 Children::new(self.arena, self.id)
467 }
468
469 pub fn parent(&self) -> Option<NodeRef<'a>> {
471 let parent = self.arena.get(self.id).parent;
472 if parent.is_null() {
473 None
474 } else {
475 Some(NodeRef {
476 arena: self.arena,
477 id: parent,
478 })
479 }
480 }
481
482 pub fn first_child(&self) -> Option<NodeRef<'a>> {
484 let fc = self.arena.get(self.id).first_child;
485 if fc.is_null() {
486 None
487 } else {
488 Some(NodeRef {
489 arena: self.arena,
490 id: fc,
491 })
492 }
493 }
494
495 pub fn next_sibling(&self) -> Option<NodeRef<'a>> {
497 let ns = self.arena.get(self.id).next_sibling;
498 if ns.is_null() {
499 None
500 } else {
501 Some(NodeRef {
502 arena: self.arena,
503 id: ns,
504 })
505 }
506 }
507
508 pub fn prev_sibling(&self) -> Option<NodeRef<'a>> {
510 let ps = self.arena.get(self.id).prev_sibling;
511 if ps.is_null() {
512 None
513 } else {
514 Some(NodeRef {
515 arena: self.arena,
516 id: ps,
517 })
518 }
519 }
520
521 pub fn ancestors(&self) -> Ancestors<'a> {
523 Ancestors::new(self.arena, self.id)
524 }
525
526 pub fn siblings(&self) -> Siblings<'a> {
528 Siblings::new(self.arena, self.id)
529 }
530
531 pub fn descendants(&self) -> DepthFirst<'a> {
533 DepthFirst::new(self.arena, self.id)
534 }
535
536 pub fn descendants_bfs(&self) -> BreadthFirst<'a> {
538 BreadthFirst::new(self.arena, self.id)
539 }
540}
541
542impl<'a> core::fmt::Debug for NodeRef<'a> {
543 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
544 let node = self.arena.get(self.id);
545 if node.flags.has(NodeFlags::IS_TEXT) {
546 write!(f, "Text({:?})", self.text())
547 } else if node.flags.has(NodeFlags::IS_COMMENT) {
548 write!(f, "Comment({:?})", self.text())
549 } else {
550 write!(f, "<{}>", node.tag)
551 }
552 }
553}
554
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Basic parsing and node accessors ----

    #[test]
    fn parse_simple() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        assert!(doc.node_count() > 0);
        let root = doc.root();
        assert!(root.has_children());
    }

    #[test]
    fn parse_text_content() {
        let doc = parse("<div><span>Hello</span> <span>World</span></div>").unwrap();
        let root = doc.root();
        let text = root.text_content();
        assert!(text.contains("Hello"), "text: {text}");
        assert!(text.contains("World"), "text: {text}");
    }

    #[test]
    fn parse_attr() {
        let doc =
            parse("<a href=\"https://example.com\" class=\"link primary\">text</a>").unwrap();
        let root = doc.root();
        let a = root.first_child().expect("should have child");

        assert_eq!(a.tag(), Tag::A);
        assert_eq!(a.attr("href"), Some("https://example.com"));
        assert!(a.has_class("link"));
        assert!(a.has_class("primary"));
        assert!(!a.has_class("secondary"));
    }

    #[test]
    fn parse_attr_case_insensitive_name_lookup() {
        let doc =
            parse("<a HREF=\"https://example.com\" CLASS=\"link primary\">text</a>").unwrap();
        let root = doc.root();
        let a = root.first_child().expect("should have child");

        assert_eq!(a.attr("href"), Some("https://example.com"));
        assert_eq!(a.attr("HREF"), Some("https://example.com"));
        assert!(a.has_class("link"));
        assert!(a.has_class("primary"));
    }

    #[test]
    fn parse_inner_html() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        assert_eq!(div.tag(), Tag::Div);
        let inner = div.inner_html();
        assert!(inner.contains("<p>"), "inner: {inner}");
        assert!(inner.contains("Hello"), "inner: {inner}");
        assert!(inner.contains("</p>"), "inner: {inner}");
    }

    #[test]
    fn parse_outer_html() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        let outer = div.outer_html();
        assert!(outer.starts_with("<div>"), "outer: {outer}");
        assert!(outer.ends_with("</div>"), "outer: {outer}");
    }

    #[test]
    fn parse_void_elements() {
        let doc = parse("<div><br><hr></div>").unwrap();
        let root = doc.root();
        let div = root.first_child().unwrap();
        let children: Vec<_> = div.children().collect();
        assert_eq!(children.len(), 2);

        let br_ref = doc.get(children[0]);
        assert_eq!(br_ref.tag(), Tag::Br);
        assert!(br_ref.is_void());

        let hr_ref = doc.get(children[1]);
        assert_eq!(hr_ref.tag(), Tag::Hr);
        assert!(hr_ref.is_void());
    }

    // ---- Traversal ----

    #[test]
    fn parse_depth_first() {
        let doc = parse("<div><span>a</span><p>b</p></div>").unwrap();
        let root = doc.root();
        let tags: Vec<_> = root
            .descendants()
            .map(|id| doc.get(id))
            .filter(|n| !n.is_text())
            .map(|n| n.tag())
            .collect();
        assert!(tags.contains(&Tag::Div));
        assert!(tags.contains(&Tag::Span));
        assert!(tags.contains(&Tag::P));
    }

    #[test]
    fn parse_ancestors() {
        let doc = parse("<div><span><a>link</a></span></div>").unwrap();
        let root = doc.root();

        let div = root.first_child().unwrap();
        let span = div.first_child().unwrap();
        let a = span.first_child().unwrap();

        let ancestor_tags: Vec<_> = a.ancestors().map(|id| doc.get(id).tag()).collect();
        // The last ancestor is the root wrapper, whose tag is `Tag::Unknown`.
        assert_eq!(ancestor_tags, vec![Tag::Span, Tag::Div, Tag::Unknown]);
    }

    #[test]
    fn parse_siblings() {
        let doc = parse("<ul><li>1</li><li>2</li><li>3</li></ul>").unwrap();
        let root = doc.root();
        let ul = root.first_child().unwrap();
        let li1 = ul.first_child().unwrap();

        let sibling_count = li1.siblings().count();
        assert_eq!(sibling_count, 2);
    }

    // ---- Edge cases and malformed input ----

    #[test]
    fn empty_input() {
        let doc = parse("").unwrap();
        assert!(!doc.root().has_children());
    }

    #[test]
    fn text_only() {
        let doc = parse("just text").unwrap();
        assert_eq!(doc.root().text_content(), "just text");
    }

    #[test]
    fn broken_html_unclosed() {
        let doc = parse("<div><p>unclosed").unwrap();
        let root = doc.root();
        assert!(root.has_children());
        assert_eq!(root.text_content(), "unclosed");
    }

    #[test]
    fn broken_html_extra_close() {
        let doc = parse("</div><p>ok</p>").unwrap();
        let root = doc.root();
        assert_eq!(root.text_content(), "ok");
    }

    #[test]
    fn implicit_close_p_p() {
        let doc = parse("<p>first<p>second").unwrap();
        let root = doc.root();
        let children: Vec<_> = root.children().collect();
        let p_count = children
            .iter()
            .filter(|&c| doc.get(*c).tag() == Tag::P)
            .count();
        assert_eq!(p_count, 2, "both <p> should be root children");
    }

    #[test]
    fn node_64_bytes_alignment() {
        assert_eq!(std::mem::size_of::<node::Node>(), 64);
        assert_eq!(std::mem::align_of::<node::Node>(), 64);
    }

    #[test]
    fn input_too_large() {
        // Building an input above MAX_INPUT_SIZE is too expensive for a unit
        // test, so check the accepted path and the error's message instead.
        let result = parse("");
        assert!(result.is_ok());

        let err = HtmlError::InputTooLarge {
            size: MAX_INPUT_SIZE + 1,
            max: MAX_INPUT_SIZE,
        };
        let msg = err.to_string();
        assert!(msg.starts_with("input too large: "), "msg: {msg}");
        assert!(msg.contains(&MAX_INPUT_SIZE.to_string()), "msg: {msg}");
    }

    #[test]
    fn comment_and_doctype() {
        let doc = parse("<!DOCTYPE html><!-- comment --><div>ok</div>").unwrap();
        let root = doc.root();
        let mut has_comment = false;
        let mut has_doctype = false;
        for child_id in root.children() {
            let child = doc.get(child_id);
            if child.is_comment() {
                has_comment = true;
            }
            if child.is_doctype() {
                has_doctype = true;
            }
        }
        assert!(has_doctype, "should have doctype");
        assert!(has_comment, "should have comment");
    }

    #[test]
    fn void_outer_html() {
        let doc = parse("<br>").unwrap();
        let root = doc.root();
        let br = root.first_child().unwrap();
        let html = br.outer_html();
        assert_eq!(html, "<br>", "outer: {html}");
    }

    #[test]
    fn unknown_tag_outer_html_preserved() {
        let doc = parse("<my-widget><x-item>ok</x-item></my-widget>").unwrap();
        let root = doc.root();
        let outer = root.inner_html();
        assert_eq!(outer, "<my-widget><x-item>ok</x-item></my-widget>");
    }

    // ---- parse_bytes: encoding handling ----

    #[test]
    fn parse_bytes_utf8() {
        let doc = parse_bytes(b"<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_bytes_utf8_bom() {
        let html = b"\xEF\xBB\xBF<div><p>BOM test</p></div>";
        let doc = parse_bytes(html).unwrap();
        assert!(doc.root().text_content().contains("BOM test"));
    }

    #[test]
    fn parse_bytes_windows_1254_meta() {
        // 0xFC is `ü` in windows-1254.
        let html = b"<meta charset=\"windows-1254\"><p>Merhaba d\xFCnya</p>";
        let doc = parse_bytes(html).unwrap();
        let text = doc.root().text_content();
        assert!(text.contains("dünya"), "text: {text}");
    }

    #[test]
    fn parse_bytes_utf16le_bom() {
        // UTF-16LE byte-order mark followed by each ASCII byte widened to two
        // bytes (low byte first).
        let mut bytes = vec![0xFF, 0xFE];
        for &ch in b"<p>UTF16</p>" {
            bytes.push(ch);
            bytes.push(0x00);
        }
        let doc = parse_bytes(&bytes).unwrap();
        let text = doc.root().text_content();
        assert!(text.contains("UTF16"), "text: {text}");
    }

    #[test]
    fn parse_bytes_empty() {
        let doc = parse_bytes(b"").unwrap();
        assert!(!doc.root().has_children());
    }

    // ---- Serialization and escaping ----

    #[test]
    fn text_escaping_in_inner_html() {
        // Entities in the input are decoded in the tree and must be escaped
        // again when the text is serialized.
        let doc = parse("<p>1 &lt; 2 &amp; 3 &gt; 0</p>").unwrap();
        let p = doc.root().first_child().unwrap();
        assert_eq!(p.text_content(), "1 < 2 & 3 > 0");
        let inner = p.inner_html();
        assert!(inner.contains("&lt;"), "`<` should be escaped: {inner}");
        assert!(inner.contains("&amp;"), "`&` should be escaped: {inner}");
        assert!(!inner.contains('<'), "raw `<` must not survive: {inner}");
    }

    #[test]
    fn attr_escaping_in_outer_html() {
        let doc = parse("<a href=\"x&amp;y\">link</a>").unwrap();
        let a = doc.root().first_child().unwrap();
        let outer = a.outer_html();
        assert!(
            outer.contains("x&amp;y"),
            "attribute value should be escaped: {outer}"
        );
    }

    #[test]
    fn script_raw_text_not_escaped() {
        let doc = parse("<script>if (a < b && c > d) {}</script>").unwrap();
        let script = doc.root().first_child().unwrap();
        let inner = script.inner_html();
        assert_eq!(inner, "if (a < b && c > d) {}");
    }

    #[test]
    fn style_raw_text_not_escaped() {
        let doc = parse("<style>a > b { color: red; }</style>").unwrap();
        let style = doc.root().first_child().unwrap();
        let inner = style.inner_html();
        assert_eq!(inner, "a > b { color: red; }");
    }

    #[test]
    fn void_elements_no_closing_slash() {
        let doc = parse("<div><br><img src=\"x.png\"><hr></div>").unwrap();
        let div = doc.root().first_child().unwrap();
        let inner = div.inner_html();
        assert!(inner.contains("<br>"), "br: {inner}");
        assert!(inner.contains("<img "), "img: {inner}");
        assert!(inner.contains("<hr>"), "hr: {inner}");
        assert!(!inner.contains("/>"), "should not contain />: {inner}");
    }

    #[test]
    fn comment_not_escaped() {
        let doc = parse("<!-- <b>not bold</b> & stuff -->").unwrap();
        let html = doc.to_html();
        assert!(
            html.contains("<!-- <b>not bold</b> & stuff -->"),
            "comment should be verbatim: {html}"
        );
    }

    #[test]
    fn doctype_not_escaped() {
        let doc = parse("<!DOCTYPE html><p>ok</p>").unwrap();
        let html = doc.to_html();
        assert!(
            html.contains("<!DOCTYPE html>"),
            "doctype should be verbatim: {html}"
        );
    }

    #[test]
    fn document_to_html() {
        let doc = parse("<!DOCTYPE html><html><body><p>Hello</p></body></html>").unwrap();
        let html = doc.to_html();
        assert!(html.contains("<!DOCTYPE html>"), "html: {html}");
        assert!(html.contains("<p>Hello</p>"), "html: {html}");
    }

    // ---- Round trips ----

    #[test]
    fn round_trip_structure() {
        let input = "<div><p>Hello</p><span>World</span></div>";
        let doc1 = parse(input).unwrap();
        let html = doc1.to_html();
        let doc2 = parse(&html).unwrap();
        assert_eq!(doc1.root().text_content(), doc2.root().text_content());
        assert_eq!(doc1.node_count(), doc2.node_count());
    }

    #[test]
    fn round_trip_with_special_chars() {
        // Well-formed input: the special characters are written as entities,
        // so serializing and re-parsing must give back the same decoded text.
        let input = "<p>1 &lt; 2 &amp; 3 &gt; 0</p>";
        let doc1 = parse(input).unwrap();
        assert_eq!(doc1.root().text_content(), "1 < 2 & 3 > 0");

        let html = doc1.to_html();
        let doc2 = parse(&html).unwrap();
        assert_eq!(doc2.root().text_content(), "1 < 2 & 3 > 0");
    }

    #[test]
    fn unknown_tag_preserved_in_to_html() {
        let doc = parse("<my-widget>content</my-widget>").unwrap();
        let html = doc.to_html();
        assert!(html.contains("<my-widget>"), "html: {html}");
        assert!(html.contains("</my-widget>"), "html: {html}");
    }
}