1#![allow(deprecated)]
21
22pub mod grammar;
23
24use std::{
25 collections::HashMap,
26 fmt::{self, Write},
27};
28
29use enum_extract_macro::EnumExtract;
30use indextree::{Arena, NodeId};
31use std::sync::LazyLock;
32use regex::{Captures, Regex};
33
34pub use crate::html::grammar::parse;
35pub use crate::html::grammar::parse_fragment;
36pub use crate::html::grammar::QuirksMode;
37
/// Tag names that `display_node` serializes as an opening tag only: for these
/// names no children are visited and no closing tag is written.
static VOID_TAGS: &[&str] = &[
    "meta", "link", "img", "input", "br", "hr", "col", "area", "base", "embed", "keygen",
    "param", "source", "track", "wbr",
];

/// Attribute storage of an [`HtmlTag`]: attribute name mapped to attribute value.
type TagAttributes = HashMap<String, String>;
45
46#[derive(Debug, PartialEq, Clone)]
48pub struct HtmlTag {
49 pub name: String,
51
52 pub attributes: TagAttributes,
55}
56
57impl HtmlTag {
58 pub fn new(name: String) -> HtmlTag {
60 HtmlTag {
61 name,
62 attributes: HashMap::new(),
63 }
64 }
65
66 pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
69 self.internal_get_text(doc_node, document, false)
70 }
71
72 pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
75 self.internal_get_text(doc_node, document, true)
76 }
77
78 fn internal_get_text(
79 &self,
80 doc_node: &DocumentNode,
81 document: &HtmlDocument,
82 recurse: bool,
83 ) -> Option<String> {
84 let mut o_text: Option<String> = None;
85 let mut stack: Vec<DocumentNode> = doc_node.children(document).collect();
86 stack.reverse(); while let Some(child) = stack.pop() {
89 let child_node = document.get_html_node(&child);
90 if let Some(child_node) = child_node {
91 match child_node {
92 HtmlNode::Text(text) => {
93 o_text = Some(HtmlTag::append_text(o_text, text.value.to_string()));
94 }
95 HtmlNode::Tag(_) => {
96 if recurse {
97 let grandchildren: Vec<DocumentNode> = child.children(document).collect();
99 for gc in grandchildren.into_iter().rev() {
100 stack.push(gc);
101 }
102 }
103 }
104 HtmlNode::Comment(_)
105 | HtmlNode::ProcessingInstruction(_)
106 | HtmlNode::Doctype(_) => {}
107 }
108 }
109 }
110
111 o_text
112 }
113
114 fn append_text(o_text: Option<String>, append_text: String) -> String {
115 match o_text {
116 Some(t) => {
117 if t.ends_with(|ch: char| ch.is_whitespace())
119 || append_text.starts_with(|ch: char| ch.is_whitespace())
120 {
121 format!("{}{}", t, append_text)
122 } else {
123 format!("{} {}", t, append_text)
124 }
125 }
126 None => append_text,
127 }
128 }
129}
130
131#[derive(PartialEq, Clone, Debug)]
133pub struct HtmlText {
134 pub value: String,
140 pub only_whitespace: bool,
142}
143
144impl HtmlText {
145 pub fn new(value: &str) -> HtmlText {
147 let text = unescape_characters(value);
148 let only_whitespace = text.trim().is_empty();
149 HtmlText {
150 value: text,
151 only_whitespace,
152 }
153 }
154}
155
/// A comment node of an HTML document.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlComment {
    /// Content of the comment.
    pub value: String,
}

impl HtmlComment {
    /// Wraps `value` in a comment node; the string is stored as given.
    pub fn new(value: String) -> HtmlComment {
        Self { value }
    }
}
169
/// A processing-instruction node, serialized by this module as
/// `<?target data?>`.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlProcessingInstruction {
    /// Name that follows `<?`.
    pub target: String,
    /// Remaining content of the instruction; may be empty.
    pub data: String,
}

impl HtmlProcessingInstruction {
    /// Creates a processing instruction from its target and data, both stored
    /// as given.
    pub fn new(target: String, data: String) -> HtmlProcessingInstruction {
        Self { target, data }
    }
}
185
/// A `<!DOCTYPE ...>` node.
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlDoctype {
    /// Name written directly after `<!DOCTYPE`.
    pub name: String,
    /// Public identifier, if present.
    pub public_id: Option<String>,
    /// System identifier, if present.
    pub system_id: Option<String>,
}

impl HtmlDoctype {
    /// Creates a doctype node from its name and optional identifiers, all
    /// stored as given.
    pub fn new(name: String, public_id: Option<String>, system_id: Option<String>) -> HtmlDoctype {
        Self {
            name,
            public_id,
            system_id,
        }
    }
}
207
208pub fn unescape_characters(text: &str) -> String {
216 static NUMERIC_CHAR_REF_RE: LazyLock<Regex> =
217 LazyLock::new(|| Regex::new(r"&#(?:x([0-9a-fA-F]+)|(\d+));").unwrap());
218
219 let text = NUMERIC_CHAR_REF_RE
223 .replace_all(text, |caps: &Captures| {
224 if let Some(hex) = caps.get(1) {
226 if let Ok(num) = u32::from_str_radix(hex.as_str(), 16) {
227 return char::from_u32(num).unwrap_or('\u{FFFD}').to_string();
228 }
229 } else if let Some(dec) = caps.get(2) {
230 if let Ok(num) = dec.as_str().parse::<u32>() {
231 return char::from_u32(num).unwrap_or('\u{FFFD}').to_string();
232 }
233 }
234 "\u{FFFD}".to_string()
235 })
236 .into_owned();
237
238 text.replace("<", "<")
241 .replace(">", ">")
242 .replace(""", r#"""#)
243 .replace("&", "&")
244}
245
/// Escapes the characters that are significant in HTML markup.
///
/// - `&` becomes `&amp;`
/// - `<` becomes `&lt;`
/// - `>` becomes `&gt;`
/// - `"` becomes `&quot;`
/// - `'` becomes `&#39;`
///
/// The apostrophe uses a numeric reference so that `unescape_characters`,
/// which decodes numeric references, can turn it back into `'`.
///
/// The input is scanned once, so an ampersand produced by one replacement is
/// never escaped a second time.
pub fn escape_characters(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
260
/// Normalizes the whitespace of `text`.
///
/// Leading and trailing whitespace is removed and every internal run of
/// whitespace characters is replaced by a single ASCII space.
pub fn trim_internal_whitespace(text: &str) -> String {
    // `split_whitespace` yields the maximal runs of non-whitespace characters,
    // which are exactly the pieces that have to be kept.
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}
286
287#[derive(Clone, Debug, EnumExtract)]
290pub enum HtmlNode {
291 Tag(HtmlTag),
293 Text(HtmlText),
308 Comment(HtmlComment),
310 ProcessingInstruction(HtmlProcessingInstruction),
312 Doctype(HtmlDoctype),
314}
315
316impl HtmlNode {
317 pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
320 self.internal_get_text(doc_node, document, false)
321 }
322
323 pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
326 self.internal_get_text(doc_node, document, true)
327 }
328
329 fn internal_get_text(
332 &self,
333 doc_node: &DocumentNode,
334 document: &HtmlDocument,
335 recurse: bool,
336 ) -> Option<String> {
337 match self {
338 HtmlNode::Tag(tag) => {
339 if recurse {
340 tag.get_all_text(doc_node, document)
341 } else {
342 tag.get_text(doc_node, document)
343 }
344 }
345 HtmlNode::Text(text) => Some(text.value.to_string()),
346 HtmlNode::Comment(_) | HtmlNode::ProcessingInstruction(_) | HtmlNode::Doctype(_) => {
348 None
349 }
350 }
351 }
352
353 pub fn get_attributes(&self) -> Option<&TagAttributes> {
356 match self {
357 HtmlNode::Tag(tag) => Some(&tag.attributes),
358 _ => None,
359 }
360 }
361}
362
363#[deprecated(
367 since = "0.8.0",
368 note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
369)]
370#[derive(Clone)]
371pub struct HtmlDocument {
372 pub(crate) arena: Arena<HtmlNode>,
373 pub root_node: DocumentNode,
375}
376
377impl HtmlDocument {
378 pub fn new(arena: Arena<HtmlNode>, root_node: DocumentNode) -> HtmlDocument {
380 HtmlDocument { arena, root_node }
381 }
382
383 pub fn get_html_node(&self, node: &DocumentNode) -> Option<&HtmlNode> {
385 self.arena.get(node.id).map(|x| x.get())
386 }
387
388 pub fn to_formatted_string(&self, format_type: DocumentFormatType) -> String {
393 display_node(0, self, &self.root_node, format_type).expect("failed to display node")
394 }
395
396 pub fn iter(&self) -> impl Iterator<Item = DocumentNode> + '_ {
398 self.arena.iter().map(|node| {
399 let id = self.arena.get_node_id(node).unwrap();
400 DocumentNode::new(id)
401 })
402 }
403}
404
405impl fmt::Display for HtmlDocument {
406 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
407 let text = display_node(0, self, &self.root_node, DocumentFormatType::Standard)?;
408 write!(f, "{}", text)
409 }
410}
411
/// Selects the layout used when a document is serialized by `display_node`
/// (for example through `HtmlDocument::to_formatted_string`).
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
pub enum DocumentFormatType {
    /// Every node is written back to back; text nodes are written in full.
    Standard,
    /// Like `Standard`, but text nodes consisting only of whitespace are
    /// omitted.
    IgnoreWhitespace,
    /// Every tag, comment, processing instruction, doctype and non-blank text
    /// node goes on its own line, indented according to its depth. Text is
    /// trimmed and whitespace-only text nodes are omitted.
    Indented,
}
422
423fn display_node(
424 start_indent: usize,
425 doc: &HtmlDocument,
426 start_node: &DocumentNode,
427 format_type: DocumentFormatType,
428) -> Result<String, fmt::Error> {
429 fn display_indent(indent: usize, str: &mut String) -> fmt::Result {
430 for _ in 0..indent {
431 write!(str, " ")?;
432 }
433 Ok(())
434 }
435
436 enum Phase {
437 Enter(DocumentNode, usize),
438 Exit(String, usize), }
440
441 let mut result = String::new();
442 let mut stack: Vec<Phase> = vec![Phase::Enter(*start_node, start_indent)];
443
444 while let Some(phase) = stack.pop() {
445 match phase {
446 Phase::Enter(doc_node, indent) => {
447 let html_node = doc.get_html_node(&doc_node).ok_or(fmt::Error)?;
448
449 match html_node {
450 HtmlNode::Tag(tag) => {
451 if matches!(format_type, DocumentFormatType::Indented) {
452 display_indent(indent, &mut result)?;
453 }
454 write!(&mut result, "<{}", tag.name)?;
455 let mut sorted_attrs: Vec<_> = tag.attributes.iter().collect();
456 sorted_attrs.sort_by(|a, b| a.0.cmp(b.0));
457 for attribute in sorted_attrs {
458 write!(&mut result, r#" {}="{}""#, attribute.0, attribute.1)?;
459 }
460 write!(&mut result, ">")?;
461 if matches!(format_type, DocumentFormatType::Indented) {
462 writeln!(&mut result)?;
463 }
464
465 if !VOID_TAGS.contains(&tag.name.as_str()) {
466 stack.push(Phase::Exit(tag.name.clone(), indent));
468
469 let children: Vec<DocumentNode> = doc_node.children(doc).collect();
471 for child in children.into_iter().rev() {
472 stack.push(Phase::Enter(child, indent + 1));
473 }
474 }
475 }
476 HtmlNode::Text(text) => {
477 let output_text = escape_characters(text.value.as_str());
478 match format_type {
479 DocumentFormatType::Standard => {
480 write!(&mut result, "{}", output_text)?;
481 }
482 DocumentFormatType::IgnoreWhitespace => {
483 if !text.only_whitespace {
484 write!(&mut result, "{}", output_text)?;
485 }
486 }
487 DocumentFormatType::Indented => {
488 if !text.only_whitespace {
489 display_indent(indent, &mut result)?;
490 writeln!(&mut result, "{}", output_text.trim())?;
491 }
492 }
493 }
494 }
495 HtmlNode::Comment(comment) => {
496 if matches!(format_type, DocumentFormatType::Indented) {
497 display_indent(indent, &mut result)?;
498 }
499 let sanitized = comment.value.replace("--", "- -");
500 write!(&mut result, "<!--{}-->", sanitized)?;
501 if matches!(format_type, DocumentFormatType::Indented) {
502 writeln!(&mut result)?;
503 }
504 }
505 HtmlNode::ProcessingInstruction(pi) => {
506 if matches!(format_type, DocumentFormatType::Indented) {
507 display_indent(indent, &mut result)?;
508 }
509 if pi.data.is_empty() {
510 write!(&mut result, "<?{}?>", pi.target)?;
511 } else {
512 write!(&mut result, "<?{} {}?>", pi.target, pi.data)?;
513 }
514 if matches!(format_type, DocumentFormatType::Indented) {
515 writeln!(&mut result)?;
516 }
517 }
518 HtmlNode::Doctype(doctype) => {
519 if matches!(format_type, DocumentFormatType::Indented) {
520 display_indent(indent, &mut result)?;
521 }
522 write!(&mut result, "<!DOCTYPE {}", doctype.name)?;
523 if let Some(ref public_id) = doctype.public_id {
524 write!(&mut result, r#" PUBLIC "{}""#, public_id)?;
525 if let Some(ref system_id) = doctype.system_id {
526 write!(&mut result, r#" "{}""#, system_id)?;
527 }
528 } else if let Some(ref system_id) = doctype.system_id {
529 write!(&mut result, r#" SYSTEM "{}""#, system_id)?;
530 }
531 write!(&mut result, ">")?;
532 if matches!(format_type, DocumentFormatType::Indented) {
533 writeln!(&mut result)?;
534 }
535 }
536 }
537 }
538 Phase::Exit(tag_name, indent) => {
539 if matches!(format_type, DocumentFormatType::Indented) {
540 display_indent(indent, &mut result)?;
541 }
542 write!(&mut result, "</{}>", tag_name)?;
543 if matches!(format_type, DocumentFormatType::Indented) {
544 writeln!(&mut result)?;
545 }
546 }
547 }
548 }
549
550 Ok(result)
551}
552
553#[deprecated(
609 since = "0.8.0",
610 note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
611)]
612#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
613pub struct DocumentNode {
614 id: NodeId,
615}
616
617impl DocumentNode {
618 pub fn new(id: NodeId) -> DocumentNode {
620 DocumentNode { id }
621 }
622
623 pub fn get_all_text(&self, document: &HtmlDocument) -> Option<String> {
650 match document.get_html_node(self) {
651 Some(html_node) => html_node.get_all_text(self, document),
652 None => None,
653 }
654 }
655
656 pub fn get_text(&self, document: &HtmlDocument) -> Option<String> {
683 match document.get_html_node(self) {
684 Some(html_node) => html_node.get_text(self, document),
685 None => None,
686 }
687 }
688
689 pub fn get_attributes<'a>(&'a self, document: &'a HtmlDocument) -> Option<&'a TagAttributes> {
714 match document.get_html_node(self) {
715 Some(html_node) => html_node.get_attributes(),
716 None => None,
717 }
718 }
719
720 pub fn children<'a>(
722 &self,
723 document: &'a HtmlDocument,
724 ) -> impl Iterator<Item = DocumentNode> + 'a {
725 Box::new(self.id.children(&document.arena).map(DocumentNode::new))
726 }
727
728 pub fn parent(&self, document: &HtmlDocument) -> Option<DocumentNode> {
730 self.id
731 .ancestors(&document.arena)
732 .nth(1)
733 .map(DocumentNode::new)
734 }
735}
736
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Adds a text node holding `value` to `arena` and returns its id.
    fn new_text(arena: &mut Arena<HtmlNode>, value: &str) -> NodeId {
        arena.new_node(HtmlNode::Text(HtmlText::new(value)))
    }

    /// Adds an attribute-less tag called `name` to `arena` and returns its id.
    fn new_tag(arena: &mut Arena<HtmlNode>, name: &str) -> NodeId {
        arena.new_node(HtmlNode::Tag(HtmlTag::new(String::from(name))))
    }

    /// Builds `<tag>hello<tag2>world</tag2></tag>` and returns the document
    /// together with the handle of the outer tag.
    fn nested_document() -> (HtmlDocument, DocumentNode) {
        let mut arena = Arena::new();

        let outer_id = new_tag(&mut arena, "tag");
        let hello_id = new_text(&mut arena, "hello");
        outer_id.append(hello_id, &mut arena);

        let inner_id = new_tag(&mut arena, "tag2");
        let world_id = new_text(&mut arena, "world");
        inner_id.append(world_id, &mut arena);
        outer_id.append(inner_id, &mut arena);

        let outer = DocumentNode::new(outer_id);
        (HtmlDocument::new(arena, outer), outer)
    }

    /// A `div` tag carrying the single attribute `attr_name="attr_value"`.
    fn div_with_attribute() -> HtmlNode {
        HtmlNode::Tag(HtmlTag {
            name: "div".to_string(),
            attributes: HashMap::from([("attr_name".to_string(), "attr_value".to_string())]),
        })
    }

    #[test]
    fn html_node_get_text_should_work_on_text_node() {
        let mut arena = Arena::new();
        let text_doc_node = DocumentNode::new(new_text(&mut arena, "hello world"));
        let document = HtmlDocument::new(arena, text_doc_node);

        let text_node = document.get_html_node(&text_doc_node).unwrap();
        let result = text_node.get_text(&text_doc_node, &document).unwrap();

        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_one_text_child() {
        let mut arena = Arena::new();
        let text_id = new_text(&mut arena, "hello world");
        let tag_id = new_tag(&mut arena, "tag");
        tag_id.append(text_id, &mut arena);
        let tag_doc_node = DocumentNode::new(tag_id);
        let document = HtmlDocument::new(arena, tag_doc_node);

        let tag_node = document.get_html_node(&tag_doc_node).unwrap();
        let result = tag_node.get_text(&tag_doc_node, &document).unwrap();

        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_two_text_children() {
        let mut arena = Arena::new();
        let first_id = new_text(&mut arena, "hello");
        let second_id = new_text(&mut arena, "world");
        let tag_id = new_tag(&mut arena, "tag");
        tag_id.append(first_id, &mut arena);
        tag_id.append(second_id, &mut arena);
        let tag_doc_node = DocumentNode::new(tag_id);
        let document = HtmlDocument::new(arena, tag_doc_node);

        let tag_node = document.get_html_node(&tag_doc_node).unwrap();
        let result = tag_node.get_text(&tag_doc_node, &document).unwrap();

        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_ignore_nested_text() {
        let (document, tag_doc_node) = nested_document();

        let tag_node = document.get_html_node(&tag_doc_node).unwrap();
        let result = tag_node.get_text(&tag_doc_node, &document).unwrap();

        assert_eq!("hello", result);
    }

    #[test]
    fn html_node_get_all_text_should_include_nested_text() {
        let (document, tag_doc_node) = nested_document();

        let tag_node = document.get_html_node(&tag_doc_node).unwrap();
        let result = tag_node.get_all_text(&tag_doc_node, &document).unwrap();

        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_attributes_for_tag() {
        let node = div_with_attribute();

        assert!(node.get_attributes().is_some());
        assert_eq!(node.get_attributes().unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn html_node_get_attributes_for_text() {
        let node = HtmlNode::Text(HtmlText::new("hello world"));

        assert!(node.get_attributes().is_none())
    }

    #[test]
    fn document_node_get_attributes_for_tag() {
        let mut arena = Arena::new();
        let doc_node = DocumentNode::new(arena.new_node(div_with_attribute()));
        let html_document = HtmlDocument::new(arena, doc_node);

        // Go through the handle so `DocumentNode::get_attributes` itself is
        // exercised, not only `HtmlNode::get_attributes`.
        let attributes = doc_node.get_attributes(&html_document);

        assert!(attributes.is_some());
        assert_eq!(attributes.unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn document_node_get_attributes_for_text() {
        let mut arena = Arena::new();
        let doc_node = DocumentNode::new(new_text(&mut arena, "hello world"));
        let html_document = HtmlDocument::new(arena, doc_node);

        // Go through the handle so `DocumentNode::get_attributes` itself is
        // exercised, not only `HtmlNode::get_attributes`.
        let attributes = doc_node.get_attributes(&html_document);

        assert!(attributes.is_none());
    }
}