skyscraper/html/
mod.rs

1//! Parse HTML documents into [HtmlDocuments](HtmlDocument).
2//!
3//! # Example: parse HTML text into a document
4//! ```rust
5//! use skyscraper::html::{self, grammar::HtmlParseError};
6//! # fn main() -> Result<(), HtmlParseError> {
7//! let html_text = r##"
8//! <html>
9//!     <body>
10//!         <div>Hello world</div>
11//!     </body>
12//! </html>"##;
13//!
14//! let tree = html::parse(html_text)?;
15//! # Ok(())
16//! # }
17//! ```
18
19// HtmlDocument and DocumentNode are deprecated but still used internally.
20#![allow(deprecated)]
21
22pub mod grammar;
23
24use std::{
25    collections::HashMap,
26    fmt::{self, Write},
27};
28
29use enum_extract_macro::EnumExtract;
30use indextree::{Arena, NodeId};
31use std::sync::LazyLock;
32use regex::{Captures, Regex};
33
34pub use crate::html::grammar::parse;
35pub use crate::html::grammar::parse_fragment;
36pub use crate::html::grammar::QuirksMode;
37
/// List of HTML tags that do not have end tags and cannot have any content.
///
/// `display_node` consults this list when serializing: a tag whose name appears
/// here is written without its children and without a closing tag.
static VOID_TAGS: &[&str] = &[
    "meta", "link", "img", "input", "br", "hr", "col", "area", "base", "embed", "keygen",
    "param", "source", "track", "wbr",
];
43
/// Map from attribute name to attribute value for a single tag.
type TagAttributes = HashMap<String, String>;
45
46/// An HTML tag and its attributes.
47#[derive(Debug, PartialEq, Clone)]
48pub struct HtmlTag {
49    /// Name of the tag.
50    pub name: String,
51
52    /// Map of the tag's attributes and their corresponding values.
53    /// Example: Attributes of `<div class="hello" id="world"></div>`
54    pub attributes: TagAttributes,
55}
56
57impl HtmlTag {
58    /// Creates a new tag with the given name and no attributes.
59    pub fn new(name: String) -> HtmlTag {
60        HtmlTag {
61            name,
62            attributes: HashMap::new(),
63        }
64    }
65
66    /// Gets any direct HtmlNode::Text children and concatenates them into a single string
67    /// separated by a space character if no whitespace already separates them.
68    pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
69        self.internal_get_text(doc_node, document, false)
70    }
71
72    /// Gets all HtmlNode::Text children and concatenates them into a single string separated
73    /// by a space character if no whitespace already separates them.
74    pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
75        self.internal_get_text(doc_node, document, true)
76    }
77
78    fn internal_get_text(
79        &self,
80        doc_node: &DocumentNode,
81        document: &HtmlDocument,
82        recurse: bool,
83    ) -> Option<String> {
84        let mut o_text: Option<String> = None;
85        let mut stack: Vec<DocumentNode> = doc_node.children(document).collect();
86        stack.reverse(); // process in order by popping from end
87
88        while let Some(child) = stack.pop() {
89            let child_node = document.get_html_node(&child);
90            if let Some(child_node) = child_node {
91                match child_node {
92                    HtmlNode::Text(text) => {
93                        o_text = Some(HtmlTag::append_text(o_text, text.value.to_string()));
94                    }
95                    HtmlNode::Tag(_) => {
96                        if recurse {
97                            // Push children onto the stack in reverse order
98                            let grandchildren: Vec<DocumentNode> = child.children(document).collect();
99                            for gc in grandchildren.into_iter().rev() {
100                                stack.push(gc);
101                            }
102                        }
103                    }
104                    HtmlNode::Comment(_)
105                    | HtmlNode::ProcessingInstruction(_)
106                    | HtmlNode::Doctype(_) => {}
107                }
108            }
109        }
110
111        o_text
112    }
113
114    fn append_text(o_text: Option<String>, append_text: String) -> String {
115        match o_text {
116            Some(t) => {
117                // If whitespace is already separating them, do not add another.
118                if t.ends_with(|ch: char| ch.is_whitespace())
119                    || append_text.starts_with(|ch: char| ch.is_whitespace())
120                {
121                    format!("{}{}", t, append_text)
122                } else {
123                    format!("{} {}", t, append_text)
124                }
125            }
126            None => append_text,
127        }
128    }
129}
130
131/// Text content in an HTML document.
132#[derive(PartialEq, Clone, Debug)]
133pub struct HtmlText {
134    /// The text content.
135    ///
136    /// If the text has non-whitespace characters, it is trimmed.
137    /// Otherwise, if the text is solely whitespace, it is kept as-is.
138    /// This emulates the behaviour of Chromium browsers.
139    pub value: String,
140    /// Whether the text content is solely whitespace.
141    pub only_whitespace: bool,
142}
143
144impl HtmlText {
145    /// Creates a new [HtmlText] from the given string.
146    pub fn new(value: &str) -> HtmlText {
147        let text = unescape_characters(value);
148        let only_whitespace = text.trim().is_empty();
149        HtmlText {
150            value: text,
151            only_whitespace,
152        }
153    }
154}
155
/// An HTML comment node (`<!-- ... -->`).
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlComment {
    /// The content of the comment (without the `<!--` and `-->` delimiters).
    pub value: String,
}

impl HtmlComment {
    /// Creates a new [HtmlComment] holding `value` as its content.
    pub fn new(value: String) -> HtmlComment {
        Self { value }
    }
}
169
/// An HTML processing instruction (`<?target data?>`).
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlProcessingInstruction {
    /// The target of the processing instruction.
    pub target: String,
    /// The data of the processing instruction.
    pub data: String,
}

impl HtmlProcessingInstruction {
    /// Creates a new [HtmlProcessingInstruction] from its `target` and `data` parts.
    pub fn new(target: String, data: String) -> HtmlProcessingInstruction {
        Self { target, data }
    }
}
185
/// An HTML document type declaration (`<!DOCTYPE ...>`).
#[derive(PartialEq, Clone, Debug)]
pub struct HtmlDoctype {
    /// The name of the document type (e.g. "html").
    pub name: String,
    /// The public identifier, if present.
    pub public_id: Option<String>,
    /// The system identifier, if present.
    pub system_id: Option<String>,
}

impl HtmlDoctype {
    /// Creates a new [HtmlDoctype] from its name and the optional public and
    /// system identifiers.
    pub fn new(name: String, public_id: Option<String>, system_id: Option<String>) -> HtmlDoctype {
        Self { name, public_id, system_id }
    }
}
207
208/// Unescapes commonly escaped characters in HTML text.
209///
210/// - `&amp;` becomes `&`
211/// - `&lt;` becomes `<`
212/// - `&gt;` becomes `>`
213/// - `&quot;` becomes `"`
214/// - `&#39;` becomes `'`
215pub fn unescape_characters(text: &str) -> String {
216    static NUMERIC_CHAR_REF_RE: LazyLock<Regex> =
217        LazyLock::new(|| Regex::new(r"&#(?:x([0-9a-fA-F]+)|(\d+));").unwrap());
218
219    // First: resolve numeric character references on the raw input,
220    // before any named entity replacement, to avoid double-unescaping
221    // (e.g. "&amp;#60;" should become "&#60;", not "<").
222    let text = NUMERIC_CHAR_REF_RE
223        .replace_all(text, |caps: &Captures| {
224            // Group 1: hex digits (&#xHH;), Group 2: decimal digits (&#DD;)
225            if let Some(hex) = caps.get(1) {
226                if let Ok(num) = u32::from_str_radix(hex.as_str(), 16) {
227                    return char::from_u32(num).unwrap_or('\u{FFFD}').to_string();
228                }
229            } else if let Some(dec) = caps.get(2) {
230                if let Ok(num) = dec.as_str().parse::<u32>() {
231                    return char::from_u32(num).unwrap_or('\u{FFFD}').to_string();
232                }
233            }
234            "\u{FFFD}".to_string()
235        })
236        .into_owned();
237
238    // Then: named entities. &amp; must be replaced last to avoid
239    // double-unescaping (e.g. "&amp;lt;" → "&lt;", not "<").
240    text.replace("&lt;", "<")
241        .replace("&gt;", ">")
242        .replace("&quot;", r#"""#)
243        .replace("&amp;", "&")
244}
245
/// Escapes commonly escaped characters in HTML text.
///
/// - `&` becomes `&amp;`
/// - `<` becomes `&lt;`
/// - `>` becomes `&gt;`
/// - `"` becomes `&quot;`
/// - `'` becomes `&#39;`
///
/// Every other character is copied through unchanged.
pub fn escape_characters(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
260
/// Collapses every run of whitespace in `text` into a single space and removes
/// leading and trailing whitespace, so that only a single space separates words.
/// This is used to emulate the behaviour of Chromium browsers.
///
/// # Example
/// ```rust
/// use skyscraper::html::trim_internal_whitespace;
/// let text = "  hello  \n world  ";
/// let result = trim_internal_whitespace(text);
/// assert_eq!("hello world", result);
/// ```
pub fn trim_internal_whitespace(text: &str) -> String {
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}
286
/// An HTML node can be either a tag, raw text, a comment, a processing instruction, or a
/// document type declaration.
///
/// Nodes are stored in the arena of an [HtmlDocument] and addressed through
/// [DocumentNode] keys.
// NOTE(review): `EnumExtract` is a third-party derive; presumably it generates the
// per-variant extraction helpers — confirm against the enum_extract_macro crate.
#[derive(Clone, Debug, EnumExtract)]
pub enum HtmlNode {
    /// An HTML tag.
    Tag(HtmlTag),
    /// Text content contained within [HtmlNode::Tag].
    ///
    /// Kept as separate enum value rather than a field on [HtmlTag] so
    /// that order can be maintained in nodes containing a mix of text
    /// and tags.
    ///
    /// # Example: order of mixed text and tag contents is preserved
    /// ```html
    /// <div>
    ///     Hello <span style="bold">world</span>!
    /// </div>
    /// ```
    /// Where the inner contents of `div` would be: `Text("Hello ")`, `Tag(span)`, `Text("!")`.
    ///
    Text(HtmlText),
    /// A comment node (`<!-- ... -->`).
    Comment(HtmlComment),
    /// A processing instruction node (`<?target data?>`).
    ProcessingInstruction(HtmlProcessingInstruction),
    /// A document type declaration (`<!DOCTYPE ...>`).
    Doctype(HtmlDoctype),
}
315
316impl HtmlNode {
317    /// Gets any direct HtmlNode::Text children and concatenates them into a single string
318    /// separated by a space character if no whitespace already separates them.
319    pub fn get_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
320        self.internal_get_text(doc_node, document, false)
321    }
322
323    /// Gets all HtmlNode::Text children and concatenates them into a single string separated
324    /// by a space character if no whitespace already separates them.
325    pub fn get_all_text(&self, doc_node: &DocumentNode, document: &HtmlDocument) -> Option<String> {
326        self.internal_get_text(doc_node, document, true)
327    }
328
329    /// Gets any direct HtmlNode::Text children and concatenates them into a single string
330    /// separated by a space character if no whitespace already separates them.
331    fn internal_get_text(
332        &self,
333        doc_node: &DocumentNode,
334        document: &HtmlDocument,
335        recurse: bool,
336    ) -> Option<String> {
337        match self {
338            HtmlNode::Tag(tag) => {
339                if recurse {
340                    tag.get_all_text(doc_node, document)
341                } else {
342                    tag.get_text(doc_node, document)
343                }
344            }
345            HtmlNode::Text(text) => Some(text.value.to_string()),
346            // Comments, PIs, and doctypes do not contribute visible text.
347            HtmlNode::Comment(_) | HtmlNode::ProcessingInstruction(_) | HtmlNode::Doctype(_) => {
348                None
349            }
350        }
351    }
352
353    /// Gets attributes.
354    /// If Node is not a `Tag` return None
355    pub fn get_attributes(&self) -> Option<&TagAttributes> {
356        match self {
357            HtmlNode::Tag(tag) => Some(&tag.attributes),
358            _ => None,
359        }
360    }
361}
362
/// HTML document tree represented by an indextree arena and a root node.
///
/// Documents must have a single root node to be valid.
#[deprecated(
    since = "0.8.0",
    note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
)]
#[derive(Clone)]
pub struct HtmlDocument {
    /// Arena that owns every [HtmlNode] of the document; [DocumentNode] keys are
    /// resolved against it.
    pub(crate) arena: Arena<HtmlNode>,
    /// The root node of the document.
    pub root_node: DocumentNode,
}
376
377impl HtmlDocument {
378    /// Create a new [HtmlDocument] with the given `arena` contents and `root_node`.
379    pub fn new(arena: Arena<HtmlNode>, root_node: DocumentNode) -> HtmlDocument {
380        HtmlDocument { arena, root_node }
381    }
382
383    /// Get the [HtmlNode] associated with the given [DocumentNode].
384    pub fn get_html_node(&self, node: &DocumentNode) -> Option<&HtmlNode> {
385        self.arena.get(node.id).map(|x| x.get())
386    }
387
388    /// Get a flattened string representation of this document.
389    ///
390    /// This ignores all text nodes that are solely whitespace.
391    /// It does not trim whitespace on nodes that contain both whitespace and non-whitespace.
392    pub fn to_formatted_string(&self, format_type: DocumentFormatType) -> String {
393        display_node(0, self, &self.root_node, format_type).expect("failed to display node")
394    }
395
396    /// Get an iterator over all nodes in this document.
397    pub fn iter(&self) -> impl Iterator<Item = DocumentNode> + '_ {
398        self.arena.iter().map(|node| {
399            let id = self.arena.get_node_id(node).unwrap();
400            DocumentNode::new(id)
401        })
402    }
403}
404
405impl fmt::Display for HtmlDocument {
406    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
407        let text = display_node(0, self, &self.root_node, DocumentFormatType::Standard)?;
408        write!(f, "{}", text)
409    }
410}
411
/// Describes the formatting when converting an [HtmlDocument] to a string.
///
/// Passed to [HtmlDocument::to_formatted_string]; the `Display` implementation of
/// [HtmlDocument] always uses [DocumentFormatType::Standard].
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
pub enum DocumentFormatType {
    /// Output the text as-is, without any formatting.
    Standard,
    /// Ignore all text nodes that are solely whitespace.
    IgnoreWhitespace,
    /// Indent all nodes regardless of existing whitespace.
    ///
    /// Text nodes that are solely whitespace are skipped, and the remaining text is
    /// trimmed before being written on its own line.
    Indented,
}
422
423fn display_node(
424    start_indent: usize,
425    doc: &HtmlDocument,
426    start_node: &DocumentNode,
427    format_type: DocumentFormatType,
428) -> Result<String, fmt::Error> {
429    fn display_indent(indent: usize, str: &mut String) -> fmt::Result {
430        for _ in 0..indent {
431            write!(str, "    ")?;
432        }
433        Ok(())
434    }
435
436    enum Phase {
437        Enter(DocumentNode, usize),
438        Exit(String, usize), // tag_name, indent
439    }
440
441    let mut result = String::new();
442    let mut stack: Vec<Phase> = vec![Phase::Enter(*start_node, start_indent)];
443
444    while let Some(phase) = stack.pop() {
445        match phase {
446            Phase::Enter(doc_node, indent) => {
447                let html_node = doc.get_html_node(&doc_node).ok_or(fmt::Error)?;
448
449                match html_node {
450                    HtmlNode::Tag(tag) => {
451                        if matches!(format_type, DocumentFormatType::Indented) {
452                            display_indent(indent, &mut result)?;
453                        }
454                        write!(&mut result, "<{}", tag.name)?;
455                        let mut sorted_attrs: Vec<_> = tag.attributes.iter().collect();
456                        sorted_attrs.sort_by(|a, b| a.0.cmp(b.0));
457                        for attribute in sorted_attrs {
458                            write!(&mut result, r#" {}="{}""#, attribute.0, attribute.1)?;
459                        }
460                        write!(&mut result, ">")?;
461                        if matches!(format_type, DocumentFormatType::Indented) {
462                            writeln!(&mut result)?;
463                        }
464
465                        if !VOID_TAGS.contains(&tag.name.as_str()) {
466                            // Push the exit phase first (will be processed after children).
467                            stack.push(Phase::Exit(tag.name.clone(), indent));
468
469                            // Push children in reverse order so they are processed in order.
470                            let children: Vec<DocumentNode> = doc_node.children(doc).collect();
471                            for child in children.into_iter().rev() {
472                                stack.push(Phase::Enter(child, indent + 1));
473                            }
474                        }
475                    }
476                    HtmlNode::Text(text) => {
477                        let output_text = escape_characters(text.value.as_str());
478                        match format_type {
479                            DocumentFormatType::Standard => {
480                                write!(&mut result, "{}", output_text)?;
481                            }
482                            DocumentFormatType::IgnoreWhitespace => {
483                                if !text.only_whitespace {
484                                    write!(&mut result, "{}", output_text)?;
485                                }
486                            }
487                            DocumentFormatType::Indented => {
488                                if !text.only_whitespace {
489                                    display_indent(indent, &mut result)?;
490                                    writeln!(&mut result, "{}", output_text.trim())?;
491                                }
492                            }
493                        }
494                    }
495                    HtmlNode::Comment(comment) => {
496                        if matches!(format_type, DocumentFormatType::Indented) {
497                            display_indent(indent, &mut result)?;
498                        }
499                        let sanitized = comment.value.replace("--", "- -");
500                        write!(&mut result, "<!--{}-->", sanitized)?;
501                        if matches!(format_type, DocumentFormatType::Indented) {
502                            writeln!(&mut result)?;
503                        }
504                    }
505                    HtmlNode::ProcessingInstruction(pi) => {
506                        if matches!(format_type, DocumentFormatType::Indented) {
507                            display_indent(indent, &mut result)?;
508                        }
509                        if pi.data.is_empty() {
510                            write!(&mut result, "<?{}?>", pi.target)?;
511                        } else {
512                            write!(&mut result, "<?{} {}?>", pi.target, pi.data)?;
513                        }
514                        if matches!(format_type, DocumentFormatType::Indented) {
515                            writeln!(&mut result)?;
516                        }
517                    }
518                    HtmlNode::Doctype(doctype) => {
519                        if matches!(format_type, DocumentFormatType::Indented) {
520                            display_indent(indent, &mut result)?;
521                        }
522                        write!(&mut result, "<!DOCTYPE {}", doctype.name)?;
523                        if let Some(ref public_id) = doctype.public_id {
524                            write!(&mut result, r#" PUBLIC "{}""#, public_id)?;
525                            if let Some(ref system_id) = doctype.system_id {
526                                write!(&mut result, r#" "{}""#, system_id)?;
527                            }
528                        } else if let Some(ref system_id) = doctype.system_id {
529                            write!(&mut result, r#" SYSTEM "{}""#, system_id)?;
530                        }
531                        write!(&mut result, ">")?;
532                        if matches!(format_type, DocumentFormatType::Indented) {
533                            writeln!(&mut result)?;
534                        }
535                    }
536                }
537            }
538            Phase::Exit(tag_name, indent) => {
539                if matches!(format_type, DocumentFormatType::Indented) {
540                    display_indent(indent, &mut result)?;
541                }
542                write!(&mut result, "</{}>", tag_name)?;
543                if matches!(format_type, DocumentFormatType::Indented) {
544                    writeln!(&mut result)?;
545                }
546            }
547        }
548    }
549
550    Ok(result)
551}
552
553/// A key representing a single [HtmlNode] contained in a [HtmlDocument].
554///
555/// Contains tree information such as parents and children.
556///
557/// Implements [Copy] so that it can be easily passed around, unlike its associated [HtmlNode].
558///
559/// # Example: get the root element
560///
561/// ```rust
562/// # use std::error::Error;
563/// # fn main() -> Result<(), Box<dyn Error>> {
564/// use skyscraper::html;
565/// use skyscraper::xpath;
566///
567/// // Parse the HTML text into a tree
568/// let text = r#"<div></div>"#;
569/// let tree = html::parse(text)?;
570///
571/// // Use XPath to find elements
572/// let xpath = xpath::parse("//div")?;
573/// let items = xpath.apply(&tree)?;
574/// assert_eq!(items.len(), 1);
575///
576/// let element = items[0].extract_as_node().extract_as_element_node();
577/// assert_eq!(element.name, "div");
578/// # Ok(())
579/// # }
580/// ```
581///
582/// # Example: get children
583///
584/// ```rust
585/// # use std::error::Error;
586/// # fn main() -> Result<(), Box<dyn Error>> {
587/// use skyscraper::html;
588/// use skyscraper::xpath;
589///
590/// // Parse the HTML text into a tree
591/// let text = r#"<ul><li></li><li></li></ul>"#;
592/// let tree = html::parse(text)?;
593///
594/// // Find the ul element and its li children
595/// let xpath = xpath::parse("//ul")?;
596/// let items = xpath.apply(&tree)?;
597/// assert_eq!(items.len(), 1);
598///
599/// let ul = items[0].extract_as_node().extract_as_element_node();
600/// let children: Vec<_> = ul.children(&tree).collect();
601/// let element_children: Vec<_> = children.iter()
602///     .filter_map(|c| c.as_element_node().ok())
603///     .collect();
604/// assert_eq!(2, element_children.len());
605/// # Ok(())
606/// # }
607/// ```
#[deprecated(
    since = "0.8.0",
    note = "Use `XpathItemTree` directly via `html::parse()` and `XpathItemTree::from(&doc)` instead"
)]
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug, Hash)]
pub struct DocumentNode {
    // Arena key of the node; resolved against `HtmlDocument::arena`.
    id: NodeId,
}
616
617impl DocumentNode {
618    /// Create a new [DocumentNode] from the given arena key `id`.
619    pub fn new(id: NodeId) -> DocumentNode {
620        DocumentNode { id }
621    }
622
623    /// Get the concatenated text of this node and all of its children.
624    ///
625    /// Adds a space between elements for better readability.
626    ///
627    /// # Example: get the text of a node
628    ///
629    /// ```rust
630    /// # use std::error::Error;
631    /// # fn main() -> Result<(), Box<dyn Error>> {
632    /// use skyscraper::html;
633    /// use skyscraper::xpath;
634    ///
635    /// // Parse the text into a tree.
636    /// let text = r##"<parent>foo<child>bar</child>baz</parent>"##;
637    /// let tree = html::parse(text)?;
638    ///
639    /// // Get the text content of the parent element using XPath.
640    /// let xp = xpath::parse("//parent")?;
641    /// let items = xp.apply(&tree)?;
642    /// let element = items[0].extract_as_node().extract_as_element_node();
643    /// let text_content = element.text_content(&tree);
644    ///
645    /// assert_eq!("foobarbaz", text_content);
646    /// # Ok(())
647    /// # }
648    /// ```
649    pub fn get_all_text(&self, document: &HtmlDocument) -> Option<String> {
650        match document.get_html_node(self) {
651            Some(html_node) => html_node.get_all_text(self, document),
652            None => None,
653        }
654    }
655
656    /// Get the concatenated text of this node.
657    ///
658    /// Adds a space between elements for better readability.
659    ///
660    /// # Example: get the text of a node
661    ///
662    /// ```rust
663    /// # use std::error::Error;
664    /// # fn main() -> Result<(), Box<dyn Error>> {
665    /// use skyscraper::html;
666    /// use skyscraper::xpath;
667    ///
668    /// // Parse the text into a tree.
669    /// let html_text = r##"<parent>foo<child>bar</child>baz</parent>"##;
670    /// let tree = html::parse(html_text)?;
671    ///
672    /// // Get the direct text of the parent element using XPath.
673    /// let xp = xpath::parse("//parent")?;
674    /// let items = xp.apply(&tree)?;
675    /// let element = items[0].extract_as_node().extract_as_element_node();
676    /// let text = element.text(&tree).unwrap();
677    ///
678    /// assert_eq!("foo", text);
679    /// # Ok(())
680    /// # }
681    /// ```
682    pub fn get_text(&self, document: &HtmlDocument) -> Option<String> {
683        match document.get_html_node(self) {
684            Some(html_node) => html_node.get_text(self, document),
685            None => None,
686        }
687    }
688
689    /// Get attributes.
690    ///
691    /// If Node is a `Text` return None
692    ///
693    /// ```rust
694    /// # use std::error::Error;
695    /// # fn main() -> Result<(), Box<dyn Error>> {
696    /// use skyscraper::html;
697    /// use skyscraper::xpath;
698    ///
699    /// // Parse the text into a tree.
700    /// let html_text = r##"<div attr1="attr1_value"></div>"##;
701    /// let tree = html::parse(html_text)?;
702    ///
703    /// // Get the attribute using XPath.
704    /// let xp = xpath::parse("//div")?;
705    /// let items = xp.apply(&tree)?;
706    /// let element = items[0].extract_as_node().extract_as_element_node();
707    /// let attr = element.get_attribute(&tree, "attr1").unwrap();
708    ///
709    /// assert_eq!("attr1_value", attr);
710    /// # Ok(())
711    /// # }
712    /// ```
713    pub fn get_attributes<'a>(&'a self, document: &'a HtmlDocument) -> Option<&'a TagAttributes> {
714        match document.get_html_node(self) {
715            Some(html_node) => html_node.get_attributes(),
716            None => None,
717        }
718    }
719
720    /// Get the children of this node as an iterator.
721    pub fn children<'a>(
722        &self,
723        document: &'a HtmlDocument,
724    ) -> impl Iterator<Item = DocumentNode> + 'a {
725        Box::new(self.id.children(&document.arena).map(DocumentNode::new))
726    }
727
728    /// Get the parent of this node if it exists.
729    pub fn parent(&self, document: &HtmlDocument) -> Option<DocumentNode> {
730        self.id
731            .ancestors(&document.arena)
732            .nth(1)
733            .map(DocumentNode::new)
734    }
735}
736
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Adds a text node with the given content to `arena`.
    fn add_text(arena: &mut Arena<HtmlNode>, value: &str) -> NodeId {
        arena.new_node(HtmlNode::Text(HtmlText::new(value)))
    }

    /// Adds an attribute-less tag node with the given name to `arena`.
    fn add_tag(arena: &mut Arena<HtmlNode>, name: &str) -> NodeId {
        arena.new_node(HtmlNode::Tag(HtmlTag::new(String::from(name))))
    }

    /// Builds `<tag>hello<tag2>world</tag2></tag>` and returns it with its root key.
    fn nested_document() -> (HtmlDocument, DocumentNode) {
        let mut arena = Arena::new();
        let hello = add_text(&mut arena, "hello");
        let world = add_text(&mut arena, "world");

        let outer = add_tag(&mut arena, "tag");
        outer.append(hello, &mut arena);

        let inner = add_tag(&mut arena, "tag2");
        inner.append(world, &mut arena);
        outer.append(inner, &mut arena);

        let root = DocumentNode::new(outer);
        (HtmlDocument::new(arena, root), root)
    }

    /// Builds a `div` tag node carrying `attr_name="attr_value"`.
    fn div_with_attribute() -> HtmlNode {
        HtmlNode::Tag(HtmlTag {
            name: "div".to_string(),
            attributes: HashMap::from([("attr_name".to_string(), "attr_value".to_string())]),
        })
    }

    #[test]
    fn html_node_get_text_should_work_on_text_node() {
        // arrange
        let mut arena = Arena::new();
        let root = DocumentNode::new(add_text(&mut arena, "hello world"));
        let document = HtmlDocument::new(arena, root);

        // act
        let node = document.get_html_node(&root).unwrap();
        let result = node.get_text(&root, &document).unwrap();

        // assert
        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_one_text_child() {
        // arrange
        let mut arena = Arena::new();
        let text_id = add_text(&mut arena, "hello world");
        let tag_id = add_tag(&mut arena, "tag");
        tag_id.append(text_id, &mut arena);

        let root = DocumentNode::new(tag_id);
        let document = HtmlDocument::new(arena, root);

        // act
        let node = document.get_html_node(&root).unwrap();
        let result = node.get_text(&root, &document).unwrap();

        // assert
        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_work_on_tag_node_with_two_text_children() {
        // arrange
        let mut arena = Arena::new();
        let first_id = add_text(&mut arena, "hello");
        let second_id = add_text(&mut arena, "world");
        let tag_id = add_tag(&mut arena, "tag");
        tag_id.append(first_id, &mut arena);
        tag_id.append(second_id, &mut arena);

        let root = DocumentNode::new(tag_id);
        let document = HtmlDocument::new(arena, root);

        // act
        let node = document.get_html_node(&root).unwrap();
        let result = node.get_text(&root, &document).unwrap();

        // assert
        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_text_should_ignore_nested_text() {
        // arrange
        let (document, root) = nested_document();

        // act
        let node = document.get_html_node(&root).unwrap();
        let result = node.get_text(&root, &document).unwrap();

        // assert
        assert_eq!("hello", result);
    }

    #[test]
    fn html_node_get_all_text_should_include_nested_text() {
        // arrange
        let (document, root) = nested_document();

        // act
        let node = document.get_html_node(&root).unwrap();
        let result = node.get_all_text(&root, &document).unwrap();

        // assert
        assert_eq!("hello world", result);
    }

    #[test]
    fn html_node_get_attributes_for_tag() {
        // arrange
        let node = div_with_attribute();

        // assert
        assert!(node.get_attributes().is_some());
        assert_eq!(node.get_attributes().unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn html_node_get_attributes_for_text() {
        // arrange
        let node = HtmlNode::Text(HtmlText::new("hello world"));

        // assert
        assert!(node.get_attributes().is_none())
    }

    #[test]
    fn document_node_get_attributes_for_tag() {
        // arrange
        let mut arena = Arena::new();
        let doc_node = DocumentNode::new(arena.new_node(div_with_attribute()));
        let html_document = HtmlDocument::new(arena, doc_node);

        // act
        let node = html_document.get_html_node(&doc_node).unwrap();
        let attributes = node.get_attributes();

        // assert
        assert!(attributes.is_some());
        assert_eq!(attributes.unwrap()["attr_name"], "attr_value");
    }

    #[test]
    fn document_node_get_attributes_for_text() {
        // arrange
        let mut arena = Arena::new();
        let doc_node = DocumentNode::new(add_text(&mut arena, "hello world"));
        let html_document = HtmlDocument::new(arena, doc_node);

        // act
        let node = html_document.get_html_node(&doc_node).unwrap();
        let attributes = node.get_attributes();

        // assert
        assert!(attributes.is_none());
    }
}
skyscraper/html/mod.rs

skyscraper/html/
mod.rs