sws_scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3mod tree_sink;
4
5use std::borrow::Cow;
6use std::rc::Rc;
7
8use html5ever::driver;
9use html5ever::tree_builder::QuirksMode;
10use html5ever::QualName;
11use sws_tree::Tree;
12use tendril::TendrilSink;
13
14use crate::element_ref::{ElementRef, Select};
15use crate::node::Node;
16use crate::selector::Selector;
17
18/// An HTML tree.
19///
20/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
21/// `errors` field. The `tree` will still be populated as best as possible.
22///
23/// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed.
24#[derive(Debug, Clone, PartialEq)]
25pub struct Html {
26    /// Parse errors.
27    pub errors: Vec<Cow<'static, str>>,
28
29    /// The quirks mode.
30    pub quirks_mode: QuirksMode,
31
32    /// The node tree.
33    pub tree: Rc<Tree<Node>>,
34}
35
36impl Html {
37    /// Creates an empty HTML document.
38    pub fn new_document() -> Self {
39        Html {
40            errors: Vec::new(),
41            quirks_mode: QuirksMode::NoQuirks,
42            tree: Tree::new(Node::Document),
43        }
44    }
45
46    /// Creates an empty HTML fragment.
47    pub fn new_fragment() -> Self {
48        Html {
49            errors: Vec::new(),
50            quirks_mode: QuirksMode::NoQuirks,
51            tree: Tree::new(Node::Fragment),
52        }
53    }
54
55    /// Parses a string of HTML as a document.
56    ///
57    /// This is a convenience method for the following:
58    ///
59    /// ```
60    /// # extern crate html5ever;
61    /// # extern crate sws_scraper;
62    /// # extern crate tendril;
63    /// # fn main() {
64    /// # let document = "";
65    /// use html5ever::driver::{self, ParseOpts};
66    /// use sws_scraper::Html;
67    /// use tendril::TendrilSink;
68    ///
69    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
70    /// let html = parser.one(document);
71    /// # }
72    /// ```
73    pub fn parse_document(document: &str) -> Self {
74        let parser = driver::parse_document(Self::new_document(), Default::default());
75        parser.one(document)
76    }
77
78    /// Parses a string of HTML as a fragment.
79    pub fn parse_fragment(fragment: &str) -> Self {
80        let parser = driver::parse_fragment(
81            Self::new_fragment(),
82            Default::default(),
83            QualName::new(None, ns!(html), local_name!("body")),
84            Vec::new(),
85        );
86        parser.one(fragment)
87    }
88
89    /// Returns an iterator over elements matching a selector.
90    pub fn select(&self, selector: Selector) -> Select {
91        self.root_element().select(selector)
92    }
93
94    /// Returns the root `<html>` element.
95    pub fn root_element(&self) -> ElementRef {
96        let root_node = self
97            .tree
98            .root()
99            .children()
100            .find(|child| child.map_value(|v| v.is_element()).unwrap_or(false))
101            .expect("html node missing");
102        ElementRef::wrap(root_node).unwrap()
103    }
104}
105
#[cfg(test)]
mod tests {
    use super::{Html, Selector};

    /// A fragment still exposes a root element from which its content can be selected.
    #[test]
    fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let root = fragment.root_element();
        let anchor_selector = Selector::parse("a").unwrap();
        let anchor = root.select(anchor_selector).next().unwrap();

        assert_eq!(anchor.inner_html(), "1");

        let href = anchor
            .map_value(|element| element.attr("href").unwrap().to_string())
            .unwrap();
        assert_eq!(href, "http://github.com");
    }

    /// A leading doctype must not be mistaken for the root element.
    #[test]
    fn root_element_document_doctype() {
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let root = document.root_element();
        let title_selector = Selector::parse("title").unwrap();
        let title = root.select(title_selector).next().unwrap();

        assert_eq!(title.inner_html(), "abc");
    }

    /// A leading comment must not be mistaken for the root element.
    #[test]
    fn root_element_document_comment() {
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let root = document.root_element();
        let title_selector = Selector::parse("title").unwrap();
        let title = root.select(title_selector).next().unwrap();

        assert_eq!(title.inner_html(), "abc");
    }
}