scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3use ego_tree::iter::Nodes;
4use ego_tree::{NodeId, Tree};
5use fast_html5ever::serialize::SerializeOpts;
6use fast_html5ever::tree_builder::QuirksMode;
7use fast_html5ever::QualName;
8use fast_html5ever::{driver, serialize};
9use tendril::TendrilSink;
10
11use crate::element_ref::ElementRef;
12use crate::node::Node;
13use crate::selector::Selector;
14
// Selector for the `html` element, used by `Html::get_lang` to look up the `lang` attribute.
// Declared through `lazy_static!` so the selector string is parsed once instead of on every
// call; the `unwrap` is on a fixed literal selector.
lazy_static! {
    static ref HTML_SELECTOR: Selector = Selector::parse("html").unwrap();
}
18
/// An HTML tree.
///
/// Parsing does not fail hard. Instead, the `quirks_mode` is set and the `tree` is still
/// populated as best as possible. Note that this type has no `errors` field, so parse errors
/// are not exposed on it.
///
/// Implements the `TreeSink` trait from the `fast_html5ever` crate, which allows HTML to be parsed.
#[derive(Debug, Clone)]
pub struct Html {
    /// The quirks mode.
    pub quirks_mode: QuirksMode,
    /// The node tree.
    pub tree: Tree<Node>,
    /// The html language of the document. Starts out empty; while it is empty, `get_lang`
    /// falls back to the `lang` attribute of the `html` element.
    pub lang: String,
}
34
35impl Html {
36    /// Creates an empty HTML document.
37    pub fn new_document() -> Self {
38        Html {
39            quirks_mode: QuirksMode::NoQuirks,
40            tree: Tree::new(Node::Document),
41            lang: Default::default(),
42        }
43    }
44
45    /// Creates an empty HTML fragment.
46    pub fn new_fragment() -> Self {
47        Html {
48            quirks_mode: QuirksMode::NoQuirks,
49            tree: Tree::new(Node::Fragment),
50            lang: Default::default(),
51        }
52    }
53
54    /// Parses a string of HTML as a document.
55    ///
56    /// This is a convenience method for the following:
57    ///
58    /// ```
59    /// # extern crate fast_html5ever;
60    /// # extern crate tendril;
61    /// # fn main() {
62    /// # let document = "";
63    /// use fast_html5ever::driver::{self, ParseOpts};
64    /// use scraper::Html;
65    /// use tendril::TendrilSink;
66    ///
67    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
68    /// let html = parser.one(document);
69    /// # }
70    /// ```
71    pub fn parse_document(document: &str) -> Self {
72        let parser = driver::parse_document(Self::new_document(), Default::default());
73        parser.one(document)
74    }
75
76    /// Parses a string of HTML as a fragment.
77    pub fn parse_fragment(fragment: &str) -> Self {
78        let parser = driver::parse_fragment(
79            Self::new_fragment(),
80            Default::default(),
81            QualName::new(None, ns!(html), local_name!("body")),
82            Vec::new(),
83        );
84        parser.one(fragment)
85    }
86
87    /// Returns an iterator over elements matching a selector.
88    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
89        Select {
90            inner: self.tree.nodes(),
91            selector,
92        }
93    }
94
95    /// Returns the root `<html>` element.
96    pub fn root_element(&self) -> ElementRef {
97        let root_node = self
98            .tree
99            .root()
100            .children()
101            .find(|child| child.value().is_element())
102            .expect("html node missing");
103        ElementRef::wrap(root_node).unwrap()
104    }
105
106    /// Set the html language of the document by getting the lang attr
107    pub fn set_language(&mut self, lang: String) {
108        self.lang = lang;
109    }
110
111    /// Get the language for the page.
112    pub fn get_lang(&self) -> &str {
113        if self.lang.is_empty() {
114            if let Some(element) = self.select(&HTML_SELECTOR).next() {
115                if let Some(lang) = element.value().attr("lang") {
116                    return lang;
117                }
118            }
119            &self.lang
120        } else {
121            &self.lang
122        }
123    }
124
125    /// Serialize entire document into HTML.
126    pub fn html(&self) -> String {
127        let opts = SerializeOpts {
128            scripting_enabled: false, // It's not clear what this does.
129            traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode,
130            create_missing_parent: false,
131        };
132        let mut buf = Vec::new();
133        let _ = serialize(&mut buf, self, opts);
134        auto_encoder::auto_encode_bytes(&buf)
135    }
136
137    /// Find and remove a node
138    pub fn remove_node(&mut self, node_id: NodeId) {
139        if let Some(mut node) = self.tree.get_mut(node_id) {
140            node.detach();
141        }
142    }
143}
144
/// Iterator over elements matching a selector.
///
/// Created by `Html::select`. Only nodes that are elements, have a parent, and match the
/// selector are yielded; elements without a parent are skipped (presumably these are nodes
/// detached via `Html::remove_node` — confirm against `ego_tree`'s `detach` semantics).
#[derive(Debug)]
pub struct Select<'a, 'b> {
    // Iterator over the tree's nodes, as returned by `Tree::nodes`.
    inner: Nodes<'a, Node>,
    // Selector every candidate element is tested against.
    selector: &'b Selector,
}
151
152impl<'a, 'b> Iterator for Select<'a, 'b> {
153    type Item = ElementRef<'a>;
154
155    fn next(&mut self) -> Option<ElementRef<'a>> {
156        for node in self.inner.by_ref() {
157            if let Some(element) = ElementRef::wrap(node) {
158                if element.parent().is_some() && self.selector.matches(&element) {
159                    return Some(element);
160                }
161            }
162        }
163        None
164    }
165}
166
167impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
168    fn next_back(&mut self) -> Option<Self::Item> {
169        for node in self.inner.by_ref().rev() {
170            if let Some(element) = ElementRef::wrap(node) {
171                if element.parent().is_some() && self.selector.matches(&element) {
172                    return Some(element);
173                }
174            }
175        }
176        None
177    }
178}
179
180mod serializable;
181mod tree_sink;
182
#[cfg(test)]
mod tests {
    use super::Html;
    use super::Selector;

    /// A fragment's root element gives access to the parsed anchor and its attributes.
    #[test]
    fn root_element_fragment() {
        let anchor_selector = Selector::parse("a").unwrap();
        let html = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);

        let root = html.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();

        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// A leading doctype must not be mistaken for the root element.
    #[test]
    fn root_element_document_doctype() {
        let title_selector = Selector::parse("title").unwrap();
        let html = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");

        let root = html.root_element();
        let title = root.select(&title_selector).next().unwrap();

        assert_eq!(title.inner_html(), "abc");
    }

    /// A leading comment must not be mistaken for the root element.
    #[test]
    fn root_element_document_comment() {
        let title_selector = Selector::parse("title").unwrap();
        let html = Html::parse_document("<!-- comment --><title>abc</title>");

        let root = html.root_element();
        let title = root.select(&title_selector).next().unwrap();

        assert_eq!(title.inner_html(), "abc");
    }

    /// Iterating a selection in reverse yields the matches in reverse document order.
    #[test]
    fn select_is_reversible() {
        let paragraph_selector = Selector::parse("p").unwrap();
        let html = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");

        let mut reversed = Vec::new();
        for paragraph in html.select(&paragraph_selector).rev() {
            reversed.push(paragraph.inner_html());
        }

        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }
}