Skip to main content

scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3use ego_tree::iter::Nodes;
4use ego_tree::{NodeId, Tree};
5use html5ever::serialize::SerializeOpts;
6use html5ever::tree_builder::QuirksMode;
7use html5ever::QualName;
8use html5ever::{driver, serialize};
9use tendril::TendrilSink;
10
11use crate::element_ref::ElementRef;
12use crate::node::Node;
13use crate::selector::Selector;
14
15use self::tree_sink::HtmlBuilder;
16
17lazy_static! {
18    static ref HTML_SELECTOR: Selector = Selector::parse("html").unwrap();
19}
20
21/// An HTML tree.
22///
23/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
24/// `errors` field. The `tree` will still be populated as best as possible.
25///
26/// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed.
27#[derive(Debug, Clone)]
28pub struct Html {
29    /// The quirks mode.
30    pub quirks_mode: QuirksMode,
31    /// The node tree.
32    pub tree: Tree<Node>,
33    /// The html language of the document.
34    pub lang: String,
35}
36
37impl Html {
38    /// Creates an empty HTML document.
39    pub fn new_document() -> Self {
40        Html {
41            quirks_mode: QuirksMode::NoQuirks,
42            tree: Tree::new(Node::Document),
43            lang: Default::default(),
44        }
45    }
46
47    /// Creates an empty HTML fragment.
48    pub fn new_fragment() -> Self {
49        Html {
50            quirks_mode: QuirksMode::NoQuirks,
51            tree: Tree::new(Node::Fragment),
52            lang: Default::default(),
53        }
54    }
55
56    /// Parses a string of HTML as a document.
57    ///
58    /// This is a convenience method for the following:
59    ///
60    /// ```
61    /// # extern crate html5ever;
62    /// # extern crate tendril;
63    /// # fn main() {
64    /// # let document = "";
65    /// use html5ever::driver::{self, ParseOpts};
66    /// use scraper::Html;
67    /// use tendril::TendrilSink;
68    ///
69    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
70    /// let html = parser.one(document);
71    /// # }
72    /// ```
73    pub fn parse_document(document: &str) -> Self {
74        let parser = driver::parse_document(HtmlBuilder::new_document(), Default::default());
75        parser.one(document)
76    }
77
78    /// Parses a string of HTML as a fragment.
79    pub fn parse_fragment(fragment: &str) -> Self {
80        let parser = driver::parse_fragment(
81            HtmlBuilder::new_fragment(),
82            Default::default(),
83            QualName::new(None, ns!(html), local_name!("body")),
84            Vec::new(),
85            false,
86        );
87        parser.one(fragment)
88    }
89
90    /// Returns an iterator over elements matching a selector.
91    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
92        Select {
93            inner: self.tree.nodes(),
94            selector,
95        }
96    }
97
98    /// Returns the root `<html>` element.
99    pub fn root_element(&self) -> ElementRef {
100        let root_node = self
101            .tree
102            .root()
103            .children()
104            .find(|child| child.value().is_element())
105            .expect("html node missing");
106        ElementRef::wrap(root_node).unwrap()
107    }
108
109    /// Set the html language of the document by getting the lang attr
110    pub fn set_language(&mut self, lang: String) {
111        self.lang = lang;
112    }
113
114    /// Get the language for the page.
115    pub fn get_lang(&self) -> &str {
116        if self.lang.is_empty() {
117            if let Some(element) = self.select(&HTML_SELECTOR).next() {
118                if let Some(lang) = element.value().attr("lang") {
119                    return lang;
120                }
121            }
122            &self.lang
123        } else {
124            &self.lang
125        }
126    }
127
128    /// Serialize entire document into HTML.
129    pub fn html(&self) -> String {
130        let opts = SerializeOpts {
131            scripting_enabled: false, // It's not clear what this does.
132            traversal_scope: html5ever::serialize::TraversalScope::IncludeNode,
133            create_missing_parent: false,
134        };
135        let mut buf = Vec::new();
136        let _ = serialize(&mut buf, self, opts);
137        auto_encoder::auto_encode_bytes(&buf)
138    }
139
140    /// Find and remove a node
141    pub fn remove_node(&mut self, node_id: NodeId) {
142        if let Some(mut node) = self.tree.get_mut(node_id) {
143            node.detach();
144        }
145    }
146}
147
148/// Iterator over elements matching a selector.
149#[derive(Debug)]
150pub struct Select<'a, 'b> {
151    inner: Nodes<'a, Node>,
152    selector: &'b Selector,
153}
154
155impl<'a, 'b> Iterator for Select<'a, 'b> {
156    type Item = ElementRef<'a>;
157
158    fn next(&mut self) -> Option<ElementRef<'a>> {
159        for node in self.inner.by_ref() {
160            if let Some(element) = ElementRef::wrap(node) {
161                if element.parent().is_some() && self.selector.matches(&element) {
162                    return Some(element);
163                }
164            }
165        }
166        None
167    }
168}
169
170impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
171    fn next_back(&mut self) -> Option<Self::Item> {
172        for node in self.inner.by_ref().rev() {
173            if let Some(element) = ElementRef::wrap(node) {
174                if element.parent().is_some() && self.selector.matches(&element) {
175                    return Some(element);
176                }
177            }
178        }
179        None
180    }
181}
182
183mod serializable;
184mod tree_sink;
185
#[cfg(test)]
mod tests {
    use super::{Html, Selector};

    /// Compile-time check that a parsed `Html` is `Send`.
    ///
    /// This is the purpose of the spider-html5ever / spider-tendril fork swap: `Html` (and
    /// any future holding it) can be moved across threads on a multi-threaded async runtime.
    ///
    /// `Sync` is deliberately NOT asserted: `Tendril` contains a `Cell<NonZeroUsize>` pointer
    /// field that is intentionally `!Sync`. Spider_scraper owns its tree directly (no `Arc`),
    /// so `Send` is the only bound needed for cross-thread movement.
    #[test]
    fn parsed_html_is_send() {
        fn assert_send<T: Send>(_: &T) {}
        let parsed = Html::parse_document("<p>hi</p>");
        assert_send(&parsed);
    }

    /// The root element of a parsed fragment gives access to the fragment's elements.
    #[test]
    fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let anchor_selector = Selector::parse("a").unwrap();
        let root = fragment.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// A leading doctype node does not prevent the root element from being found.
    #[test]
    fn root_element_document_doctype() {
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    /// A leading comment node does not prevent the root element from being found.
    #[test]
    fn root_element_document_comment() {
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    /// Iterating a selection from the back yields the matches in reverse document order.
    #[test]
    fn select_is_reversible() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraph = Selector::parse("p").unwrap();
        let reversed: Vec<_> = document
            .select(&paragraph)
            .rev()
            .map(|element| element.inner_html())
            .collect();
        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }
}