Skip to main content

scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3#[cfg(feature = "errors")]
4use std::borrow::Cow;
5use std::fmt;
6use std::iter::FusedIterator;
7
8use ego_tree::Tree;
9use ego_tree::iter::Nodes;
10use html5ever::serialize::SerializeOpts;
11use html5ever::tree_builder::QuirksMode;
12use html5ever::{QualName, driver, serialize};
13use selectors::matching::SelectorCaches;
14use tendril::TendrilSink;
15
16use crate::selector::Selector;
17use crate::{ElementRef, Node};
18
19pub use tree_sink::HtmlTreeSink;
20
21/// An HTML tree.
22///
23/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
24/// `errors` field. The `tree` will still be populated as best as possible.
25///
26/// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed.
27#[derive(Debug, Clone, PartialEq, Eq)]
28pub struct Html {
29    #[cfg(feature = "errors")]
30    /// Parse errors.
31    pub errors: Vec<Cow<'static, str>>,
32
33    /// The quirks mode.
34    pub quirks_mode: QuirksMode,
35
36    /// The node tree.
37    pub tree: Tree<Node>,
38}
39
40impl Html {
41    /// Creates an empty HTML document.
42    pub fn new_document() -> Self {
43        Html {
44            #[cfg(feature = "errors")]
45            errors: Vec::new(),
46            quirks_mode: QuirksMode::NoQuirks,
47            tree: Tree::new(Node::Document),
48        }
49    }
50
51    /// Creates an empty HTML fragment.
52    pub fn new_fragment() -> Self {
53        Html {
54            #[cfg(feature = "errors")]
55            errors: Vec::new(),
56            quirks_mode: QuirksMode::NoQuirks,
57            tree: Tree::new(Node::Fragment),
58        }
59    }
60
61    /// Parses a string of HTML as a document.
62    ///
63    /// This is a convenience method for the following:
64    ///
65    /// ```
66    /// # extern crate html5ever;
67    /// # extern crate scraper;
68    /// # extern crate tendril;
69    /// # fn main() {
70    /// # let document = "";
71    /// use html5ever::driver::{self, ParseOpts};
72    /// use scraper::{Html, HtmlTreeSink};
73    /// use tendril::TendrilSink;
74    ///
75    /// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default());
76    /// let html = parser.one(document);
77    /// # }
78    /// ```
79    pub fn parse_document(document: &str) -> Self {
80        let parser =
81            driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default());
82        parser.one(document)
83    }
84
85    /// Parses a string of HTML as a fragment.
86    pub fn parse_fragment(fragment: &str) -> Self {
87        let parser = driver::parse_fragment(
88            HtmlTreeSink::new(Self::new_fragment()),
89            Default::default(),
90            QualName::new(None, ns!(html), local_name!("body")),
91            Vec::new(),
92            false,
93        );
94        parser.one(fragment)
95    }
96
97    /// Returns an iterator over elements matching a selector.
98    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
99        Select {
100            inner: self.tree.nodes(),
101            selector,
102            caches: Default::default(),
103        }
104    }
105
106    /// Returns the root `<html>` element.
107    pub fn root_element(&self) -> ElementRef<'_> {
108        let root_node = self
109            .tree
110            .root()
111            .children()
112            .find(|child| child.value().is_element())
113            .expect("html node missing");
114        ElementRef::wrap(root_node).unwrap()
115    }
116
117    /// Serialize entire document into HTML.
118    pub fn html(&self) -> String {
119        let opts = SerializeOpts {
120            scripting_enabled: false, // It's not clear what this does.
121            traversal_scope: serialize::TraversalScope::IncludeNode,
122            create_missing_parent: false,
123        };
124        let mut buf = Vec::new();
125        serialize(&mut buf, self, opts).unwrap();
126        String::from_utf8(buf).unwrap()
127    }
128}
129
130/// Iterator over elements matching a selector.
131pub struct Select<'a, 'b> {
132    inner: Nodes<'a, Node>,
133    selector: &'b Selector,
134    caches: SelectorCaches,
135}
136
137impl fmt::Debug for Select<'_, '_> {
138    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
139        fmt.debug_struct("Select")
140            .field("inner", &self.inner)
141            .field("selector", &self.selector)
142            .field("caches", &"..")
143            .finish()
144    }
145}
146
147impl Clone for Select<'_, '_> {
148    fn clone(&self) -> Self {
149        Self {
150            inner: self.inner.clone(),
151            selector: self.selector,
152            caches: Default::default(),
153        }
154    }
155}
156
157impl<'a> Iterator for Select<'a, '_> {
158    type Item = ElementRef<'a>;
159
160    fn next(&mut self) -> Option<ElementRef<'a>> {
161        for node in self.inner.by_ref() {
162            if let Some(element) = ElementRef::wrap(node)
163                && element.parent().is_some()
164                && self
165                    .selector
166                    .matches_with_scope_and_cache(&element, None, &mut self.caches)
167            {
168                return Some(element);
169            }
170        }
171        None
172    }
173
174    fn size_hint(&self) -> (usize, Option<usize>) {
175        let (_lower, upper) = self.inner.size_hint();
176
177        (0, upper)
178    }
179}
180
181impl DoubleEndedIterator for Select<'_, '_> {
182    fn next_back(&mut self) -> Option<Self::Item> {
183        for node in self.inner.by_ref().rev() {
184            if let Some(element) = ElementRef::wrap(node)
185                && element.parent().is_some()
186                && self
187                    .selector
188                    .matches_with_scope_and_cache(&element, None, &mut self.caches)
189            {
190                return Some(element);
191            }
192        }
193        None
194    }
195}
196
197impl FusedIterator for Select<'_, '_> {}
198
199mod serializable;
200mod tree_sink;
201
#[cfg(test)]
mod tests {
    use super::{Html, Selector};

    /// Parses a hard-coded CSS selector, panicking if it is invalid.
    fn selector(css: &str) -> Selector {
        Selector::parse(css).unwrap()
    }

    // A fragment's root element gives access to the parsed anchor.
    #[test]
    fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let anchors = selector("a");
        let anchor = fragment.root_element().select(&anchors).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    // A leading doctype must not be mistaken for the root element.
    #[test]
    fn root_element_document_doctype() {
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let titles = selector("title");
        let title = document.root_element().select(&titles).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    // A leading comment must not be mistaken for the root element.
    #[test]
    fn root_element_document_comment() {
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let titles = selector("title");
        let title = document.root_element().select(&titles).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    // Reversing the iterator yields the matches in reverse document order.
    #[test]
    fn select_is_reversible() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = selector("p");
        let mut reversed = Vec::new();
        for paragraph in document.select(&paragraphs).rev() {
            reversed.push(paragraph.inner_html());
        }
        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }

    // The hint has a zero lower bound and an upper bound taken from the node iterator.
    #[test]
    fn select_has_a_size_hint() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = selector("p");
        let hint = document.select(&paragraphs).size_hint();
        assert_eq!(hint.0, 0);
        assert_eq!(hint.1, Some(10));
    }

    // Compile-time check that `Html` is `Send` when the `atomic` feature is on.
    #[cfg(feature = "atomic")]
    #[test]
    fn html_is_send() {
        fn assert_send<S: Send>() {}
        assert_send::<Html>();
    }
}