fast_scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3use std::borrow::Cow;
4
5use ego_tree::iter::Nodes;
6use ego_tree::Tree;
7use html5ever::serialize::SerializeOpts;
8use html5ever::tree_builder::QuirksMode;
9use html5ever::QualName;
10use html5ever::{driver, serialize};
11use tendril::TendrilSink;
12
13use crate::selector::Selector;
14use crate::{ElementRef, Node};
15
16/// An HTML tree.
17///
18/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
19/// `errors` field. The `tree` will still be populated as best as possible.
20///
21/// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed.
22#[derive(Debug, Clone, PartialEq, Eq)]
23pub struct Html {
24    /// Parse errors.
25    pub errors: Vec<Cow<'static, str>>,
26
27    /// The quirks mode.
28    pub quirks_mode: QuirksMode,
29
30    /// The node tree.
31    pub tree: Tree<Node>,
32}
33
34impl Html {
35    /// Creates an empty HTML document.
36    pub fn new_document() -> Self {
37        Html {
38            errors: Vec::new(),
39            quirks_mode: QuirksMode::NoQuirks,
40            tree: Tree::new(Node::Document),
41        }
42    }
43
44    /// Creates an empty HTML fragment.
45    pub fn new_fragment() -> Self {
46        Html {
47            errors: Vec::new(),
48            quirks_mode: QuirksMode::NoQuirks,
49            tree: Tree::new(Node::Fragment),
50        }
51    }
52
53    /// Parses a string of HTML as a document.
54    ///
55    /// This is a convenience method for the following:
56    ///
57    /// ```
58    /// # extern crate html5ever;
59    /// # extern crate fast_scraper;
60    /// # extern crate tendril;
61    /// # fn main() {
62    /// # let document = "";
63    /// use html5ever::driver::{self, ParseOpts};
64    /// use fast_scraper::Html;
65    /// use tendril::TendrilSink;
66    ///
67    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
68    /// let html = parser.one(document);
69    /// # }
70    /// ```
71    pub fn parse_document(document: &str) -> Self {
72        let parser = driver::parse_document(Self::new_document(), Default::default());
73        parser.one(document)
74    }
75
76    /// Parses a string of HTML as a fragment.
77    pub fn parse_fragment(fragment: &str) -> Self {
78        let parser = driver::parse_fragment(
79            Self::new_fragment(),
80            Default::default(),
81            QualName::new(None, ns!(html), local_name!("body")),
82            Vec::new(),
83        );
84        parser.one(fragment)
85    }
86
87    /// Returns an iterator over elements matching a selector.
88    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
89        Select {
90            inner: self.tree.nodes(),
91            selector,
92        }
93    }
94
95    /// Returns the root `<html>` element.
96    pub fn root_element(&self) -> ElementRef {
97        let root_node = self
98            .tree
99            .root()
100            .children()
101            .find(|child| child.value().is_element())
102            .expect("html node missing");
103        ElementRef::wrap(root_node).unwrap()
104    }
105
106    /// Serialize entire document into HTML.
107    pub fn html(&self) -> String {
108        let opts = SerializeOpts {
109            scripting_enabled: false, // It's not clear what this does.
110            traversal_scope: html5ever::serialize::TraversalScope::IncludeNode,
111            create_missing_parent: false,
112        };
113        let mut buf = Vec::new();
114        serialize(&mut buf, self, opts).unwrap();
115        String::from_utf8(buf).unwrap()
116    }
117}
118
119/// Iterator over elements matching a selector.
120#[derive(Debug)]
121pub struct Select<'a, 'b> {
122    inner: Nodes<'a, Node>,
123    selector: &'b Selector,
124}
125
126impl<'a, 'b> Iterator for Select<'a, 'b> {
127    type Item = ElementRef<'a>;
128
129    fn next(&mut self) -> Option<ElementRef<'a>> {
130        for node in self.inner.by_ref() {
131            if let Some(element) = ElementRef::wrap(node) {
132                if element.parent().is_some() && self.selector.matches(&element) {
133                    return Some(element);
134                }
135            }
136        }
137        None
138    }
139}
140
141impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
142    fn next_back(&mut self) -> Option<Self::Item> {
143        for node in self.inner.by_ref().rev() {
144            if let Some(element) = ElementRef::wrap(node) {
145                if element.parent().is_some() && self.selector.matches(&element) {
146                    return Some(element);
147                }
148            }
149        }
150        None
151    }
152}
153
154mod serializable;
155mod tree_sink;
156
#[cfg(test)]
mod tests {
    use super::{Html, Selector};

    // A fragment still exposes a root element from which its content can be selected.
    #[test]
    fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let anchor_selector = Selector::parse("a").unwrap();
        let root = fragment.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    // A leading doctype must not be mistaken for the root element.
    #[test]
    fn root_element_document_doctype() {
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    // A leading comment must not be mistaken for the root element.
    #[test]
    fn root_element_document_comment() {
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    // `Select` can be driven from the back, yielding matches in reverse order.
    #[test]
    fn select_is_reversible() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraph_selector = Selector::parse("p").unwrap();
        let reversed: Vec<_> = document
            .select(&paragraph_selector)
            .rev()
            .map(|paragraph| paragraph.inner_html())
            .collect();
        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }
}