dom_manipulator/html/
mod.rs

1//! HTML documents and fragments.
2
3use std::borrow::Cow;
4
5use ego_tree::iter::Nodes;
6use ego_tree::Tree;
7use html5ever::serialize::SerializeOpts;
8use html5ever::tree_builder::QuirksMode;
9use html5ever::QualName;
10use html5ever::{driver, serialize};
11use tendril::TendrilSink;
12
13use crate::selector::Selector;
14use crate::{ElementRef, Node};
15
16/// An HTML tree.
17///
18/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
19/// `errors` field. The `tree` will still be populated as best as possible.
20///
21/// Implements the `TreeSink` trait from the `html5ever` crate, which allows HTML to be parsed.
22#[derive(Debug, Clone, PartialEq, Eq)]
23pub struct Html {
24    /// Parse errors.
25    pub errors: Vec<Cow<'static, str>>,
26
27    /// The quirks mode.
28    pub quirks_mode: QuirksMode,
29
30    /// The node tree.
31    pub tree: Tree<Node>,
32}
33
34impl Html {
35    /// Creates an empty HTML document.
36    pub fn new_document() -> Self {
37        Html {
38            errors: Vec::new(),
39            quirks_mode: QuirksMode::NoQuirks,
40            tree: Tree::new(Node::Document),
41        }
42    }
43
44    /// Creates an empty HTML fragment.
45    pub fn new_fragment() -> Self {
46        Html {
47            errors: Vec::new(),
48            quirks_mode: QuirksMode::NoQuirks,
49            tree: Tree::new(Node::Fragment),
50        }
51    }
52
53    /// Parses a string of HTML as a document.
54    ///
55    /// This is a convenience method for the following:
56    ///
57    /// ```
58    /// # extern crate html5ever;
59    /// # extern crate dom_manipulator;
60    /// # extern crate tendril;
61    /// # fn main() {
62    /// # let document = "";
63    /// use html5ever::driver::{self, ParseOpts};
64    /// use scraper::Html;
65    /// use tendril::TendrilSink;
66    ///
67    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
68    /// let html = parser.one(document);
69    /// # }
70    /// ```
71    pub fn parse_document(document: &str) -> Self {
72        let parser = driver::parse_document(Self::new_document(), Default::default());
73        parser.one(document)
74    }
75
76    /// Parses a string of HTML as a fragment.
77    pub fn parse_fragment(fragment: &str) -> Self {
78        let parser = driver::parse_fragment(
79            Self::new_fragment(),
80            Default::default(),
81            QualName::new(None, ns!(html), local_name!("body")),
82            Vec::new(),
83        );
84        parser.one(fragment)
85    }
86
87    /// Returns an iterator over elements matching a selector.
88    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
89        Select {
90            inner: self.tree.nodes(),
91            selector,
92        }
93    }
94
95    /// Returns the root `<html>` element.
96    pub fn root_element(&self) -> ElementRef {
97        let root_node = self
98            .tree
99            .root()
100            .children()
101            .find(|child| child.value().is_element())
102            .expect("html node missing");
103        ElementRef::wrap(root_node).unwrap()
104    }
105
106    /// Serialize entire document into HTML.
107    pub fn html(&self) -> String {
108        let opts = SerializeOpts {
109            scripting_enabled: false, // It's not clear what this does.
110            traversal_scope: html5ever::serialize::TraversalScope::IncludeNode,
111            create_missing_parent: false,
112        };
113        let mut buf = Vec::new();
114        serialize(&mut buf, self, opts).unwrap();
115        String::from_utf8(buf).unwrap()
116    }
117}
118
119/// Iterator over elements matching a selector.
120#[derive(Debug)]
121pub struct Select<'a, 'b> {
122    inner: Nodes<'a, Node>,
123    selector: &'b Selector,
124}
125
126impl<'a, 'b> Iterator for Select<'a, 'b> {
127    type Item = ElementRef<'a>;
128
129    fn next(&mut self) -> Option<ElementRef<'a>> {
130        for node in self.inner.by_ref() {
131            if let Some(element) = ElementRef::wrap(node) {
132                if element.parent().is_some() && self.selector.matches(&element) {
133                    return Some(element);
134                }
135            }
136        }
137        None
138    }
139}
140
141mod serializable;
142mod tree_sink;
143
#[cfg(test)]
mod tests {
    use super::Html;
    use super::Selector;

    /// A parsed fragment exposes a root element from which its content can be selected.
    #[test]
    fn root_element_fragment() {
        let anchor_selector = Selector::parse("a").unwrap();
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let root = fragment.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// A doctype preceding the markup does not prevent the root element from being found.
    #[test]
    fn root_element_document_doctype() {
        let title_selector = Selector::parse("title").unwrap();
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    /// A comment preceding the markup does not prevent the root element from being found.
    #[test]
    fn root_element_document_comment() {
        let title_selector = Selector::parse("title").unwrap();
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }
}