accessibility_scraper/html/
mod.rs

1//! HTML documents and fragments.
2
3#[cfg(feature = "errors")]
4use std::borrow::Cow;
5
6use ego_tree::iter::Nodes;
7use ego_tree::Tree;
8use fast_html5ever::serialize::SerializeOpts;
9use fast_html5ever::tree_builder::QuirksMode;
10use fast_html5ever::QualName;
11use fast_html5ever::{driver, serialize};
12use tendril::TendrilSink;
13
14use crate::selector::Selector;
15use crate::{ElementRef, Node};
16
/// An HTML tree.
///
/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
/// `errors` field (which only exists when the `errors` feature is enabled). The `tree` will
/// still be populated as best as possible.
///
/// Implements the `TreeSink` trait from the `fast_html5ever` crate, which allows HTML to be parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Html {
    #[cfg(feature = "errors")]
    /// Parse errors. Only compiled in when the `errors` feature is enabled.
    pub errors: Vec<Cow<'static, str>>,

    /// The quirks mode. The constructors in this module start it at `QuirksMode::NoQuirks`.
    pub quirks_mode: QuirksMode,

    /// The node tree. The constructors in this module root it at either `Node::Document` or
    /// `Node::Fragment`.
    pub tree: Tree<Node>,
}
35
36impl Html {
37    /// Creates an empty HTML document.
38    pub fn new_document() -> Self {
39        Html {
40            #[cfg(feature = "errors")]
41            errors: Vec::new(),
42            quirks_mode: QuirksMode::NoQuirks,
43            tree: Tree::new(Node::Document),
44        }
45    }
46
47    /// Creates an empty HTML fragment.
48    pub fn new_fragment() -> Self {
49        Html {
50            #[cfg(feature = "errors")]
51            errors: Vec::new(),
52            quirks_mode: QuirksMode::NoQuirks,
53            tree: Tree::new(Node::Fragment),
54        }
55    }
56
57    /// Parses a string of HTML as a document.
58    ///
59    /// This is a convenience method for the following:
60    ///
61    /// ```
62    /// # extern crate fast_html5ever;
63    /// # extern crate accessibility_scraper;
64    /// # extern crate tendril;
65    ///   use accessibility_scraper::html::Html;
66    ///   use crate::tendril::TendrilSink;
67    ///   #[tokio::main]
68    /// # async fn main() {
69    /// # let document = "";
70    ///    use tokio_stream::{self as stream, StreamExt};
71    ///    let mut parser = fast_html5ever::driver::parse_document(Html::new_document(), Default::default());
72    ///    let mut stream = stream::iter(document.lines());
73    ///    while let Some(item) = stream.next().await {
74    ///        parser.process(item.into())
75    ///    }
76    ///    parser.finish();
77    /// # }
78    /// ```
79    #[cfg(all(feature = "tokio", not(feature = "spider")))]
80    pub async fn parse_document(document: &str) -> Self {
81        use tokio_stream::{self as stream, StreamExt};
82        let mut parser = driver::parse_document(Self::new_document(), Default::default());
83        let stream = stream::iter(document.lines());
84        tokio::pin!(stream);
85
86        while let Some(item) = stream.next().await {
87            parser.process(item.into())
88        }
89        parser.finish()
90    }
91
92    /// Parses a string of HTML as a document.
93    ///
94    /// This is a convenience method for the following:
95    ///
96    /// ```
97    /// # extern crate fast_html5ever;
98    /// # extern crate accessibility_scraper;
99    /// # extern crate tendril;
100    ///   use accessibility_scraper::html::Html;
101    ///   use crate::tendril::TendrilSink;
102    ///   #[tokio::main]
103    /// # async fn main() {
104    /// # let document = "";
105    ///    use tokio_stream::{self as stream, StreamExt};
106    ///    let mut parser = fast_html5ever::driver::parse_document(Html::new_document(), Default::default());
107    ///    let mut stream = stream::iter(document.lines());
108    ///    while let Some(item) = stream.next().await {
109    ///        parser.process(item.into())
110    ///    }
111    ///    parser.finish();
112    /// # }
113    /// ```
114    #[cfg(feature = "spider")]
115    pub async fn parse_document(document: &str) -> Self {
116        let parser = driver::parse_document(Self::new_document(), Default::default());
117        parser.one(document)
118    }
119
120    /// Parses a string of HTML as a document.
121    ///
122    /// This is a convenience method for the following:
123    ///
124    /// ```
125    /// # extern crate fast_html5ever;
126    /// # extern crate accessibility_scraper;
127    /// # extern crate tendril;
128    /// # fn main() {
129    /// # let document = "";
130    /// use fast_html5ever::driver::{self, ParseOpts};
131    /// use accessibility_scraper::Html;
132    /// use tendril::TendrilSink;
133    ///
134    /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
135    /// let html = parser.one(document);
136    /// # }
137    /// ```
138    #[cfg(not(feature = "tokio"))]
139    pub fn parse_document(document: &str) -> Self {
140        let parser = driver::parse_document(Self::new_document(), Default::default());
141        parser.one(document)
142    }
143
144    /// Parses a string of HTML as a fragment.
145    pub fn parse_fragment(fragment: &str) -> Self {
146        let parser = driver::parse_fragment(
147            Self::new_fragment(),
148            Default::default(),
149            QualName::new(None, ns!(html), local_name!("body")),
150            Vec::new(),
151        );
152        parser.one(fragment)
153    }
154
155    /// Returns an iterator over elements matching a selector.
156    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
157        Select {
158            inner: self.tree.nodes(),
159            selector,
160        }
161    }
162
163    /// Returns the root `<html>` element.
164    pub fn root_element(&self) -> ElementRef {
165        let root_node = self
166            .tree
167            .root()
168            .children()
169            .find(|child| child.value().is_element())
170            .expect("html node missing");
171        ElementRef::wrap(root_node).unwrap()
172    }
173
174    /// Serialize entire document into HTML.
175    pub fn html(&self) -> String {
176        let opts = SerializeOpts {
177            scripting_enabled: false, // It's not clear what this does.
178            traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode,
179            create_missing_parent: false,
180        };
181        let mut buf = Vec::new();
182        serialize(&mut buf, self, opts).unwrap();
183        String::from_utf8(buf).unwrap()
184    }
185}
186
/// Iterator over elements matching a selector.
///
/// Yields only nodes that are elements, have a parent, and match the selector. It can be
/// driven from either end.
#[derive(Debug)]
pub struct Select<'a, 'b> {
    // Tree nodes that have not been examined yet.
    inner: Nodes<'a, Node>,
    // Selector that every yielded element must match.
    selector: &'b Selector,
}
193
194impl<'a, 'b> Iterator for Select<'a, 'b> {
195    type Item = ElementRef<'a>;
196
197    fn next(&mut self) -> Option<ElementRef<'a>> {
198        for node in self.inner.by_ref() {
199            if let Some(element) = ElementRef::wrap(node) {
200                if element.parent().is_some() && self.selector.matches(&element) {
201                    return Some(element);
202                }
203            }
204        }
205        None
206    }
207
208    fn size_hint(&self) -> (usize, Option<usize>) {
209        let (_lower, upper) = self.inner.size_hint();
210
211        (0, upper)
212    }
213}
214
215impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
216    fn next_back(&mut self) -> Option<Self::Item> {
217        for node in self.inner.by_ref().rev() {
218            if let Some(element) = ElementRef::wrap(node) {
219                if element.parent().is_some() && self.selector.matches(&element) {
220                    return Some(element);
221                }
222            }
223        }
224        None
225    }
226}
227
228mod serializable;
229mod tree_sink;
230
#[cfg(test)]
mod tests {
    use super::Html;
    use super::Selector;

    // `Html::parse_fragment` is synchronous and not feature-gated, so one plain test covers
    // every feature combination; no async runtime is needed.
    #[test]
    fn root_element_fragment() {
        let html = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let root_ref = html.root_element();
        let href = root_ref
            .select(&Selector::parse("a").unwrap())
            .next()
            .unwrap();
        assert_eq!(href.inner_html(), "1");
        assert_eq!(href.value().attr("href").unwrap(), "http://github.com");
    }

    // The tests below call `Html::parse_document` synchronously, so they are only built when
    // the `tokio` feature (which makes it `async`) is disabled.

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_doctype() {
        let html = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let root_ref = html.root_element();
        let title = root_ref
            .select(&Selector::parse("title").unwrap())
            .next()
            .unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_comment() {
        let html = Html::parse_document("<!-- comment --><title>abc</title>");
        let root_ref = html.root_element();
        let title = root_ref
            .select(&Selector::parse("title").unwrap())
            .next()
            .unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_is_reversible() {
        let html = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let selector = Selector::parse("p").unwrap();
        let result: Vec<_> = html
            .select(&selector)
            .rev()
            .map(|e| e.inner_html())
            .collect();
        assert_eq!(result, vec!["element3", "element2", "element1"]);
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_has_a_size_hint() {
        let html = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let selector = Selector::parse("p").unwrap();
        let (lower, upper) = html.select(&selector).size_hint();
        assert_eq!(lower, 0);
        assert_eq!(upper, Some(10));
    }

    #[cfg(feature = "atomic")]
    #[cfg(not(feature = "tokio"))]
    #[test]
    fn html_is_send() {
        // Compile-time check only: instantiating the helper requires `Html: Send`.
        fn assert_send<S: Send>() {}
        assert_send::<Html>();
    }
}