accessibility_scraper/html/
mod.rs1#[cfg(feature = "errors")]
4use std::borrow::Cow;
5
6use ego_tree::iter::Nodes;
7use ego_tree::Tree;
8use fast_html5ever::serialize::SerializeOpts;
9use fast_html5ever::tree_builder::QuirksMode;
10use fast_html5ever::QualName;
11use fast_html5ever::{driver, serialize};
12use tendril::TendrilSink;
13
14use crate::selector::Selector;
15use crate::{ElementRef, Node};
16
17#[derive(Debug, Clone, PartialEq, Eq)]
24pub struct Html {
25    #[cfg(feature = "errors")]
26    pub errors: Vec<Cow<'static, str>>,
28
29    pub quirks_mode: QuirksMode,
31
32    pub tree: Tree<Node>,
34}
35
36impl Html {
37    pub fn new_document() -> Self {
39        Html {
40            #[cfg(feature = "errors")]
41            errors: Vec::new(),
42            quirks_mode: QuirksMode::NoQuirks,
43            tree: Tree::new(Node::Document),
44        }
45    }
46
47    pub fn new_fragment() -> Self {
49        Html {
50            #[cfg(feature = "errors")]
51            errors: Vec::new(),
52            quirks_mode: QuirksMode::NoQuirks,
53            tree: Tree::new(Node::Fragment),
54        }
55    }
56
57    #[cfg(all(feature = "tokio", not(feature = "spider")))]
80    pub async fn parse_document(document: &str) -> Self {
81        use tokio_stream::{self as stream, StreamExt};
82        let mut parser = driver::parse_document(Self::new_document(), Default::default());
83        let stream = stream::iter(document.lines());
84        tokio::pin!(stream);
85
86        while let Some(item) = stream.next().await {
87            parser.process(item.into())
88        }
89        parser.finish()
90    }
91
92    #[cfg(feature = "spider")]
115    pub async fn parse_document(document: &str) -> Self {
116        let parser = driver::parse_document(Self::new_document(), Default::default());
117        parser.one(document)
118    }
119
120    #[cfg(not(feature = "tokio"))]
139    pub fn parse_document(document: &str) -> Self {
140        let parser = driver::parse_document(Self::new_document(), Default::default());
141        parser.one(document)
142    }
143
144    pub fn parse_fragment(fragment: &str) -> Self {
146        let parser = driver::parse_fragment(
147            Self::new_fragment(),
148            Default::default(),
149            QualName::new(None, ns!(html), local_name!("body")),
150            Vec::new(),
151        );
152        parser.one(fragment)
153    }
154
155    pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
157        Select {
158            inner: self.tree.nodes(),
159            selector,
160        }
161    }
162
163    pub fn root_element(&self) -> ElementRef {
165        let root_node = self
166            .tree
167            .root()
168            .children()
169            .find(|child| child.value().is_element())
170            .expect("html node missing");
171        ElementRef::wrap(root_node).unwrap()
172    }
173
174    pub fn html(&self) -> String {
176        let opts = SerializeOpts {
177            scripting_enabled: false, traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode,
179            create_missing_parent: false,
180        };
181        let mut buf = Vec::new();
182        serialize(&mut buf, self, opts).unwrap();
183        String::from_utf8(buf).unwrap()
184    }
185}
186
187#[derive(Debug)]
189pub struct Select<'a, 'b> {
190    inner: Nodes<'a, Node>,
191    selector: &'b Selector,
192}
193
194impl<'a, 'b> Iterator for Select<'a, 'b> {
195    type Item = ElementRef<'a>;
196
197    fn next(&mut self) -> Option<ElementRef<'a>> {
198        for node in self.inner.by_ref() {
199            if let Some(element) = ElementRef::wrap(node) {
200                if element.parent().is_some() && self.selector.matches(&element) {
201                    return Some(element);
202                }
203            }
204        }
205        None
206    }
207
208    fn size_hint(&self) -> (usize, Option<usize>) {
209        let (_lower, upper) = self.inner.size_hint();
210
211        (0, upper)
212    }
213}
214
215impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
216    fn next_back(&mut self) -> Option<Self::Item> {
217        for node in self.inner.by_ref().rev() {
218            if let Some(element) = ElementRef::wrap(node) {
219                if element.parent().is_some() && self.selector.matches(&element) {
220                    return Some(element);
221                }
222            }
223        }
224        None
225    }
226}
227
228mod serializable;
229mod tree_sink;
230
#[cfg(test)]
mod tests {
    use super::Html;
    use super::Selector;

    /// Shared body of both `root_element_fragment` variants: parses a single
    /// anchor as a fragment and checks its content and `href` attribute.
    fn check_anchor_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let root = fragment.root_element();
        let anchor_selector = Selector::parse("a").unwrap();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// Parses `source` as a document and checks that the first `<title>`
    /// reachable from the root element contains `abc`.
    #[cfg(not(feature = "tokio"))]
    fn check_title_is_abc(source: &str) {
        let document = Html::parse_document(source);
        let root = document.root_element();
        let title_selector = Selector::parse("title").unwrap();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_fragment() {
        check_anchor_fragment();
    }

    #[tokio::test]
    #[cfg(feature = "tokio")]
    async fn root_element_fragment() {
        check_anchor_fragment();
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_doctype() {
        check_title_is_abc("<!DOCTYPE html>\n<title>abc</title>");
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_comment() {
        check_title_is_abc("<!-- comment --><title>abc</title>");
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_is_reversible() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = Selector::parse("p").unwrap();
        let mut reversed = Vec::new();
        for paragraph in document.select(&paragraphs).rev() {
            reversed.push(paragraph.inner_html());
        }
        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }

    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_has_a_size_hint() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = Selector::parse("p").unwrap();
        let (lower, upper) = document.select(&paragraphs).size_hint();
        assert_eq!(lower, 0);
        assert_eq!(upper, Some(10));
    }

    #[cfg(feature = "atomic")]
    #[cfg(not(feature = "tokio"))]
    #[test]
    fn html_is_send() {
        // Compile-time check only: the call type-checks iff `Html: Send`.
        fn assert_send<T: Send>() {}
        assert_send::<Html>();
    }
}