accessibility_scraper/html/
mod.rs1#[cfg(feature = "errors")]
4use std::borrow::Cow;
5
6use ego_tree::iter::Nodes;
7use ego_tree::Tree;
8use fast_html5ever::serialize::SerializeOpts;
9use fast_html5ever::tree_builder::QuirksMode;
10use fast_html5ever::QualName;
11use fast_html5ever::{driver, serialize};
12use tendril::TendrilSink;
13
14use crate::selector::Selector;
15use crate::{ElementRef, Node};
16
17#[derive(Debug, Clone, PartialEq, Eq)]
24pub struct Html {
25 #[cfg(feature = "errors")]
26 pub errors: Vec<Cow<'static, str>>,
28
29 pub quirks_mode: QuirksMode,
31
32 pub tree: Tree<Node>,
34}
35
36impl Html {
37 pub fn new_document() -> Self {
39 Html {
40 #[cfg(feature = "errors")]
41 errors: Vec::new(),
42 quirks_mode: QuirksMode::NoQuirks,
43 tree: Tree::new(Node::Document),
44 }
45 }
46
47 pub fn new_fragment() -> Self {
49 Html {
50 #[cfg(feature = "errors")]
51 errors: Vec::new(),
52 quirks_mode: QuirksMode::NoQuirks,
53 tree: Tree::new(Node::Fragment),
54 }
55 }
56
57 #[cfg(all(feature = "tokio", not(feature = "spider")))]
80 pub async fn parse_document(document: &str) -> Self {
81 use tokio_stream::{self as stream, StreamExt};
82 let mut parser = driver::parse_document(Self::new_document(), Default::default());
83 let stream = stream::iter(document.lines());
84 tokio::pin!(stream);
85
86 while let Some(item) = stream.next().await {
87 parser.process(item.into())
88 }
89 parser.finish()
90 }
91
92 #[cfg(feature = "spider")]
115 pub async fn parse_document(document: &str) -> Self {
116 let parser = driver::parse_document(Self::new_document(), Default::default());
117 parser.one(document)
118 }
119
120 #[cfg(not(feature = "tokio"))]
139 pub fn parse_document(document: &str) -> Self {
140 let parser = driver::parse_document(Self::new_document(), Default::default());
141 parser.one(document)
142 }
143
144 pub fn parse_fragment(fragment: &str) -> Self {
146 let parser = driver::parse_fragment(
147 Self::new_fragment(),
148 Default::default(),
149 QualName::new(None, ns!(html), local_name!("body")),
150 Vec::new(),
151 );
152 parser.one(fragment)
153 }
154
155 pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> {
157 Select {
158 inner: self.tree.nodes(),
159 selector,
160 }
161 }
162
163 pub fn root_element(&self) -> ElementRef {
165 let root_node = self
166 .tree
167 .root()
168 .children()
169 .find(|child| child.value().is_element())
170 .expect("html node missing");
171 ElementRef::wrap(root_node).unwrap()
172 }
173
174 pub fn html(&self) -> String {
176 let opts = SerializeOpts {
177 scripting_enabled: false, traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode,
179 create_missing_parent: false,
180 };
181 let mut buf = Vec::new();
182 serialize(&mut buf, self, opts).unwrap();
183 String::from_utf8(buf).unwrap()
184 }
185}
186
187#[derive(Debug)]
189pub struct Select<'a, 'b> {
190 inner: Nodes<'a, Node>,
191 selector: &'b Selector,
192}
193
194impl<'a, 'b> Iterator for Select<'a, 'b> {
195 type Item = ElementRef<'a>;
196
197 fn next(&mut self) -> Option<ElementRef<'a>> {
198 for node in self.inner.by_ref() {
199 if let Some(element) = ElementRef::wrap(node) {
200 if element.parent().is_some() && self.selector.matches(&element) {
201 return Some(element);
202 }
203 }
204 }
205 None
206 }
207
208 fn size_hint(&self) -> (usize, Option<usize>) {
209 let (_lower, upper) = self.inner.size_hint();
210
211 (0, upper)
212 }
213}
214
215impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
216 fn next_back(&mut self) -> Option<Self::Item> {
217 for node in self.inner.by_ref().rev() {
218 if let Some(element) = ElementRef::wrap(node) {
219 if element.parent().is_some() && self.selector.matches(&element) {
220 return Some(element);
221 }
222 }
223 }
224 None
225 }
226}
227
228mod serializable;
229mod tree_sink;
230
#[cfg(test)]
mod tests {
    use super::Html;
    use super::Selector;

    /// The anchor of a parsed fragment is reachable from the root element.
    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let anchor_selector = Selector::parse("a").unwrap();
        let root = fragment.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// Same check as above, run on the tokio test runtime.
    #[tokio::test]
    #[cfg(feature = "tokio")]
    async fn root_element_fragment() {
        let fragment = Html::parse_fragment(r#"<a href="http://github.com">1</a>"#);
        let anchor_selector = Selector::parse("a").unwrap();
        let root = fragment.root_element();
        let anchor = root.select(&anchor_selector).next().unwrap();
        assert_eq!(anchor.inner_html(), "1");
        assert_eq!(anchor.value().attr("href").unwrap(), "http://github.com");
    }

    /// A leading doctype does not hide the root element.
    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_doctype() {
        let document = Html::parse_document("<!DOCTYPE html>\n<title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    /// A leading comment does not hide the root element.
    #[test]
    #[cfg(not(feature = "tokio"))]
    fn root_element_document_comment() {
        let document = Html::parse_document("<!-- comment --><title>abc</title>");
        let title_selector = Selector::parse("title").unwrap();
        let root = document.root_element();
        let title = root.select(&title_selector).next().unwrap();
        assert_eq!(title.inner_html(), "abc");
    }

    /// `Select` can be walked back to front.
    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_is_reversible() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = Selector::parse("p").unwrap();
        let mut reversed = Vec::new();
        for paragraph in document.select(&paragraphs).rev() {
            reversed.push(paragraph.inner_html());
        }
        assert_eq!(reversed, vec!["element3", "element2", "element1"]);
    }

    /// `Select` reports a zero lower bound and the node count as upper bound.
    #[test]
    #[cfg(not(feature = "tokio"))]
    fn select_has_a_size_hint() {
        let document = Html::parse_document("<p>element1</p><p>element2</p><p>element3</p>");
        let paragraphs = Selector::parse("p").unwrap();
        let hint = document.select(&paragraphs).size_hint();
        assert_eq!(hint.0, 0);
        assert_eq!(hint.1, Some(10));
    }

    /// With the `atomic` feature, `Html` must be `Send`.
    #[cfg(all(feature = "atomic", not(feature = "tokio")))]
    #[test]
    fn html_is_send() {
        fn assert_send<T: Send>() {}
        assert_send::<Html>();
    }
}