web_parser/document/
node.rs

use crate::prelude::*;
use super::Nodes;

/// An HTML element node, wrapping a `scraper::ElementRef`.
#[derive(Debug, Clone)]
pub struct Node<'a> {
    element: scraper::ElementRef<'a>,
    // Kept on the node so that `select_all` can hand out an iterator that borrows it.
    selector: scraper::Selector,
}

impl<'a> Node<'a> {
    /// Creates a new `Node` from a `scraper::ElementRef`.
    pub(crate) fn new(element: scraper::ElementRef<'a>) -> Self {
        Self {
            element,
            // The universal selector "*" always parses, so this cannot panic.
            selector: scraper::Selector::parse("*").unwrap(),
        }
    }

    /// Selects the first descendant element matching the CSS selector, if any.
    pub fn select(&self, selector: &'static str) -> Result<Option<Node<'a>>> {
        let sel = scraper::Selector::parse(selector).map_err(Error::from)?;

        let node = self.element
            .select(&sel)
            .next()
            .map(Node::new);

        Ok(node)
    }

    /// Selects all descendant elements matching the CSS selector.
    /// Returns `None` when nothing matches.
    pub fn select_all(&mut self, selector: &'static str) -> Result<Option<Nodes>> {
        self.selector = scraper::Selector::parse(selector).map_err(Error::from)?;
        let mut nodes = self.element.select(&self.selector).peekable();

        if nodes.peek().is_some() {
            Ok(Some(Nodes::new(None, Some(nodes))))
        } else {
            Ok(None)
        }
    }

    /// Returns the parent node, if it is an element.
    pub fn parent(&self) -> Option<Node<'a>> {
        self.element.parent()
            .and_then(scraper::ElementRef::wrap)
            .map(Node::new)
    }

    /// Returns the value of the named attribute, if present.
    pub fn attr(&self, name: &str) -> Option<&str> {
        self.element.value().attr(name)
    }

    /// Returns the node's text content, including the text of all descendants.
    pub fn text(&self) -> String {
        self.element.text().collect()
    }

    /// Returns the node's HTML, including the element's own tag.
    pub fn html(&self) -> String {
        self.element.html()
    }

    /// Returns the node's text content, skipping elements whose tag name is in
    /// the black list and collapsing whitespace to single spaces.
    pub fn filter_text(&self, black_list: &[&str]) -> String {
        Self::filter_elem_text(self.element, black_list)
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
    }

    /// Recursively collects the text content of an element, skipping any
    /// element whose tag name is in the black list.
    fn filter_elem_text(node: scraper::ElementRef, black_list: &[&str]) -> String {
        let tag_name = node.value().name();

        // Skip black-listed elements and everything inside them.
        if black_list.contains(&tag_name) {
            return String::new();
        }

        // Collect direct text children and recurse into child elements.
        let mut result = String::new();

        for child in node.children() {
            match child.value() {
                scraper::node::Node::Text(text) => {
                    result.push(' ');
                    result.push_str(text);
                }
                scraper::node::Node::Element(_) => {
                    if let Some(child_element) = scraper::ElementRef::wrap(child) {
                        result.push(' ');
                        result.push_str(&Self::filter_elem_text(child_element, black_list));
                    }
                }
                _ => {}
            }
        }
        result
    }
}
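
// A minimal usage sketch of `Node`, written as in-crate unit tests because
// `Node::new` is `pub(crate)`. It assumes only the `scraper` dependency already
// used above; the HTML fragments and expected values are illustrative, not
// taken from the crate's real test suite.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn select_returns_first_match() {
        let html = scraper::Html::parse_fragment(
            r#"<ul><li id="a">one</li><li id="b">two</li></ul>"#,
        );
        let node = Node::new(html.root_element());

        // `select` yields the first matching descendant in document order.
        let li = node.select("li").ok().flatten().expect("expected an <li>");
        assert_eq!(li.attr("id"), Some("a"));
        assert_eq!(li.text(), "one");
    }

    #[test]
    fn filter_text_skips_blacklisted_tags() {
        let html = scraper::Html::parse_fragment(
            "<div><p>keep me</p><script>var dropped = 1;</script></div>",
        );
        let node = Node::new(html.root_element());

        // Text inside <script> is dropped, remaining whitespace is collapsed.
        assert_eq!(node.filter_text(&["script"]), "keep me");
    }
}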