reqwest_scraper/
xpath.rs

//! Evaluate values in an HTML response using XPath expressions.
2//!
3use crate::error::{Result, ScraperError};
4use itertools::Itertools;
5use libxml::{
6    tree::Document,
7    xpath::{Context, Object},
8};
9use std::collections::HashSet;
10
11/// Html Response
12#[derive(Clone)]
13pub struct XHtml {
14    doc: Document,
15}
16
17/// Wrap HTML document and compiled xpath
18pub struct XPathResult {
19    object: Object,
20}
21
22impl XHtml {
23    /// constructor
24    pub fn new<S: Into<String>>(html_str: S) -> Result<Self> {
25        let parser = libxml::parser::Parser::default_html();
26        let doc = parser.parse_string(html_str.into())?;
27        Ok(Self { doc })
28    }
29    /// Using xpath to extract results from html
30    pub fn select(&self, xpath: &str) -> Result<XPathResult> {
31        let context = Context::new(&self.doc)
32            .map_err(|_| ScraperError::XPathError(format!("xpath parse failed:{}", xpath)))?;
33        let object = context
34            .evaluate(xpath)
35            .map_err(|_| ScraperError::XPathError(format!("xpath parse failed:{}", xpath)))?;
36        Ok(XPathResult { object })
37    }
38}
39
40/// Html Node
41pub struct Node {
42    node: libxml::tree::node::Node,
43}
44
45impl XPathResult {
46    /// return multiple results
47    pub fn as_nodes(&self) -> Vec<Node> {
48        self.object
49            .get_nodes_as_vec()
50            .into_iter()
51            .map(Node::new)
52            .collect::<Vec<_>>()
53    }
54
55    /// return multiple results as string
56    pub fn as_strs(&self) -> Vec<String> {
57        self.object.get_nodes_as_str()
58    }
59
60    /// return first result
61    pub fn as_node(&self) -> Option<Node> {
62        self.object
63            .get_nodes_as_vec()
64            .first()
65            .map(|n| Node::new(n.to_owned()))
66    }
67
68    /// return first result as string
69    pub fn as_str(&self) -> Option<String> {
70        self.object
71            .get_nodes_as_str()
72            .first()
73            .map(ToOwned::to_owned)
74    }
75}
76
77impl Node {
78    /// constructor
79    pub fn new(node: libxml::tree::node::Node) -> Self {
80        Self { node }
81    }
82
83    /// Returns the element name.
84    pub fn name(&self) -> String {
85        self.node.get_name()
86    }
87
88    /// Returns the element ID.
89    pub fn id(&self) -> Option<String> {
90        self.node.get_attribute("id").map(|s| s.trim().into())
91    }
92
93    /// Returns the element class.
94    pub fn classes(&self) -> HashSet<String> {
95        self.node
96            .get_class_names()
97            .into_iter()
98            .filter(|c| !c.is_empty())
99            .collect()
100    }
101
102    /// Returns the value of an attribute.
103    pub fn attr(&self, attr: &str) -> Option<String> {
104        self.node.get_attribute(attr).map(|s| s.trim().into())
105    }
106
107    /// Check if the attribute exists
108    pub fn has_attr(&self, attr: &str) -> bool {
109        self.node.has_attribute(attr)
110    }
111
112    /// Returns the text of this element.
113    pub fn text(&self) -> String {
114        self.node.get_content().trim().into()
115    }
116
117    /// Returns the HTML of this element.
118    pub fn html(&self) -> String {
119        todo!()
120    }
121
122    /// Returns the inner HTML of this element.
123    pub fn inner_html(&self) -> String {
124        todo!()
125    }
126
127    /// Iterate over all child nodes which are elements
128    pub fn children(&self) -> Vec<Node> {
129        self.node
130            .get_child_elements()
131            .into_iter()
132            .map(Node::new)
133            .collect_vec()
134    }
135
136    /// Find nodes based on this node using a relative xpath
137    pub fn findnodes(&self, relative_xpath: &str) -> Result<Vec<Node>> {
138        Ok(self
139            .node
140            .findnodes(relative_xpath)
141            .map_err(|_| {
142                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
143            })?
144            .into_iter()
145            .map(Node::new)
146            .collect_vec())
147    }
148
149    /// Find values based on this node using a relative xpath
150    pub fn findvalues(&self, relative_xpath: &str) -> Result<Vec<String>> {
151        match self.node.findvalues(relative_xpath) {
152            Ok(vec) => Ok(vec.into_iter().map(|s| s.trim().to_string()).collect_vec()),
153            Err(_) => Err(ScraperError::XPathError(format!(
154                "relative xpath parse failed:{}",
155                relative_xpath
156            ))),
157        }
158    }
159
160    /// Find first node based on this node using a relative xpath
161    pub fn findnode(&self, relative_xpath: &str) -> Result<Option<Node>> {
162        Ok(self
163            .node
164            .findnodes(relative_xpath)
165            .map_err(|_| {
166                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
167            })?
168            .first()
169            .map(|node| Node::new(node.to_owned())))
170    }
171
172    /// Find first value based on this node using a relative xpath
173    pub fn findvalue(&self, relative_xpath: &str) -> Result<Option<String>> {
174        Ok(self
175            .node
176            .findvalues(relative_xpath)
177            .map_err(|_| {
178                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
179            })?
180            .first()
181            .map(|v| v.trim().into()))
182    }
183}
184
#[cfg(test)]
mod tests {
    use super::*;

    /// Selects a `div` by id, then checks its attribute, its element
    /// children, and a relative XPath lookup starting from it.
    #[test]
    fn test_select_xpath() {
        let html_str = r#"
        <html>
            <body>
                <div id="content">
                    <p>Hello, World!</p>
                    <p>This is a test.</p>
                </div>
            </body>
        </html>
        "#;

        let document = XHtml::new(html_str).expect("parse xhtml failed");

        // The absolute XPath must evaluate and match at least one node.
        let selection = document.select("//div[@id='content']").ok();
        assert!(selection.is_some());

        let first_match = selection.unwrap().as_node();
        assert!(first_match.is_some());

        let div = first_match.unwrap();
        assert_eq!(div.attr("id").as_deref(), Some("content"));

        // Both paragraphs, in document order, with their text trimmed.
        let paragraph_texts: Vec<String> = div.children().iter().map(Node::text).collect();
        assert_eq!(paragraph_texts, ["Hello, World!", "This is a test."]);

        // A relative XPath evaluated from the div yields the first paragraph.
        let first_paragraph = div.findvalue("./p[position()=1]").unwrap();
        assert_eq!(first_paragraph.as_deref(), Some("Hello, World!"));
    }
}