1use crate::error::{Result, ScraperError};
4use itertools::Itertools;
5use libxml::{
6 tree::Document,
7 xpath::{Context, Object},
8};
9use std::collections::HashSet;
10
11#[derive(Clone)]
13pub struct XHtml {
14 doc: Document,
15}
16
17pub struct XPathResult {
19 object: Object,
20}
21
22impl XHtml {
23 pub fn new<S: Into<String>>(html_str: S) -> Result<Self> {
25 let parser = libxml::parser::Parser::default_html();
26 let doc = parser.parse_string(html_str.into())?;
27 Ok(Self { doc })
28 }
29 pub fn select(&self, xpath: &str) -> Result<XPathResult> {
31 let context = Context::new(&self.doc)
32 .map_err(|_| ScraperError::XPathError(format!("xpath parse failed:{}", xpath)))?;
33 let object = context
34 .evaluate(xpath)
35 .map_err(|_| ScraperError::XPathError(format!("xpath parse failed:{}", xpath)))?;
36 Ok(XPathResult { object })
37 }
38}
39
40pub struct Node {
42 node: libxml::tree::node::Node,
43}
44
45impl XPathResult {
46 pub fn as_nodes(&self) -> Vec<Node> {
48 self.object
49 .get_nodes_as_vec()
50 .into_iter()
51 .map(Node::new)
52 .collect::<Vec<_>>()
53 }
54
55 pub fn as_strs(&self) -> Vec<String> {
57 self.object.get_nodes_as_str()
58 }
59
60 pub fn as_node(&self) -> Option<Node> {
62 self.object
63 .get_nodes_as_vec()
64 .first()
65 .map(|n| Node::new(n.to_owned()))
66 }
67
68 pub fn as_str(&self) -> Option<String> {
70 self.object
71 .get_nodes_as_str()
72 .first()
73 .map(ToOwned::to_owned)
74 }
75}
76
77impl Node {
78 pub fn new(node: libxml::tree::node::Node) -> Self {
80 Self { node }
81 }
82
83 pub fn name(&self) -> String {
85 self.node.get_name()
86 }
87
88 pub fn id(&self) -> Option<String> {
90 self.node.get_attribute("id").map(|s| s.trim().into())
91 }
92
93 pub fn classes(&self) -> HashSet<String> {
95 self.node
96 .get_class_names()
97 .into_iter()
98 .filter(|c| !c.is_empty())
99 .collect()
100 }
101
102 pub fn attr(&self, attr: &str) -> Option<String> {
104 self.node.get_attribute(attr).map(|s| s.trim().into())
105 }
106
107 pub fn has_attr(&self, attr: &str) -> bool {
109 self.node.has_attribute(attr)
110 }
111
112 pub fn text(&self) -> String {
114 self.node.get_content().trim().into()
115 }
116
117 pub fn html(&self) -> String {
119 todo!()
120 }
121
122 pub fn inner_html(&self) -> String {
124 todo!()
125 }
126
127 pub fn children(&self) -> Vec<Node> {
129 self.node
130 .get_child_elements()
131 .into_iter()
132 .map(Node::new)
133 .collect_vec()
134 }
135
136 pub fn findnodes(&self, relative_xpath: &str) -> Result<Vec<Node>> {
138 Ok(self
139 .node
140 .findnodes(relative_xpath)
141 .map_err(|_| {
142 ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
143 })?
144 .into_iter()
145 .map(Node::new)
146 .collect_vec())
147 }
148
149 pub fn findvalues(&self, relative_xpath: &str) -> Result<Vec<String>> {
151 match self.node.findvalues(relative_xpath) {
152 Ok(vec) => Ok(vec.into_iter().map(|s| s.trim().to_string()).collect_vec()),
153 Err(_) => Err(ScraperError::XPathError(format!(
154 "relative xpath parse failed:{}",
155 relative_xpath
156 ))),
157 }
158 }
159
160 pub fn findnode(&self, relative_xpath: &str) -> Result<Option<Node>> {
162 Ok(self
163 .node
164 .findnodes(relative_xpath)
165 .map_err(|_| {
166 ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
167 })?
168 .first()
169 .map(|node| Node::new(node.to_owned())))
170 }
171
172 pub fn findvalue(&self, relative_xpath: &str) -> Result<Option<String>> {
174 Ok(self
175 .node
176 .findvalues(relative_xpath)
177 .map_err(|_| {
178 ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
179 })?
180 .first()
181 .map(|v| v.trim().into()))
182 }
183}
184
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal document: one `div#content` holding two paragraphs.
    const HTML: &str = r#"
        <html>
            <body>
                <div id="content">
                    <p>Hello, World!</p>
                    <p>This is a test.</p>
                </div>
            </body>
        </html>
    "#;

    /// Selects the content `div` and checks attribute access, child traversal
    /// and relative XPath lookups on it.
    #[test]
    fn test_select_xpath() {
        let xhtml = XHtml::new(HTML).expect("parse xhtml failed");

        // `expect` keeps the underlying error in the panic message instead of
        // discarding it through `.ok()` before asserting.
        let content = xhtml
            .select("//div[@id='content']")
            .expect("selecting div#content failed")
            .as_node()
            .expect("div#content not found");

        assert_eq!(content.attr("id").as_deref(), Some("content"));
        assert_eq!(content.id().as_deref(), Some("content"));
        assert!(content.has_attr("id"));

        let children = content.children();
        assert_eq!(children.len(), 2);
        assert_eq!(children[0].text(), "Hello, World!");
        assert_eq!(children[1].text(), "This is a test.");

        let p1 = content
            .findvalue("./p[position()=1]")
            .expect("relative xpath failed");
        assert_eq!(p1.as_deref(), Some("Hello, World!"));

        let paragraphs = content.findnodes("./p").expect("relative xpath failed");
        assert_eq!(paragraphs.len(), 2);

        let missing = content.findnode("./span").expect("relative xpath failed");
        assert!(missing.is_none());
    }
}