1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
//! Evaluate values in an HTML response using XPath expressions.
//!
use std::collections::HashSet;

use itertools::Itertools;
use libxml::{
    tree::Document,
    xpath::{Context, Object},
};

use crate::error::{Result, ScraperError};

/// An HTML response held as a libxml [`Document`].
///
/// XPath expressions are evaluated against the wrapped document through
/// [`XHtml::select`].
pub struct XHtml {
    /// Underlying libxml document that XPath expressions are evaluated against.
    pub(crate) doc: Document,
}

/// Result of evaluating an XPath expression against an [`XHtml`] document.
///
/// The matches can be read back either as [`Node`]s or as strings.
pub struct XPathResult {
    /// libxml XPath result object that the accessor methods read from.
    object: Object,
}

impl XHtml {
    /// Evaluates `xpath` against the wrapped document.
    ///
    /// # Errors
    ///
    /// Returns [`ScraperError::XPathError`] when the XPath context cannot be
    /// created for the document or when evaluating the expression fails.
    pub fn select(&self, xpath: &str) -> Result<XPathResult> {
        // Both failure paths report the same error, so build it in one place.
        let failed = || ScraperError::XPathError(format!("xpath parse failed:{}", xpath));

        let context = Context::new(&self.doc).map_err(|_| failed())?;
        context
            .evaluate(xpath)
            .map(|object| XPathResult { object })
            .map_err(|_| failed())
    }
}

/// A single HTML node, wrapping a libxml tree node.
pub struct Node {
    /// Underlying libxml node that every accessor delegates to.
    node: libxml::tree::node::Node,
}

impl XPathResult {
    /// Returns every matched node, each wrapped in a [`Node`].
    ///
    /// The returned vector is empty when nothing matched.
    pub fn as_nodes(&self) -> Vec<Node> {
        self.object
            .get_nodes_as_vec()
            .into_iter()
            .map(Node::new)
            .collect()
    }

    /// Returns every match as a string, as produced by libxml's
    /// `get_nodes_as_str`.
    pub fn as_strs(&self) -> Vec<String> {
        self.object.get_nodes_as_str()
    }

    /// Returns the first matched node, or `None` when nothing matched.
    pub fn as_node(&self) -> Option<Node> {
        // The vector is already owned, so move the first element out instead
        // of borrowing it and cloning.
        self.object
            .get_nodes_as_vec()
            .into_iter()
            .next()
            .map(Node::new)
    }

    /// Returns the first match as a string, or `None` when nothing matched.
    pub fn as_str(&self) -> Option<String> {
        // Move the first string out of the owned vector rather than cloning it.
        self.object.get_nodes_as_str().into_iter().next()
    }
}

impl Node {
    /// constructor
    pub fn new(node: libxml::tree::node::Node) -> Self {
        Self { node }
    }

    /// Returns the element name.
    pub fn name(&self) -> String {
        self.node.get_name()
    }

    /// Returns the element ID.
    pub fn id(&self) -> Option<String> {
        self.node.get_attribute("id").map(|s| s.trim().into())
    }

    /// Returns the element class.
    pub fn classes(&self) -> HashSet<String> {
        self.node
            .get_class_names()
            .into_iter()
            .filter(|c| !c.is_empty())
            .collect()
    }

    /// Returns the value of an attribute.
    pub fn attr(&self, attr: &str) -> Option<String> {
        self.node.get_attribute(attr).map(|s| s.trim().into())
    }

    /// Check if the attribute exists
    pub fn has_attr(&self, attr: &str) -> bool {
        self.node.has_attribute(attr)
    }

    /// Returns the text of this element.
    pub fn text(&self) -> String {
        self.node.get_content().trim().into()
    }

    /// Returns the HTML of this element.
    pub fn html(&self) -> String {
        todo!()
    }

    /// Returns the inner HTML of this element.
    pub fn inner_html(&self) -> String {
        todo!()
    }

    /// Iterate over all child nodes which are elements
    pub fn children(&self) -> Vec<Node> {
        self.node
            .get_child_elements()
            .into_iter()
            .map(Node::new)
            .collect_vec()
    }

    /// Find nodes based on this node using a relative xpath
    pub fn findnodes(&self, relative_xpath: &str) -> Result<Vec<Node>> {
        Ok(self
            .node
            .findnodes(relative_xpath)
            .map_err(|_| {
                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
            })?
            .into_iter()
            .map(Node::new)
            .collect_vec())
    }

    /// Find values based on this node using a relative xpath
    pub fn findvalues(&self, relative_xpath: &str) -> Result<Vec<String>> {
        match self.node.findvalues(relative_xpath) {
            Ok(vec) => Ok(vec.into_iter().map(|s| s.trim().to_string()).collect_vec()),
            Err(_) => Err(ScraperError::XPathError(format!(
                "relative xpath parse failed:{}",
                relative_xpath
            ))),
        }
    }

    /// Find first node based on this node using a relative xpath
    pub fn findnode(&self, relative_xpath: &str) -> Result<Option<Node>> {
        Ok(self
            .node
            .findnodes(relative_xpath)
            .map_err(|_| {
                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
            })?
            .first()
            .map(|node| Node::new(node.to_owned())))
    }

    /// Find first value based on this node using a relative xpath
    pub fn findvalue(&self, relative_xpath: &str) -> Result<Option<String>> {
        Ok(self
            .node
            .findvalues(relative_xpath)
            .map_err(|_| {
                ScraperError::XPathError(format!("relative xpath parse failed:{}", relative_xpath))
            })?
            .first()
            .map(|v| v.trim().into()))
    }
}