spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10/// The type of selectors that can be used to query.
11#[derive(Default, Debug, Clone)]
12#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
13pub struct DocumentSelectors<K> {
14    /// CSS Selectors.
15    pub css: HashMap<K, Vec<Selector>>,
16    /// XPath Selectors.
17    pub xpath: HashMap<K, Vec<String>>,
18}
19
20#[cfg(feature = "transformations")]
21pub use spider_transformations;
22
23/// Extracted content from CSS query selectors.
24type CSSQueryMap = HashMap<String, Vec<String>>;
25
26lazy_static! {
27    /// Xpath factory.
28    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
29}
30
31/// Check if a selector is a valid xpath
32fn is_valid_xpath(expression: &str) -> bool {
33    match XPATH_FACTORY.build(expression) {
34        Ok(Some(_)) => true,
35        Ok(None) => false,
36        Err(_) => false,
37    }
38}
39
40/// Async stream CSS query selector map.
41pub async fn css_query_select_map_streamed<K>(
42    html: &str,
43    selectors: &DocumentSelectors<K>,
44) -> CSSQueryMap
45where
46    K: AsRef<str> + Eq + Hash + Sized,
47{
48    let mut map: CSSQueryMap = HashMap::new();
49
50    if !selectors.css.is_empty() {
51        let mut stream = tokio_stream::iter(&selectors.css);
52        let fragment = Box::new(Html::parse_document(html));
53
54        while let Some(selector) = stream.next().await {
55            for s in selector.1 {
56                for element in fragment.select(s) {
57                    process_selector::<K>(element, selector.0, &mut map);
58                }
59            }
60        }
61    }
62
63    if !selectors.xpath.is_empty() {
64        if let Ok(package) = parser::parse(html) {
65            let document = Box::new(package.as_document());
66
67            for selector in selectors.xpath.iter() {
68                for s in selector.1 {
69                    if let Ok(value) = evaluate_xpath(&document, s) {
70                        let text = value.into_string();
71
72                        if !text.is_empty() {
73                            match map.entry(selector.0.as_ref().to_string()) {
74                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
75                                Entry::Vacant(entry) => {
76                                    entry.insert(vec![text]);
77                                }
78                            }
79                        }
80                    };
81                }
82            }
83        };
84    }
85
86    for items in map.values_mut() {
87        items.dedup();
88    }
89
90    map
91}
92
93/// Sync CSS query selector map.
94pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
95where
96    K: AsRef<str> + Eq + Hash + Sized,
97{
98    let mut map: CSSQueryMap = HashMap::new();
99
100    if !selectors.css.is_empty() {
101        let fragment = Box::new(Html::parse_document(html));
102
103        for selector in selectors.css.iter() {
104            for s in selector.1 {
105                for element in fragment.select(s) {
106                    process_selector::<K>(element, selector.0, &mut map);
107                }
108            }
109        }
110    }
111
112    if !selectors.xpath.is_empty() {
113        if let Ok(package) = parser::parse(html) {
114            let document = package.as_document();
115
116            for selector in selectors.xpath.iter() {
117                for s in selector.1 {
118                    if let Ok(value) = evaluate_xpath(&document, s) {
119                        let text = value.into_string();
120
121                        if !text.is_empty() {
122                            match map.entry(selector.0.as_ref().to_string()) {
123                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
124                                Entry::Vacant(entry) => {
125                                    entry.insert(vec![text]);
126                                }
127                            }
128                        }
129                    };
130                }
131            }
132        };
133    }
134
135    map
136}
137
138/// Process a single element and update the map with the results.
139fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
140where
141    K: AsRef<str> + Eq + Hash + Sized,
142{
143    let name = name.as_ref();
144    let element_name = element.value().name();
145
146    let text = if element_name == "meta" {
147        element.attr("content").unwrap_or_default().into()
148    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
149        match element.attr(if element_name == "link" {
150            "href"
151        } else {
152            "src"
153        }) {
154            Some(href) => href.into(),
155            _ => clean_element_text(&element),
156        }
157    } else if element_name == "img" || element_name == "source" {
158        let mut img_text = String::new();
159
160        if let Some(src) = element.attr("src") {
161            if !src.is_empty() {
162                img_text.push('[');
163                img_text.push_str(src.trim());
164                img_text.push(']');
165            }
166        }
167        if let Some(alt) = element.attr("alt") {
168            if !alt.is_empty() {
169                if img_text.is_empty() {
170                    img_text.push_str(alt);
171                } else {
172                    img_text.push('(');
173                    img_text.push('"');
174                    img_text.push_str(alt);
175                    img_text.push('"');
176                    img_text.push(')');
177                }
178            }
179        }
180
181        img_text
182    } else {
183        clean_element_text(&element)
184    };
185
186    if !text.is_empty() {
187        match map.entry(name.to_string()) {
188            Entry::Occupied(mut entry) => entry.get_mut().push(text),
189            Entry::Vacant(entry) => {
190                entry.insert(vec![text]);
191            }
192        }
193    }
194}
195
196/// get the text extracted.
197pub fn clean_element_text(element: &ElementRef) -> String {
198    element.text().collect::<Vec<_>>().join(" ")
199}
200
201/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
202pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
203where
204    K: AsRef<str> + Eq + Hash + Clone + Debug,
205    V: AsRef<str> + Debug + AsRef<str>,
206    S: IntoIterator<Item = V>,
207{
208    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
209    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
210
211    for (key, selector_set) in selectors {
212        let mut selectors_vec = Vec::new();
213        let mut selectors_vec_xpath = Vec::new();
214
215        for selector_str in selector_set {
216            match Selector::parse(selector_str.as_ref()) {
217                Ok(selector) => selectors_vec.push(selector),
218                Err(err) => {
219                    if is_valid_xpath(selector_str.as_ref()) {
220                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
221                    } else {
222                        warn!(
223                            "{}",
224                            format!(
225                                "Failed to parse selector '{}': {:?}",
226                                selector_str.as_ref(),
227                                err
228                            ),
229                        )
230                    }
231                }
232            }
233        }
234
235        let has_css_selectors = !selectors_vec.is_empty();
236        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
237
238        if has_css_selectors && !has_xpath_selectors {
239            valid_selectors.insert(key, selectors_vec);
240        } else if !has_css_selectors && has_xpath_selectors {
241            valid_selectors_xpath.insert(key, selectors_vec_xpath);
242        } else {
243            if has_css_selectors {
244                valid_selectors.insert(key.clone(), selectors_vec);
245            }
246            if has_xpath_selectors {
247                valid_selectors_xpath.insert(key, selectors_vec_xpath);
248            }
249        }
250    }
251
252    DocumentSelectors {
253        css: valid_selectors,
254        xpath: valid_selectors_xpath,
255    }
256}
257
258/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
259#[cfg(not(feature = "indexset"))]
260pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
261where
262    K: AsRef<str> + Eq + Hash + Clone + Debug,
263    V: AsRef<str> + Debug + AsRef<str>,
264{
265    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
266}
267
/// Build the CSS and XPath selectors used for extraction.
///
/// `selectors` maps a result key to an `indexmap::IndexSet` of selector
/// strings. This is a thin wrapper that forwards to `build_selectors_base`,
/// which classifies each string as a CSS selector or an XPath expression.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base(selectors)
}
277
278#[cfg(not(feature = "indexset"))]
279pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
280#[cfg(feature = "indexset")]
281pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
282#[cfg(not(feature = "indexset"))]
283pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
284#[cfg(feature = "indexset")]
285pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
286
/// The streamed extractor returns the text of the element matched by a CSS
/// class selector, keyed by the selector group's name.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches; `.sub-list` contributes nothing.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "unexpected CSS extraction result"
    );
}
300
/// The sync extractor returns the text of the element matched by a CSS class
/// selector, keyed by the selector group's name.
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches; `.sub-list` contributes nothing.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "unexpected CSS extraction result"
    );
}
311
/// Matches from several selectors registered under the same key are merged
/// into a single list for that key.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");

    // Selector order depends on the set implementation in use, so compare the
    // collected values independent of order.
    let mut values = data.get("list").cloned().unwrap_or_default();
    values.sort();
    assert_eq!(
        values,
        ["First", "Second"],
        "both selectors should contribute to the same key"
    );
}
329
/// Selector strings that are not valid CSS but are valid XPath are evaluated
/// as XPath, and their string value is reported under the group's key.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    assert!(!data.is_empty(), "Xpath extraction failed");
    // The `sub-list` expression matches nothing, so its empty result is dropped.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "unexpected Xpath extraction result"
    );
}
343
344    assert!(!data.is_empty(), "Xpath extraction failed",);
345}