Skip to main content

spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8
/// Selectors grouped by the key under which their extracted values are reported.
///
/// A key may appear in `css`, in `xpath`, or in both maps.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
    /// Parsed CSS selectors, keyed by the caller-supplied name.
    pub css: HashMap<K, Vec<Selector>>,
    /// XPath expressions kept as raw strings, keyed by the caller-supplied name.
    pub xpath: HashMap<K, Vec<String>>,
}
17
/// Extraction result: each selector key (as an owned `String`) mapped to the values
/// collected for it. Despite the name it holds both CSS and XPath results.
type CSSQueryMap = HashMap<String, Vec<String>>;

lazy_static! {
    /// Shared XPath factory, initialised on first use; used to test whether an
    /// expression builds as XPath.
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
25
26/// Check if a selector is a valid xpath
27fn is_valid_xpath(expression: &str) -> bool {
28    match XPATH_FACTORY.build(expression) {
29        Ok(Some(_)) => true,
30        Ok(None) => false,
31        Err(_) => false,
32    }
33}
34
35/// Async stream CSS query selector map.
36pub async fn css_query_select_map_streamed<K>(
37    html: &str,
38    selectors: &DocumentSelectors<K>,
39) -> CSSQueryMap
40where
41    K: AsRef<str> + Eq + Hash + Sized,
42{
43    let mut map: CSSQueryMap = HashMap::with_capacity(selectors.css.len() + selectors.xpath.len());
44
45    if !selectors.css.is_empty() {
46        let fragment = Box::new(Html::parse_document(html));
47
48        for selector in &selectors.css {
49            for s in selector.1 {
50                for element in fragment.select(s) {
51                    process_selector::<K>(element, selector.0, &mut map);
52                }
53            }
54        }
55    }
56
57    if !selectors.xpath.is_empty() {
58        if let Ok(package) = parser::parse(html) {
59            let document = Box::new(package.as_document());
60
61            for selector in selectors.xpath.iter() {
62                for s in selector.1 {
63                    if let Ok(value) = evaluate_xpath(&document, s) {
64                        let text = value.into_string();
65
66                        if !text.is_empty() {
67                            match map.entry(selector.0.as_ref().to_string()) {
68                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
69                                Entry::Vacant(entry) => {
70                                    entry.insert(vec![text]);
71                                }
72                            }
73                        }
74                    };
75                }
76            }
77        };
78    }
79
80    for items in map.values_mut() {
81        items.dedup();
82    }
83
84    map
85}
86
87/// Sync CSS query selector map.
88pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
89where
90    K: AsRef<str> + Eq + Hash + Sized,
91{
92    let mut map: CSSQueryMap = HashMap::with_capacity(selectors.css.len() + selectors.xpath.len());
93
94    if !selectors.css.is_empty() {
95        let fragment = Box::new(Html::parse_document(html));
96
97        for selector in selectors.css.iter() {
98            for s in selector.1 {
99                for element in fragment.select(s) {
100                    process_selector::<K>(element, selector.0, &mut map);
101                }
102            }
103        }
104    }
105
106    if !selectors.xpath.is_empty() {
107        if let Ok(package) = parser::parse(html) {
108            let document = package.as_document();
109
110            for selector in selectors.xpath.iter() {
111                for s in selector.1 {
112                    if let Ok(value) = evaluate_xpath(&document, s) {
113                        let text = value.into_string();
114
115                        if !text.is_empty() {
116                            match map.entry(selector.0.as_ref().to_string()) {
117                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
118                                Entry::Vacant(entry) => {
119                                    entry.insert(vec![text]);
120                                }
121                            }
122                        }
123                    };
124                }
125            }
126        };
127    }
128
129    map
130}
131
132/// Process a single element and update the map with the results.
133fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
134where
135    K: AsRef<str> + Eq + Hash + Sized,
136{
137    let name = name.as_ref();
138    let element_name = element.value().name();
139
140    let text = if element_name == "meta" {
141        element.attr("content").unwrap_or_default().into()
142    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
143        match element.attr(if element_name == "link" {
144            "href"
145        } else {
146            "src"
147        }) {
148            Some(href) => href.into(),
149            _ => clean_element_text(&element),
150        }
151    } else if element_name == "img" || element_name == "source" {
152        let mut img_text = String::new();
153
154        if let Some(src) = element.attr("src") {
155            if !src.is_empty() {
156                img_text.push('[');
157                img_text.push_str(src.trim());
158                img_text.push(']');
159            }
160        }
161        if let Some(alt) = element.attr("alt") {
162            if !alt.is_empty() {
163                if img_text.is_empty() {
164                    img_text.push_str(alt);
165                } else {
166                    img_text.push('(');
167                    img_text.push('"');
168                    img_text.push_str(alt);
169                    img_text.push('"');
170                    img_text.push(')');
171                }
172            }
173        }
174
175        img_text
176    } else {
177        clean_element_text(&element)
178    };
179
180    if !text.is_empty() {
181        match map.entry(name.to_string()) {
182            Entry::Occupied(mut entry) => entry.get_mut().push(text),
183            Entry::Vacant(entry) => {
184                entry.insert(vec![text]);
185            }
186        }
187    }
188}
189
190/// get the text extracted.
191pub fn clean_element_text(element: &ElementRef) -> String {
192    element.text().collect::<Vec<_>>().join(" ")
193}
194
195/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
196pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
197where
198    K: AsRef<str> + Eq + Hash + Clone + Debug,
199    V: AsRef<str> + Debug + AsRef<str>,
200    S: IntoIterator<Item = V>,
201{
202    let cap = selectors.len();
203    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::with_capacity(cap);
204    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::with_capacity(cap);
205
206    for (key, selector_set) in selectors {
207        let iter = selector_set.into_iter();
208        let (size_hint, _) = iter.size_hint();
209        let mut selectors_vec = Vec::with_capacity(size_hint);
210        let mut selectors_vec_xpath = Vec::new();
211
212        for selector_str in iter {
213            match Selector::parse(selector_str.as_ref()) {
214                Ok(selector) => selectors_vec.push(selector),
215                Err(err) => {
216                    if is_valid_xpath(selector_str.as_ref()) {
217                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
218                    } else {
219                        warn!(
220                            "Failed to parse selector '{}': {:?}",
221                            selector_str.as_ref(),
222                            err
223                        )
224                    }
225                }
226            }
227        }
228
229        let has_css_selectors = !selectors_vec.is_empty();
230        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
231
232        if has_css_selectors && !has_xpath_selectors {
233            valid_selectors.insert(key, selectors_vec);
234        } else if !has_css_selectors && has_xpath_selectors {
235            valid_selectors_xpath.insert(key, selectors_vec_xpath);
236        } else {
237            if has_css_selectors {
238                valid_selectors.insert(key.clone(), selectors_vec);
239            }
240            if has_xpath_selectors {
241                valid_selectors_xpath.insert(key, selectors_vec_xpath);
242            }
243        }
244    }
245
246    DocumentSelectors {
247        css: valid_selectors,
248        xpath: valid_selectors_xpath,
249    }
250}
251
252/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
253#[cfg(not(feature = "indexset"))]
254pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
255where
256    K: AsRef<str> + Eq + Hash + Clone + Debug,
257    V: AsRef<str> + Debug + AsRef<str>,
258{
259    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
260}
261
/// Builds [`DocumentSelectors`] from a map whose values are `indexmap::IndexSet`s of
/// selector strings (used when the `indexset` feature is enabled).
///
/// The map key becomes the key of the extracted values. Each string may be a CSS selector
/// or an XPath expression; classification and validation are delegated to
/// [`build_selectors_base`].
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, indexmap::IndexSet<V>>(selectors)
}
271
272#[cfg(not(feature = "indexset"))]
273pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
274#[cfg(feature = "indexset")]
275pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
276#[cfg(not(feature = "indexset"))]
277pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
278#[cfg(feature = "indexset")]
279pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
280
/// The streamed extractor produces a non-empty result when a CSS class selector matches.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let markup = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);

    let extracted = css_query_select_map_streamed(markup, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
294
/// The synchronous extractor produces a non-empty result when a CSS class selector matches.
#[test]
fn test_css_query_select_map() {
    let markup = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);

    let extracted = css_query_select_map(markup, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
305
/// Two selectors under one key, each matching a different list, still yield a result.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let markup = r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#;
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);

    let extracted = css_query_select_map_streamed(markup, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
323
/// XPath expressions supplied through the same builder are evaluated by the streamed extractor.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let markup = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let queries = QueryCSSMap::from([("list", expressions)]);
    let selectors = build_selectors(queries);

    let extracted = css_query_select_map_streamed(markup, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}
340
/// Unit tests for selector building and element extraction.
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document yields no extracted values.
    #[test]
    fn test_css_query_empty_html() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
        let data = css_query_select_map("", &build_selectors(map));
        assert!(data.is_empty());
    }

    /// A selector that matches nothing leaves the result empty.
    #[test]
    fn test_css_query_no_matches() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
        let data = css_query_select_map(
            r#"<html><body><p class="other">Hello</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(data.is_empty());
    }

    /// A string that is not valid CSS is rejected without panicking.
    #[test]
    fn test_build_selectors_invalid_css() {
        let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
        let selectors = build_selectors(map);
        assert!(selectors.css.is_empty());
    }

    /// A key holding one CSS selector and one XPath expression is recorded in both maps.
    #[test]
    fn test_build_selectors_mixed_css_xpath() {
        let map = QueryCSSMap::from([(
            "mixed",
            QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
        )]);
        let selectors = build_selectors(map);
        assert!(
            selectors.css.contains_key("mixed"),
            "CSS selector was not recorded"
        );
        assert!(
            selectors.xpath.contains_key("mixed"),
            "XPath selector was not recorded"
        );
    }

    /// Text containing HTML entities and quotes is still extracted.
    #[test]
    fn test_css_query_special_characters() {
        let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
        let data = css_query_select_map(
            r#"<html><body><p>Hello &amp; "world" &lt;test&gt;</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(!data.is_empty());
        let values = data.get("content").expect("key `content` should be present");
        assert!(!values.is_empty());
    }

    /// Text from nested elements is included in the joined output.
    #[test]
    fn test_clean_element_text_basic() {
        let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
        let selector = Selector::parse("p").unwrap();
        // Fail loudly instead of passing vacuously when nothing matches.
        let element = html
            .select(&selector)
            .next()
            .expect("fragment should contain a <p> element");

        let text = clean_element_text(&element);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// An image with both attributes is rendered as `[src]("alt")`.
    #[test]
    fn test_process_selector_img_element() {
        let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
        let selector = Selector::parse("img").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("fragment should contain an <img> element");
        process_selector::<&str>(element, &"image", &mut map);

        assert!(map.contains_key("image"));
        let vals = &map["image"];
        assert_eq!(vals.len(), 1);
        assert_eq!(vals[0], r#"[photo.jpg]("A photo")"#);
    }

    /// A meta element contributes the value of its `content` attribute.
    #[test]
    fn test_process_selector_meta_element() {
        let html = Html::parse_document(
            r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
        );
        let selector = Selector::parse("meta[name='description']").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("document should contain the description <meta> element");
        process_selector::<&str>(element, &"desc", &mut map);

        assert!(map.contains_key("desc"));
        assert_eq!(map["desc"][0], "Test description");
    }
}