Skip to main content

spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8
9/// The type of selectors that can be used to query.
10#[derive(Default, Debug, Clone)]
11pub struct DocumentSelectors<K> {
12    /// CSS Selectors.
13    pub css: HashMap<K, Vec<Selector>>,
14    /// XPath Selectors.
15    pub xpath: HashMap<K, Vec<String>>,
16}
17
/// Extracted content keyed by selector-group name.
///
/// Each value lists the non-empty strings extracted for that key. Despite the name, the
/// query functions in this file store results from both CSS and XPath selectors here.
type CSSQueryMap = HashMap<String, Vec<String>>;
20
lazy_static! {
    /// Shared XPath factory, initialised lazily and reused by `is_valid_xpath` to
    /// compile candidate expressions.
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
25
26/// Check if a selector is a valid xpath
27fn is_valid_xpath(expression: &str) -> bool {
28    match XPATH_FACTORY.build(expression) {
29        Ok(Some(_)) => true,
30        Ok(None) => false,
31        Err(_) => false,
32    }
33}
34
35/// Async stream CSS query selector map.
36pub async fn css_query_select_map_streamed<K>(
37    html: &str,
38    selectors: &DocumentSelectors<K>,
39) -> CSSQueryMap
40where
41    K: AsRef<str> + Eq + Hash + Sized,
42{
43    let mut map: CSSQueryMap = HashMap::new();
44
45    if !selectors.css.is_empty() {
46        let fragment = Box::new(Html::parse_document(html));
47
48        for selector in &selectors.css {
49            for s in selector.1 {
50                for element in fragment.select(s) {
51                    process_selector::<K>(element, selector.0, &mut map);
52                }
53            }
54        }
55    }
56
57    if !selectors.xpath.is_empty() {
58        if let Ok(package) = parser::parse(html) {
59            let document = Box::new(package.as_document());
60
61            for selector in selectors.xpath.iter() {
62                for s in selector.1 {
63                    if let Ok(value) = evaluate_xpath(&document, s) {
64                        let text = value.into_string();
65
66                        if !text.is_empty() {
67                            match map.entry(selector.0.as_ref().to_string()) {
68                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
69                                Entry::Vacant(entry) => {
70                                    entry.insert(vec![text]);
71                                }
72                            }
73                        }
74                    };
75                }
76            }
77        };
78    }
79
80    for items in map.values_mut() {
81        items.dedup();
82    }
83
84    map
85}
86
87/// Sync CSS query selector map.
88pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
89where
90    K: AsRef<str> + Eq + Hash + Sized,
91{
92    let mut map: CSSQueryMap = HashMap::new();
93
94    if !selectors.css.is_empty() {
95        let fragment = Box::new(Html::parse_document(html));
96
97        for selector in selectors.css.iter() {
98            for s in selector.1 {
99                for element in fragment.select(s) {
100                    process_selector::<K>(element, selector.0, &mut map);
101                }
102            }
103        }
104    }
105
106    if !selectors.xpath.is_empty() {
107        if let Ok(package) = parser::parse(html) {
108            let document = package.as_document();
109
110            for selector in selectors.xpath.iter() {
111                for s in selector.1 {
112                    if let Ok(value) = evaluate_xpath(&document, s) {
113                        let text = value.into_string();
114
115                        if !text.is_empty() {
116                            match map.entry(selector.0.as_ref().to_string()) {
117                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
118                                Entry::Vacant(entry) => {
119                                    entry.insert(vec![text]);
120                                }
121                            }
122                        }
123                    };
124                }
125            }
126        };
127    }
128
129    map
130}
131
132/// Process a single element and update the map with the results.
133fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
134where
135    K: AsRef<str> + Eq + Hash + Sized,
136{
137    let name = name.as_ref();
138    let element_name = element.value().name();
139
140    let text = if element_name == "meta" {
141        element.attr("content").unwrap_or_default().into()
142    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
143        match element.attr(if element_name == "link" {
144            "href"
145        } else {
146            "src"
147        }) {
148            Some(href) => href.into(),
149            _ => clean_element_text(&element),
150        }
151    } else if element_name == "img" || element_name == "source" {
152        let mut img_text = String::new();
153
154        if let Some(src) = element.attr("src") {
155            if !src.is_empty() {
156                img_text.push('[');
157                img_text.push_str(src.trim());
158                img_text.push(']');
159            }
160        }
161        if let Some(alt) = element.attr("alt") {
162            if !alt.is_empty() {
163                if img_text.is_empty() {
164                    img_text.push_str(alt);
165                } else {
166                    img_text.push('(');
167                    img_text.push('"');
168                    img_text.push_str(alt);
169                    img_text.push('"');
170                    img_text.push(')');
171                }
172            }
173        }
174
175        img_text
176    } else {
177        clean_element_text(&element)
178    };
179
180    if !text.is_empty() {
181        match map.entry(name.to_string()) {
182            Entry::Occupied(mut entry) => entry.get_mut().push(text),
183            Entry::Vacant(entry) => {
184                entry.insert(vec![text]);
185            }
186        }
187    }
188}
189
190/// get the text extracted.
191pub fn clean_element_text(element: &ElementRef) -> String {
192    element.text().collect::<Vec<_>>().join(" ")
193}
194
195/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
196pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
197where
198    K: AsRef<str> + Eq + Hash + Clone + Debug,
199    V: AsRef<str> + Debug + AsRef<str>,
200    S: IntoIterator<Item = V>,
201{
202    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
203    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
204
205    for (key, selector_set) in selectors {
206        let mut selectors_vec = Vec::new();
207        let mut selectors_vec_xpath = Vec::new();
208
209        for selector_str in selector_set {
210            match Selector::parse(selector_str.as_ref()) {
211                Ok(selector) => selectors_vec.push(selector),
212                Err(err) => {
213                    if is_valid_xpath(selector_str.as_ref()) {
214                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
215                    } else {
216                        warn!(
217                            "Failed to parse selector '{}': {:?}",
218                            selector_str.as_ref(),
219                            err
220                        )
221                    }
222                }
223            }
224        }
225
226        let has_css_selectors = !selectors_vec.is_empty();
227        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
228
229        if has_css_selectors && !has_xpath_selectors {
230            valid_selectors.insert(key, selectors_vec);
231        } else if !has_css_selectors && has_xpath_selectors {
232            valid_selectors_xpath.insert(key, selectors_vec_xpath);
233        } else {
234            if has_css_selectors {
235                valid_selectors.insert(key.clone(), selectors_vec);
236            }
237            if has_xpath_selectors {
238                valid_selectors_xpath.insert(key, selectors_vec_xpath);
239            }
240        }
241    }
242
243    DocumentSelectors {
244        css: valid_selectors,
245        xpath: valid_selectors_xpath,
246    }
247}
248
249/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
250#[cfg(not(feature = "indexset"))]
251pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
252where
253    K: AsRef<str> + Eq + Hash + Clone + Debug,
254    V: AsRef<str> + Debug + AsRef<str>,
255{
256    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
257}
258
/// Builds validated selectors from a map of key to an `indexmap::IndexSet` of selector strings.
///
/// Thin wrapper over `build_selectors_base`, compiled when the "indexset" feature is on.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug + AsRef<str>,
{
    // The remaining type parameters are inferred from the argument type.
    build_selectors_base(selectors)
}
268
269#[cfg(not(feature = "indexset"))]
270pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
271#[cfg(feature = "indexset")]
272pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
273#[cfg(not(feature = "indexset"))]
274pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
275#[cfg(feature = "indexset")]
276pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
277
/// The async query returns the text of the single element matched by `.list`.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    // Only `.list` matches; the `<ul>` text is the text of its single `<li>`.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
291
/// The sync query returns the text of the single element matched by `.list`.
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    // Only `.list` matches, and its text content is the bare "Test" text node.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
302
/// Matches from several selectors registered under one key all land under that key.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    // The order in which the selectors are evaluated depends on the set type, so compare
    // the values after sorting.
    let mut values = data.get("list").cloned().unwrap_or_default();
    values.sort();

    assert_eq!(values, ["First", "Second"], "CSS extraction failed");
}
320
/// XPath expressions are routed to the XPath evaluator and their string value is recorded.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    // The `sub-list` expression selects nothing, converts to an empty string and is
    // skipped, so only the `list` match is recorded.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "Xpath extraction failed",
    );
}
337
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input yields no extracted content.
    #[test]
    fn test_css_query_empty_html() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
        let data = css_query_select_map("", &build_selectors(map));
        assert!(data.is_empty());
    }

    /// A selector that matches nothing leaves the result map empty.
    #[test]
    fn test_css_query_no_matches() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
        let data = css_query_select_map(
            r#"<html><body><p class="other">Hello</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(data.is_empty());
    }

    /// An unparsable selector is rejected without panicking.
    #[test]
    fn test_build_selectors_invalid_css() {
        let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
        let selectors = build_selectors(map);
        // Invalid CSS should be rejected (not panic)
        assert!(selectors.css.is_empty());
    }

    /// A key with one CSS selector and one XPath expression is registered in both maps.
    #[test]
    fn test_build_selectors_mixed_css_xpath() {
        let map = QueryCSSMap::from([(
            "mixed",
            QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
        )]);
        let selectors = build_selectors(map);

        assert!(
            selectors.css.contains_key("mixed"),
            "CSS selector should be kept under the key",
        );
        assert!(
            selectors.xpath.contains_key("mixed"),
            "XPath selector should be kept under the key",
        );
    }

    /// Text containing HTML entities is still extracted.
    #[test]
    fn test_css_query_special_characters() {
        let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
        let data = css_query_select_map(
            r#"<html><body><p>Hello &amp; "world" &lt;test&gt;</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(!data.is_empty());
        let values = data.get("content").unwrap();
        assert!(!values.is_empty());
    }

    /// Text from nested elements is included in the extracted text.
    #[test]
    fn test_clean_element_text_basic() {
        let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
        let selector = Selector::parse("p").unwrap();
        // Fail loudly if nothing matches rather than skipping the assertions.
        let element = html
            .select(&selector)
            .next()
            .expect("fragment should contain a <p> element");

        let text = clean_element_text(&element);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// `img` elements are rendered as `[src]("alt")`.
    #[test]
    fn test_process_selector_img_element() {
        let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
        let selector = Selector::parse("img").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("fragment should contain an <img> element");
        process_selector::<&str>(element, &"image", &mut map);

        assert!(map.contains_key("image"));
        let vals = &map["image"];
        assert_eq!(vals.len(), 1);
        // Should contain src in brackets and alt in parens
        assert_eq!(vals[0], r#"[photo.jpg]("A photo")"#);
    }

    /// `meta` elements contribute their `content` attribute.
    #[test]
    fn test_process_selector_meta_element() {
        let html = Html::parse_document(
            r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
        );
        let selector = Selector::parse("meta[name='description']").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("document should contain the description <meta> element");
        process_selector::<&str>(element, &"desc", &mut map);

        assert!(map.contains_key("desc"));
        assert_eq!(map["desc"][0], "Test description");
    }
}