Skip to main content

spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
/// The selectors that can be used to query a document, grouped by output key.
///
/// Each key of type `K` maps to the list of selectors registered under it.
/// CSS and XPath selectors live in separate maps, so one key may be present
/// in both maps at the same time.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
    /// Parsed CSS selectors, keyed by the caller-supplied name.
    pub css: HashMap<K, Vec<Selector>>,
    /// XPath expressions (kept as plain strings), keyed by the caller-supplied name.
    pub xpath: HashMap<K, Vec<String>>,
}
18
/// Extracted content keyed by selector name: each key maps to the list of
/// non-empty strings collected for it by the CSS and XPath queries.
type CSSQueryMap = HashMap<String, Vec<String>>;
21
lazy_static! {
    /// Shared XPath factory, used to build XPath expressions when checking
    /// whether a selector string is valid XPath.
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
26
27/// Check if a selector is a valid xpath
28fn is_valid_xpath(expression: &str) -> bool {
29    match XPATH_FACTORY.build(expression) {
30        Ok(Some(_)) => true,
31        Ok(None) => false,
32        Err(_) => false,
33    }
34}
35
36/// Async stream CSS query selector map.
37pub async fn css_query_select_map_streamed<K>(
38    html: &str,
39    selectors: &DocumentSelectors<K>,
40) -> CSSQueryMap
41where
42    K: AsRef<str> + Eq + Hash + Sized,
43{
44    let mut map: CSSQueryMap = HashMap::new();
45
46    if !selectors.css.is_empty() {
47        let mut stream = tokio_stream::iter(&selectors.css);
48        let fragment = Box::new(Html::parse_document(html));
49
50        while let Some(selector) = stream.next().await {
51            for s in selector.1 {
52                for element in fragment.select(s) {
53                    process_selector::<K>(element, selector.0, &mut map);
54                }
55            }
56        }
57    }
58
59    if !selectors.xpath.is_empty() {
60        if let Ok(package) = parser::parse(html) {
61            let document = Box::new(package.as_document());
62
63            for selector in selectors.xpath.iter() {
64                for s in selector.1 {
65                    if let Ok(value) = evaluate_xpath(&document, s) {
66                        let text = value.into_string();
67
68                        if !text.is_empty() {
69                            match map.entry(selector.0.as_ref().to_string()) {
70                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
71                                Entry::Vacant(entry) => {
72                                    entry.insert(vec![text]);
73                                }
74                            }
75                        }
76                    };
77                }
78            }
79        };
80    }
81
82    for items in map.values_mut() {
83        items.dedup();
84    }
85
86    map
87}
88
89/// Sync CSS query selector map.
90pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
91where
92    K: AsRef<str> + Eq + Hash + Sized,
93{
94    let mut map: CSSQueryMap = HashMap::new();
95
96    if !selectors.css.is_empty() {
97        let fragment = Box::new(Html::parse_document(html));
98
99        for selector in selectors.css.iter() {
100            for s in selector.1 {
101                for element in fragment.select(s) {
102                    process_selector::<K>(element, selector.0, &mut map);
103                }
104            }
105        }
106    }
107
108    if !selectors.xpath.is_empty() {
109        if let Ok(package) = parser::parse(html) {
110            let document = package.as_document();
111
112            for selector in selectors.xpath.iter() {
113                for s in selector.1 {
114                    if let Ok(value) = evaluate_xpath(&document, s) {
115                        let text = value.into_string();
116
117                        if !text.is_empty() {
118                            match map.entry(selector.0.as_ref().to_string()) {
119                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
120                                Entry::Vacant(entry) => {
121                                    entry.insert(vec![text]);
122                                }
123                            }
124                        }
125                    };
126                }
127            }
128        };
129    }
130
131    map
132}
133
134/// Process a single element and update the map with the results.
135fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
136where
137    K: AsRef<str> + Eq + Hash + Sized,
138{
139    let name = name.as_ref();
140    let element_name = element.value().name();
141
142    let text = if element_name == "meta" {
143        element.attr("content").unwrap_or_default().into()
144    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
145        match element.attr(if element_name == "link" {
146            "href"
147        } else {
148            "src"
149        }) {
150            Some(href) => href.into(),
151            _ => clean_element_text(&element),
152        }
153    } else if element_name == "img" || element_name == "source" {
154        let mut img_text = String::new();
155
156        if let Some(src) = element.attr("src") {
157            if !src.is_empty() {
158                img_text.push('[');
159                img_text.push_str(src.trim());
160                img_text.push(']');
161            }
162        }
163        if let Some(alt) = element.attr("alt") {
164            if !alt.is_empty() {
165                if img_text.is_empty() {
166                    img_text.push_str(alt);
167                } else {
168                    img_text.push('(');
169                    img_text.push('"');
170                    img_text.push_str(alt);
171                    img_text.push('"');
172                    img_text.push(')');
173                }
174            }
175        }
176
177        img_text
178    } else {
179        clean_element_text(&element)
180    };
181
182    if !text.is_empty() {
183        match map.entry(name.to_string()) {
184            Entry::Occupied(mut entry) => entry.get_mut().push(text),
185            Entry::Vacant(entry) => {
186                entry.insert(vec![text]);
187            }
188        }
189    }
190}
191
192/// get the text extracted.
193pub fn clean_element_text(element: &ElementRef) -> String {
194    element.text().collect::<Vec<_>>().join(" ")
195}
196
197/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
198pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
199where
200    K: AsRef<str> + Eq + Hash + Clone + Debug,
201    V: AsRef<str> + Debug + AsRef<str>,
202    S: IntoIterator<Item = V>,
203{
204    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
205    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
206
207    for (key, selector_set) in selectors {
208        let mut selectors_vec = Vec::new();
209        let mut selectors_vec_xpath = Vec::new();
210
211        for selector_str in selector_set {
212            match Selector::parse(selector_str.as_ref()) {
213                Ok(selector) => selectors_vec.push(selector),
214                Err(err) => {
215                    if is_valid_xpath(selector_str.as_ref()) {
216                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
217                    } else {
218                        warn!(
219                            "Failed to parse selector '{}': {:?}",
220                            selector_str.as_ref(),
221                            err
222                        )
223                    }
224                }
225            }
226        }
227
228        let has_css_selectors = !selectors_vec.is_empty();
229        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
230
231        if has_css_selectors && !has_xpath_selectors {
232            valid_selectors.insert(key, selectors_vec);
233        } else if !has_css_selectors && has_xpath_selectors {
234            valid_selectors_xpath.insert(key, selectors_vec_xpath);
235        } else {
236            if has_css_selectors {
237                valid_selectors.insert(key.clone(), selectors_vec);
238            }
239            if has_xpath_selectors {
240                valid_selectors_xpath.insert(key, selectors_vec_xpath);
241            }
242        }
243    }
244
245    DocumentSelectors {
246        css: valid_selectors,
247        xpath: valid_selectors_xpath,
248    }
249}
250
251/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
252#[cfg(not(feature = "indexset"))]
253pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
254where
255    K: AsRef<str> + Eq + Hash + Clone + Debug,
256    V: AsRef<str> + Debug + AsRef<str>,
257{
258    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
259}
260
/// Build the selectors used for extraction. Each map key is the output key and
/// its value is the set of selector strings (CSS or XPath) registered for it.
///
/// This variant is compiled when the `indexset` feature is enabled and takes
/// `indexmap::IndexSet` values; the work is delegated to `build_selectors_base`.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug + AsRef<str>,
{
    build_selectors_base(selectors)
}
270
271#[cfg(not(feature = "indexset"))]
272pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
273#[cfg(feature = "indexset")]
274pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
275#[cfg(not(feature = "indexset"))]
276pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
277#[cfg(feature = "indexset")]
278pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
279
/// The streamed query extracts something for a matching CSS class selector.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
293
/// The sync query extracts something for a matching CSS class selector.
#[test]
fn test_css_query_select_map() {
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;

    let extracted = css_query_select_map(html, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
304
/// The streamed query extracts content when two selectors under the same key
/// each match a different element.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let queries = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(queries);

    let extracted = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &selectors,
    )
    .await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
322
/// The streamed query extracts something when the selectors are XPath
/// expressions rather than CSS selectors.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let queries = QueryCSSMap::from([("list", expressions)]);
    let selectors = build_selectors(queries);
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}
339
/// Unit tests for selector building and element text extraction.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input HTML yields an empty result map.
    #[test]
    fn test_css_query_empty_html() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
        let data = css_query_select_map("", &build_selectors(map));
        assert!(data.is_empty());
    }

    /// A selector that matches nothing yields an empty result map.
    #[test]
    fn test_css_query_no_matches() {
        let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
        let data = css_query_select_map(
            r#"<html><body><p class="other">Hello</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(data.is_empty());
    }

    /// A string that is not valid CSS is rejected instead of panicking.
    #[test]
    fn test_build_selectors_invalid_css() {
        let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
        let selectors = build_selectors(map);
        // Invalid CSS should be rejected (not panic)
        assert!(selectors.css.is_empty());
    }

    /// A key holding both a CSS selector and an XPath expression is registered
    /// in both selector maps.
    #[test]
    fn test_build_selectors_mixed_css_xpath() {
        let map = QueryCSSMap::from([(
            "mixed",
            QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
        )]);
        let selectors = build_selectors(map);
        // The CSS string parses as CSS; the XPath string fails CSS parsing and
        // is kept as XPath, so the key must be present in both maps.
        assert!(selectors.css.contains_key("mixed"));
        assert!(selectors.xpath.contains_key("mixed"));
    }

    /// Text containing HTML entities is still extracted.
    #[test]
    fn test_css_query_special_characters() {
        let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
        let data = css_query_select_map(
            r#"<html><body><p>Hello &amp; "world" &lt;test&gt;</p></body></html>"#,
            &build_selectors(map),
        );
        assert!(!data.is_empty());
        let values = data.get("content").unwrap();
        assert!(!values.is_empty());
    }

    /// Text of nested elements is included in the extracted text.
    #[test]
    fn test_clean_element_text_basic() {
        let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
        let selector = Selector::parse("p").unwrap();
        // Fail loudly if the fixture stops matching, rather than skipping the
        // assertions below.
        let element = html
            .select(&selector)
            .next()
            .expect("fixture contains a <p> element");
        let text = clean_element_text(&element);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// An `img` element is rendered as `[src]("alt")`.
    #[test]
    fn test_process_selector_img_element() {
        let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
        let selector = Selector::parse("img").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("fixture contains an <img> element");
        process_selector::<&str>(element, &"image", &mut map);

        assert!(map.contains_key("image"));
        let vals = &map["image"];
        assert!(!vals.is_empty());
        // Should contain src in brackets and alt in parens
        assert!(vals[0].contains("photo.jpg"));
        assert_eq!(vals[0], r#"[photo.jpg]("A photo")"#);
    }

    /// A `meta` element contributes the value of its `content` attribute.
    #[test]
    fn test_process_selector_meta_element() {
        let html = Html::parse_document(
            r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
        );
        let selector = Selector::parse("meta[name='description']").unwrap();
        let mut map: HashMap<String, Vec<String>> = HashMap::new();

        let element = html
            .select(&selector)
            .next()
            .expect("fixture contains a description <meta> element");
        process_selector::<&str>(element, &"desc", &mut map);

        assert!(map.contains_key("desc"));
        assert_eq!(map["desc"][0], "Test description");
    }
}
435}