spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
/// The selectors that can be used to query a document, grouped by output key.
///
/// A key may appear in `css`, in `xpath`, or in both.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
    /// Parsed CSS selectors for each key.
    pub css: HashMap<K, Vec<Selector>>,
    /// XPath expressions for each key, stored as strings.
    pub xpath: HashMap<K, Vec<String>>,
}
18
/// Re-export of the `spider_transformations` crate, compiled in with the `transformations` feature.
#[cfg(feature = "transformations")]
pub use spider_transformations;
21
/// Extracted content keyed by selector name; each key holds the values collected for it
/// (from CSS selectors and from XPath expressions).
type CSSQueryMap = HashMap<String, Vec<String>>;
24
lazy_static! {
    /// Shared XPath factory, used to test whether a selector string builds as an XPath expression.
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
29
30/// Check if a selector is a valid xpath
31fn is_valid_xpath(expression: &str) -> bool {
32    match XPATH_FACTORY.build(expression) {
33        Ok(Some(_)) => true,
34        Ok(None) => false,
35        Err(_) => false,
36    }
37}
38
39/// Async stream CSS query selector map.
40pub async fn css_query_select_map_streamed<K>(
41    html: &str,
42    selectors: &DocumentSelectors<K>,
43) -> CSSQueryMap
44where
45    K: AsRef<str> + Eq + Hash + Sized,
46{
47    let mut map: CSSQueryMap = HashMap::new();
48
49    if !selectors.css.is_empty() {
50        let mut stream = tokio_stream::iter(&selectors.css);
51        let fragment = Box::new(Html::parse_document(html));
52
53        while let Some(selector) = stream.next().await {
54            for s in selector.1 {
55                for element in fragment.select(s) {
56                    process_selector::<K>(element, selector.0, &mut map);
57                }
58            }
59        }
60    }
61
62    if !selectors.xpath.is_empty() {
63        if let Ok(package) = parser::parse(html) {
64            let document = Box::new(package.as_document());
65
66            for selector in selectors.xpath.iter() {
67                for s in selector.1 {
68                    if let Ok(value) = evaluate_xpath(&document, s) {
69                        let text = value.into_string();
70
71                        if !text.is_empty() {
72                            match map.entry(selector.0.as_ref().to_string()) {
73                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
74                                Entry::Vacant(entry) => {
75                                    entry.insert(vec![text]);
76                                }
77                            }
78                        }
79                    };
80                }
81            }
82        };
83    }
84
85    for items in map.values_mut() {
86        items.dedup();
87    }
88
89    map
90}
91
92/// Sync CSS query selector map.
93pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
94where
95    K: AsRef<str> + Eq + Hash + Sized,
96{
97    let mut map: CSSQueryMap = HashMap::new();
98
99    if !selectors.css.is_empty() {
100        let fragment = Box::new(Html::parse_document(html));
101
102        for selector in selectors.css.iter() {
103            for s in selector.1 {
104                for element in fragment.select(s) {
105                    process_selector::<K>(element, selector.0, &mut map);
106                }
107            }
108        }
109    }
110
111    if !selectors.xpath.is_empty() {
112        if let Ok(package) = parser::parse(html) {
113            let document = package.as_document();
114
115            for selector in selectors.xpath.iter() {
116                for s in selector.1 {
117                    if let Ok(value) = evaluate_xpath(&document, s) {
118                        let text = value.into_string();
119
120                        if !text.is_empty() {
121                            match map.entry(selector.0.as_ref().to_string()) {
122                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
123                                Entry::Vacant(entry) => {
124                                    entry.insert(vec![text]);
125                                }
126                            }
127                        }
128                    };
129                }
130            }
131        };
132    }
133
134    map
135}
136
137/// Process a single element and update the map with the results.
138fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
139where
140    K: AsRef<str> + Eq + Hash + Sized,
141{
142    let name = name.as_ref();
143    let element_name = element.value().name();
144
145    let text = if element_name == "meta" {
146        element.attr("content").unwrap_or_default().into()
147    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
148        match element.attr(if element_name == "link" {
149            "href"
150        } else {
151            "src"
152        }) {
153            Some(href) => href.into(),
154            _ => clean_element_text(&element),
155        }
156    } else if element_name == "img" || element_name == "source" {
157        let mut img_text = String::new();
158
159        if let Some(src) = element.attr("src") {
160            if !src.is_empty() {
161                img_text.push('[');
162                img_text.push_str(src.trim());
163                img_text.push(']');
164            }
165        }
166        if let Some(alt) = element.attr("alt") {
167            if !alt.is_empty() {
168                if img_text.is_empty() {
169                    img_text.push_str(alt);
170                } else {
171                    img_text.push('(');
172                    img_text.push('"');
173                    img_text.push_str(alt);
174                    img_text.push('"');
175                    img_text.push(')');
176                }
177            }
178        }
179
180        img_text
181    } else {
182        clean_element_text(&element)
183    };
184
185    if !text.is_empty() {
186        match map.entry(name.to_string()) {
187            Entry::Occupied(mut entry) => entry.get_mut().push(text),
188            Entry::Vacant(entry) => {
189                entry.insert(vec![text]);
190            }
191        }
192    }
193}
194
195/// get the text extracted.
196pub fn clean_element_text(element: &ElementRef) -> String {
197    element.text().collect::<Vec<_>>().join(" ")
198}
199
200/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
201pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
202where
203    K: AsRef<str> + Eq + Hash + Clone + Debug,
204    V: AsRef<str> + Debug + AsRef<str>,
205    S: IntoIterator<Item = V>,
206{
207    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
208    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
209
210    for (key, selector_set) in selectors {
211        let mut selectors_vec = Vec::new();
212        let mut selectors_vec_xpath = Vec::new();
213
214        for selector_str in selector_set {
215            match Selector::parse(selector_str.as_ref()) {
216                Ok(selector) => selectors_vec.push(selector),
217                Err(err) => {
218                    if is_valid_xpath(selector_str.as_ref()) {
219                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
220                    } else {
221                        warn!(
222                            "{}",
223                            format!(
224                                "Failed to parse selector '{}': {:?}",
225                                selector_str.as_ref(),
226                                err
227                            ),
228                        )
229                    }
230                }
231            }
232        }
233
234        let has_css_selectors = !selectors_vec.is_empty();
235        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
236
237        if has_css_selectors && !has_xpath_selectors {
238            valid_selectors.insert(key, selectors_vec);
239        } else if !has_css_selectors && has_xpath_selectors {
240            valid_selectors_xpath.insert(key, selectors_vec_xpath);
241        } else {
242            if has_css_selectors {
243                valid_selectors.insert(key.clone(), selectors_vec);
244            }
245            if has_xpath_selectors {
246                valid_selectors_xpath.insert(key, selectors_vec_xpath);
247            }
248        }
249    }
250
251    DocumentSelectors {
252        css: valid_selectors,
253        xpath: valid_selectors_xpath,
254    }
255}
256
257/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
258#[cfg(not(feature = "indexset"))]
259pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
260where
261    K: AsRef<str> + Eq + Hash + Clone + Debug,
262    V: AsRef<str> + Debug + AsRef<str>,
263{
264    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
265}
266
/// Build the CSS and XPath selectors for extraction from a map of selector sets.
///
/// The map key is the output key and the value is the set of selector strings for it. This is
/// the variant compiled when the `indexset` feature is enabled, so the sets are
/// `indexmap::IndexSet`s. The work is delegated to [`build_selectors_base`].
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base(selectors)
}
276
277#[cfg(not(feature = "indexset"))]
278pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
279#[cfg(feature = "indexset")]
280pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
281#[cfg(not(feature = "indexset"))]
282pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
283#[cfg(feature = "indexset")]
284pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
285
// The streamed query yields data when one of the CSS selectors matches the markup.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let selectors = build_selectors(QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from([".list", ".sub-list"]),
    )]));

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
299
// The sync query yields data when one of the CSS selectors matches the markup.
#[test]
fn test_css_query_select_map() {
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let selectors = build_selectors(QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from([".list", ".sub-list"]),
    )]));

    let extracted = css_query_select_map(html, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
310
// The streamed query yields data when both CSS selectors of a key match different elements.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let html = r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#;
    let selectors = build_selectors(QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from([".list", ".sub-list"]),
    )]));

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
328
// The streamed query yields data when the selectors are XPath expressions rather than CSS.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let selectors = build_selectors(QueryCSSMap::from([("list", expressions)]));

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}