spider_utils/
lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10/// The type of selectors that can be used to query.
11#[derive(Default, Debug, Clone)]
12pub struct DocumentSelectors<K> {
13    /// CSS Selectors.
14    pub css: HashMap<K, Vec<Selector>>,
15    /// XPath Selectors.
16    pub xpath: HashMap<K, Vec<String>>,
17}
18
19/// Extracted content from CSS query selectors.
20type CSSQueryMap = HashMap<String, Vec<String>>;
21
22lazy_static! {
23    /// Xpath factory.
24    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
25}
26
27/// Check if a selector is a valid xpath
28fn is_valid_xpath(expression: &str) -> bool {
29    match XPATH_FACTORY.build(expression) {
30        Ok(Some(_)) => true,
31        Ok(None) => false,
32        Err(_) => false,
33    }
34}
35
36/// Async stream CSS query selector map.
37pub async fn css_query_select_map_streamed<K>(
38    html: &str,
39    selectors: &DocumentSelectors<K>,
40) -> CSSQueryMap
41where
42    K: AsRef<str> + Eq + Hash + Sized,
43{
44    let mut map: CSSQueryMap = HashMap::new();
45
46    if !selectors.css.is_empty() {
47        let mut stream = tokio_stream::iter(&selectors.css);
48        let fragment = Box::new(Html::parse_document(html));
49
50        while let Some(selector) = stream.next().await {
51            for s in selector.1 {
52                for element in fragment.select(s) {
53                    process_selector::<K>(element, selector.0, &mut map);
54                }
55            }
56        }
57    }
58
59    if !selectors.xpath.is_empty() {
60        if let Ok(package) = parser::parse(html) {
61            let document = Box::new(package.as_document());
62
63            for selector in selectors.xpath.iter() {
64                for s in selector.1 {
65                    if let Ok(value) = evaluate_xpath(&document, s) {
66                        let text = value.into_string();
67
68                        if !text.is_empty() {
69                            match map.entry(selector.0.as_ref().to_string()) {
70                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
71                                Entry::Vacant(entry) => {
72                                    entry.insert(vec![text]);
73                                }
74                            }
75                        }
76                    };
77                }
78            }
79        };
80    }
81
82    for items in map.values_mut() {
83        items.dedup();
84    }
85
86    map
87}
88
89/// Sync CSS query selector map.
90pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
91where
92    K: AsRef<str> + Eq + Hash + Sized,
93{
94    let mut map: CSSQueryMap = HashMap::new();
95
96    if !selectors.css.is_empty() {
97        let fragment = Box::new(Html::parse_document(html));
98
99        for selector in selectors.css.iter() {
100            for s in selector.1 {
101                for element in fragment.select(s) {
102                    process_selector::<K>(element, selector.0, &mut map);
103                }
104            }
105        }
106    }
107
108    if !selectors.xpath.is_empty() {
109        if let Ok(package) = parser::parse(html) {
110            let document = package.as_document();
111
112            for selector in selectors.xpath.iter() {
113                for s in selector.1 {
114                    if let Ok(value) = evaluate_xpath(&document, s) {
115                        let text = value.into_string();
116
117                        if !text.is_empty() {
118                            match map.entry(selector.0.as_ref().to_string()) {
119                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
120                                Entry::Vacant(entry) => {
121                                    entry.insert(vec![text]);
122                                }
123                            }
124                        }
125                    };
126                }
127            }
128        };
129    }
130
131    map
132}
133
134/// Process a single element and update the map with the results.
135fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
136where
137    K: AsRef<str> + Eq + Hash + Sized,
138{
139    let name = name.as_ref();
140    let element_name = element.value().name();
141
142    let text = if element_name == "meta" {
143        element.attr("content").unwrap_or_default().into()
144    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
145        match element.attr(if element_name == "link" {
146            "href"
147        } else {
148            "src"
149        }) {
150            Some(href) => href.into(),
151            _ => clean_element_text(&element),
152        }
153    } else if element_name == "img" || element_name == "source" {
154        let mut img_text = String::new();
155
156        if let Some(src) = element.attr("src") {
157            if !src.is_empty() {
158                img_text.push('[');
159                img_text.push_str(src.trim());
160                img_text.push(']');
161            }
162        }
163        if let Some(alt) = element.attr("alt") {
164            if !alt.is_empty() {
165                if img_text.is_empty() {
166                    img_text.push_str(alt);
167                } else {
168                    img_text.push('(');
169                    img_text.push('"');
170                    img_text.push_str(alt);
171                    img_text.push('"');
172                    img_text.push(')');
173                }
174            }
175        }
176
177        img_text
178    } else {
179        clean_element_text(&element)
180    };
181
182    if !text.is_empty() {
183        match map.entry(name.to_string()) {
184            Entry::Occupied(mut entry) => entry.get_mut().push(text),
185            Entry::Vacant(entry) => {
186                entry.insert(vec![text]);
187            }
188        }
189    }
190}
191
192/// get the text extracted.
193pub fn clean_element_text(element: &ElementRef) -> String {
194    element.text().collect::<Vec<_>>().join(" ")
195}
196
197/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
198pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
199where
200    K: AsRef<str> + Eq + Hash + Clone + Debug,
201    V: AsRef<str> + Debug + AsRef<str>,
202    S: IntoIterator<Item = V>,
203{
204    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
205    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
206
207    for (key, selector_set) in selectors {
208        let mut selectors_vec = Vec::new();
209        let mut selectors_vec_xpath = Vec::new();
210
211        for selector_str in selector_set {
212            match Selector::parse(selector_str.as_ref()) {
213                Ok(selector) => selectors_vec.push(selector),
214                Err(err) => {
215                    if is_valid_xpath(selector_str.as_ref()) {
216                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
217                    } else {
218                        warn!(
219                            "{}",
220                            format!(
221                                "Failed to parse selector '{}': {:?}",
222                                selector_str.as_ref(),
223                                err
224                            ),
225                        )
226                    }
227                }
228            }
229        }
230
231        let has_css_selectors = !selectors_vec.is_empty();
232        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
233
234        if has_css_selectors && !has_xpath_selectors {
235            valid_selectors.insert(key, selectors_vec);
236        } else if !has_css_selectors && has_xpath_selectors {
237            valid_selectors_xpath.insert(key, selectors_vec_xpath);
238        } else {
239            if has_css_selectors {
240                valid_selectors.insert(key.clone(), selectors_vec);
241            }
242            if has_xpath_selectors {
243                valid_selectors_xpath.insert(key, selectors_vec_xpath);
244            }
245        }
246    }
247
248    DocumentSelectors {
249        css: valid_selectors,
250        xpath: valid_selectors_xpath,
251    }
252}
253
254/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
255#[cfg(not(feature = "indexset"))]
256pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
257where
258    K: AsRef<str> + Eq + Hash + Clone + Debug,
259    V: AsRef<str> + Debug + AsRef<str>,
260{
261    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
262}
263
/// Build the CSS and XPath selectors used for extraction.
///
/// Each entry of `selectors` maps an output key to an `indexmap::IndexSet` of selector
/// strings. The work is delegated to `build_selectors_base`. This variant is compiled
/// when the `indexset` feature is enabled.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, indexmap::IndexSet<V>>(selectors)
}
273
274#[cfg(not(feature = "indexset"))]
275pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
276#[cfg(feature = "indexset")]
277pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
278#[cfg(not(feature = "indexset"))]
279pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
280#[cfg(feature = "indexset")]
281pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
282
// The streamed extractor returns a non-empty map when a CSS class selector matches.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
296
// The sync extractor returns a non-empty map when a CSS class selector matches.
#[test]
fn test_css_query_select_map() {
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map(html, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
307
// The streamed extractor returns a non-empty map when two selectors under one key
// each match a different element.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let html = r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
325
// The streamed extractor returns a non-empty map when the selectors are XPath expressions.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let query = QueryCSSMap::from([("list", expressions)]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}