// spider_utils/lib.rs

1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10/// The type of selectors that can be used to query.
11#[derive(Default, Debug, Clone)]
12pub struct DocumentSelectors<K> {
13    /// CSS Selectors.
14    pub css: HashMap<K, Vec<Selector>>,
15    /// XPath Selectors.
16    pub xpath: HashMap<K, Vec<String>>,
17}
18
19#[cfg(feature = "transformations")]
20pub use spider_transformations;
21
22/// Extracted content from CSS query selectors.
23type CSSQueryMap = HashMap<String, Vec<String>>;
24
25lazy_static! {
26    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
27}
28
29/// Check if a selector is a valid xpath
30fn is_valid_xpath(expression: &str) -> bool {
31    match XPATH_FACTORY.build(expression) {
32        Ok(Some(_)) => true,
33        Ok(None) => false,
34        Err(_) => false,
35    }
36}
37
38/// Async stream CSS query selector map.
39pub async fn css_query_select_map_streamed<K>(
40    html: &str,
41    selectors: &DocumentSelectors<K>,
42) -> CSSQueryMap
43where
44    K: AsRef<str> + Eq + Hash + Sized,
45{
46    let mut map: CSSQueryMap = HashMap::new();
47
48    if !selectors.css.is_empty() {
49        let mut stream = tokio_stream::iter(&selectors.css);
50        let fragment = Box::new(Html::parse_document(html));
51
52        while let Some(selector) = stream.next().await {
53            for s in selector.1 {
54                for element in fragment.select(s) {
55                    process_selector::<K>(element, selector.0, &mut map);
56                }
57            }
58        }
59    }
60
61    if !selectors.xpath.is_empty() {
62        if let Ok(package) = parser::parse(html) {
63            let document = Box::new(package.as_document());
64
65            for selector in selectors.xpath.iter() {
66                for s in selector.1 {
67                    if let Ok(value) = evaluate_xpath(&document, s) {
68                        let text = value.into_string();
69
70                        if !text.is_empty() {
71                            match map.entry(selector.0.as_ref().to_string()) {
72                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
73                                Entry::Vacant(entry) => {
74                                    entry.insert(vec![text]);
75                                }
76                            }
77                        }
78                    };
79                }
80            }
81        };
82    }
83
84    for items in map.values_mut() {
85        items.dedup();
86    }
87
88    map
89}
90
91/// Sync CSS query selector map.
92pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
93where
94    K: AsRef<str> + Eq + Hash + Sized,
95{
96    let mut map: CSSQueryMap = HashMap::new();
97
98    if !selectors.css.is_empty() {
99        let fragment = Box::new(Html::parse_document(html));
100
101        for selector in selectors.css.iter() {
102            for s in selector.1 {
103                for element in fragment.select(s) {
104                    process_selector::<K>(element, selector.0, &mut map);
105                }
106            }
107        }
108    }
109
110    if !selectors.xpath.is_empty() {
111        if let Ok(package) = parser::parse(html) {
112            let document = package.as_document();
113
114            for selector in selectors.xpath.iter() {
115                for s in selector.1 {
116                    if let Ok(value) = evaluate_xpath(&document, s) {
117                        let text = value.into_string();
118
119                        if !text.is_empty() {
120                            match map.entry(selector.0.as_ref().to_string()) {
121                                Entry::Occupied(mut entry) => entry.get_mut().push(text),
122                                Entry::Vacant(entry) => {
123                                    entry.insert(vec![text]);
124                                }
125                            }
126                        }
127                    };
128                }
129            }
130        };
131    }
132
133    map
134}
135
136/// Process a single element and update the map with the results.
137fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
138where
139    K: AsRef<str> + Eq + Hash + Sized,
140{
141    let name = name.as_ref();
142    let element_name = element.value().name();
143
144    let text = if element_name == "meta" {
145        element.attr("content").unwrap_or_default().into()
146    } else if element_name == "link" || element_name == "script" || element_name == "styles" {
147        match element.attr(if element_name == "link" {
148            "href"
149        } else {
150            "src"
151        }) {
152            Some(href) => href.into(),
153            _ => clean_element_text(&element),
154        }
155    } else if element_name == "img" || element_name == "source" {
156        let mut img_text = String::new();
157
158        if let Some(src) = element.attr("src") {
159            if !src.is_empty() {
160                img_text.push('[');
161                img_text.push_str(src.trim());
162                img_text.push(']');
163            }
164        }
165        if let Some(alt) = element.attr("alt") {
166            if !alt.is_empty() {
167                if img_text.is_empty() {
168                    img_text.push_str(alt);
169                } else {
170                    img_text.push('(');
171                    img_text.push('"');
172                    img_text.push_str(alt);
173                    img_text.push('"');
174                    img_text.push(')');
175                }
176            }
177        }
178
179        img_text
180    } else {
181        clean_element_text(&element)
182    };
183
184    if !text.is_empty() {
185        match map.entry(name.to_string()) {
186            Entry::Occupied(mut entry) => entry.get_mut().push(text),
187            Entry::Vacant(entry) => {
188                entry.insert(vec![text]);
189            }
190        }
191    }
192}
193
194/// get the text extracted.
195pub fn clean_element_text(element: &ElementRef) -> String {
196    element.text().collect::<Vec<_>>().join(" ")
197}
198
199/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
200pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
201where
202    K: AsRef<str> + Eq + Hash + Clone + Debug,
203    V: AsRef<str> + Debug + AsRef<str>,
204    S: IntoIterator<Item = V>,
205{
206    let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
207    let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
208
209    for (key, selector_set) in selectors {
210        let mut selectors_vec = Vec::new();
211        let mut selectors_vec_xpath = Vec::new();
212
213        for selector_str in selector_set {
214            match Selector::parse(selector_str.as_ref()) {
215                Ok(selector) => selectors_vec.push(selector),
216                Err(err) => {
217                    if is_valid_xpath(selector_str.as_ref()) {
218                        selectors_vec_xpath.push(selector_str.as_ref().to_string())
219                    } else {
220                        warn!(
221                            "{}",
222                            format!(
223                                "Failed to parse selector '{}': {:?}",
224                                selector_str.as_ref(),
225                                err
226                            ),
227                        )
228                    }
229                }
230            }
231        }
232
233        let has_css_selectors = !selectors_vec.is_empty();
234        let has_xpath_selectors = !selectors_vec_xpath.is_empty();
235
236        if has_css_selectors && !has_xpath_selectors {
237            valid_selectors.insert(key, selectors_vec);
238        } else if !has_css_selectors && has_xpath_selectors {
239            valid_selectors_xpath.insert(key, selectors_vec_xpath);
240        } else {
241            if has_css_selectors {
242                valid_selectors.insert(key.clone(), selectors_vec);
243            }
244            if has_xpath_selectors {
245                valid_selectors_xpath.insert(key, selectors_vec_xpath);
246            }
247        }
248    }
249
250    DocumentSelectors {
251        css: valid_selectors,
252        xpath: valid_selectors_xpath,
253    }
254}
255
256/// Build valid css selectors for extracting. The hashmap takes items with the key for the object key and the value is the css selector.
257#[cfg(not(feature = "indexset"))]
258pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
259where
260    K: AsRef<str> + Eq + Hash + Clone + Debug,
261    V: AsRef<str> + Debug + AsRef<str>,
262{
263    build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
264}
265
/// Build valid selectors for extracting. The hashmap takes items with the key for the object
/// key and the value is the set of CSS or XPath selector strings for that key.
///
/// Variant compiled when the `indexset` feature is enabled: the selector sets are
/// `indexmap::IndexSet`s. All work is delegated to `build_selectors_base`.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, indexmap::IndexSet<V>>(selectors)
}
275
276#[cfg(not(feature = "indexset"))]
277pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
278#[cfg(feature = "indexset")]
279pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
280#[cfg(not(feature = "indexset"))]
281pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
282#[cfg(feature = "indexset")]
283pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
284
/// The streamed query extracts the text of the single element matched by `.list`.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    // Pin the extracted value instead of only checking that something came back.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
298
/// The sync query extracts the text of the single element matched by `.list`.
#[cfg(test)]
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    // Pin the extracted value instead of only checking that something came back.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
309
/// Results of several selectors registered under one key are collected into the same list.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    // The iteration order of the selector set is not fixed, so compare sorted values.
    let mut items = data.get("list").cloned().unwrap_or_default();
    items.sort();

    assert_eq!(items, ["First", "Second"], "CSS extraction failed");
}
327
/// Strings that are not valid CSS but are valid XPath are evaluated as XPath expressions.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    // Only the first expression matches; the second yields an empty string and is dropped.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "Xpath extraction failed",
    );
}
343}