1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8
9#[derive(Default, Debug, Clone)]
11pub struct DocumentSelectors<K> {
12 pub css: HashMap<K, Vec<Selector>>,
14 pub xpath: HashMap<K, Vec<String>>,
16}
17
18type CSSQueryMap = HashMap<String, Vec<String>>;
20
21lazy_static! {
22 static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
24}
25
26fn is_valid_xpath(expression: &str) -> bool {
28 match XPATH_FACTORY.build(expression) {
29 Ok(Some(_)) => true,
30 Ok(None) => false,
31 Err(_) => false,
32 }
33}
34
35pub async fn css_query_select_map_streamed<K>(
37 html: &str,
38 selectors: &DocumentSelectors<K>,
39) -> CSSQueryMap
40where
41 K: AsRef<str> + Eq + Hash + Sized,
42{
43 let mut map: CSSQueryMap = HashMap::with_capacity(selectors.css.len() + selectors.xpath.len());
44
45 if !selectors.css.is_empty() {
46 let fragment = Box::new(Html::parse_document(html));
47
48 for selector in &selectors.css {
49 for s in selector.1 {
50 for element in fragment.select(s) {
51 process_selector::<K>(element, selector.0, &mut map);
52 }
53 }
54 }
55 }
56
57 if !selectors.xpath.is_empty() {
58 if let Ok(package) = parser::parse(html) {
59 let document = Box::new(package.as_document());
60
61 for selector in selectors.xpath.iter() {
62 for s in selector.1 {
63 if let Ok(value) = evaluate_xpath(&document, s) {
64 let text = value.into_string();
65
66 if !text.is_empty() {
67 match map.entry(selector.0.as_ref().to_string()) {
68 Entry::Occupied(mut entry) => entry.get_mut().push(text),
69 Entry::Vacant(entry) => {
70 entry.insert(vec![text]);
71 }
72 }
73 }
74 };
75 }
76 }
77 };
78 }
79
80 for items in map.values_mut() {
81 items.dedup();
82 }
83
84 map
85}
86
87pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
89where
90 K: AsRef<str> + Eq + Hash + Sized,
91{
92 let mut map: CSSQueryMap = HashMap::with_capacity(selectors.css.len() + selectors.xpath.len());
93
94 if !selectors.css.is_empty() {
95 let fragment = Box::new(Html::parse_document(html));
96
97 for selector in selectors.css.iter() {
98 for s in selector.1 {
99 for element in fragment.select(s) {
100 process_selector::<K>(element, selector.0, &mut map);
101 }
102 }
103 }
104 }
105
106 if !selectors.xpath.is_empty() {
107 if let Ok(package) = parser::parse(html) {
108 let document = package.as_document();
109
110 for selector in selectors.xpath.iter() {
111 for s in selector.1 {
112 if let Ok(value) = evaluate_xpath(&document, s) {
113 let text = value.into_string();
114
115 if !text.is_empty() {
116 match map.entry(selector.0.as_ref().to_string()) {
117 Entry::Occupied(mut entry) => entry.get_mut().push(text),
118 Entry::Vacant(entry) => {
119 entry.insert(vec![text]);
120 }
121 }
122 }
123 };
124 }
125 }
126 };
127 }
128
129 map
130}
131
132fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
134where
135 K: AsRef<str> + Eq + Hash + Sized,
136{
137 let name = name.as_ref();
138 let element_name = element.value().name();
139
140 let text = if element_name == "meta" {
141 element.attr("content").unwrap_or_default().into()
142 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
143 match element.attr(if element_name == "link" {
144 "href"
145 } else {
146 "src"
147 }) {
148 Some(href) => href.into(),
149 _ => clean_element_text(&element),
150 }
151 } else if element_name == "img" || element_name == "source" {
152 let mut img_text = String::new();
153
154 if let Some(src) = element.attr("src") {
155 if !src.is_empty() {
156 img_text.push('[');
157 img_text.push_str(src.trim());
158 img_text.push(']');
159 }
160 }
161 if let Some(alt) = element.attr("alt") {
162 if !alt.is_empty() {
163 if img_text.is_empty() {
164 img_text.push_str(alt);
165 } else {
166 img_text.push('(');
167 img_text.push('"');
168 img_text.push_str(alt);
169 img_text.push('"');
170 img_text.push(')');
171 }
172 }
173 }
174
175 img_text
176 } else {
177 clean_element_text(&element)
178 };
179
180 if !text.is_empty() {
181 match map.entry(name.to_string()) {
182 Entry::Occupied(mut entry) => entry.get_mut().push(text),
183 Entry::Vacant(entry) => {
184 entry.insert(vec![text]);
185 }
186 }
187 }
188}
189
190pub fn clean_element_text(element: &ElementRef) -> String {
192 element.text().collect::<Vec<_>>().join(" ")
193}
194
195pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
197where
198 K: AsRef<str> + Eq + Hash + Clone + Debug,
199 V: AsRef<str> + Debug + AsRef<str>,
200 S: IntoIterator<Item = V>,
201{
202 let cap = selectors.len();
203 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::with_capacity(cap);
204 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::with_capacity(cap);
205
206 for (key, selector_set) in selectors {
207 let iter = selector_set.into_iter();
208 let (size_hint, _) = iter.size_hint();
209 let mut selectors_vec = Vec::with_capacity(size_hint);
210 let mut selectors_vec_xpath = Vec::new();
211
212 for selector_str in iter {
213 match Selector::parse(selector_str.as_ref()) {
214 Ok(selector) => selectors_vec.push(selector),
215 Err(err) => {
216 if is_valid_xpath(selector_str.as_ref()) {
217 selectors_vec_xpath.push(selector_str.as_ref().to_string())
218 } else {
219 warn!(
220 "Failed to parse selector '{}': {:?}",
221 selector_str.as_ref(),
222 err
223 )
224 }
225 }
226 }
227 }
228
229 let has_css_selectors = !selectors_vec.is_empty();
230 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
231
232 if has_css_selectors && !has_xpath_selectors {
233 valid_selectors.insert(key, selectors_vec);
234 } else if !has_css_selectors && has_xpath_selectors {
235 valid_selectors_xpath.insert(key, selectors_vec_xpath);
236 } else {
237 if has_css_selectors {
238 valid_selectors.insert(key.clone(), selectors_vec);
239 }
240 if has_xpath_selectors {
241 valid_selectors_xpath.insert(key, selectors_vec_xpath);
242 }
243 }
244 }
245
246 DocumentSelectors {
247 css: valid_selectors,
248 xpath: valid_selectors_xpath,
249 }
250}
251
252#[cfg(not(feature = "indexset"))]
254pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
255where
256 K: AsRef<str> + Eq + Hash + Clone + Debug,
257 V: AsRef<str> + Debug + AsRef<str>,
258{
259 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
260}
261
/// Builds [`DocumentSelectors`] from a map of key to an `indexmap::IndexSet` of
/// selector strings by delegating to [`build_selectors_base`].
///
/// Compiled when the `indexset` feature is enabled.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, _>(selectors)
}
271
272#[cfg(not(feature = "indexset"))]
273pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
274#[cfg(feature = "indexset")]
275pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
276#[cfg(not(feature = "indexset"))]
277pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
278#[cfg(feature = "indexset")]
279pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
280
/// The async extractor yields a non-empty map when a CSS selector matches.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
294
/// The synchronous extractor yields a non-empty map when a CSS selector matches.
#[test]
fn test_css_query_select_map() {
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map(html, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
305
/// Two selectors registered under one key must both contribute their matches
/// to that key's value list.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");

    // Selector iteration order is not fixed by the set type, so compare sorted.
    let mut values = data
        .get("list")
        .cloned()
        .expect("values for key 'list' should be present");
    values.sort();
    assert_eq!(values, ["First", "Second"]);
}
323
/// XPath expressions registered under a key yield a non-empty result map.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let selectors = build_selectors(QueryCSSMap::from([("list", expressions)]));

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}
340
341#[cfg(test)]
342mod tests {
343 use super::*;
344
345 #[test]
346 fn test_css_query_empty_html() {
347 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
348 let data = css_query_select_map("", &build_selectors(map));
349 assert!(data.is_empty());
350 }
351
352 #[test]
353 fn test_css_query_no_matches() {
354 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
355 let data = css_query_select_map(
356 r#"<html><body><p class="other">Hello</p></body></html>"#,
357 &build_selectors(map),
358 );
359 assert!(data.is_empty());
360 }
361
362 #[test]
363 fn test_build_selectors_invalid_css() {
364 let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
365 let selectors = build_selectors(map);
366 assert!(selectors.css.is_empty());
368 }
369
370 #[test]
371 fn test_build_selectors_mixed_css_xpath() {
372 let map = QueryCSSMap::from([(
373 "mixed",
374 QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
375 )]);
376 let selectors = build_selectors(map);
377 let has_css = selectors.css.contains_key("mixed");
379 let has_xpath = selectors.xpath.contains_key("mixed");
380 assert!(has_css || has_xpath);
381 }
382
383 #[test]
384 fn test_css_query_special_characters() {
385 let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
386 let data = css_query_select_map(
387 r#"<html><body><p>Hello & "world" <test></p></body></html>"#,
388 &build_selectors(map),
389 );
390 assert!(!data.is_empty());
391 let values = data.get("content").unwrap();
392 assert!(!values.is_empty());
393 }
394
395 #[test]
396 fn test_clean_element_text_basic() {
397 let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
398 let selector = Selector::parse("p").unwrap();
399 if let Some(element) = html.select(&selector).next() {
400 let text = clean_element_text(&element);
401 assert!(text.contains("Hello"));
402 assert!(text.contains("World"));
403 }
404 }
405
406 #[test]
407 fn test_process_selector_img_element() {
408 let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
409 let selector = Selector::parse("img").unwrap();
410 let mut map: HashMap<String, Vec<String>> = HashMap::new();
411
412 if let Some(element) = html.select(&selector).next() {
413 process_selector::<&str>(element, &"image", &mut map);
414 }
415 assert!(map.contains_key("image"));
416 let vals = &map["image"];
417 assert!(!vals.is_empty());
418 assert!(vals[0].contains("photo.jpg"));
420 }
421
422 #[test]
423 fn test_process_selector_meta_element() {
424 let html = Html::parse_document(
425 r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
426 );
427 let selector = Selector::parse("meta[name='description']").unwrap();
428 let mut map: HashMap<String, Vec<String>> = HashMap::new();
429
430 if let Some(element) = html.select(&selector).next() {
431 process_selector::<&str>(element, &"desc", &mut map);
432 }
433 assert!(map.contains_key("desc"));
434 assert_eq!(map["desc"][0], "Test description");
435 }
436}