1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8
/// Selectors grouped by key and split by query language.
///
/// `build_selectors_base` fills this in: strings that parse as CSS selectors go to `css`,
/// strings that only validate as XPath go to `xpath`. The same key may appear in both maps.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
 /// Parsed CSS selectors for each key.
 pub css: HashMap<K, Vec<Selector>>,
 /// XPath expressions for each key, stored as source text and evaluated at query time.
 pub xpath: HashMap<K, Vec<String>>,
}
17
/// Extraction result: selector key (as an owned string) mapped to the values collected for it.
type CSSQueryMap = HashMap<String, Vec<String>>;
20
lazy_static! {
 /// Shared XPath factory; `is_valid_xpath` uses it to test-build candidate expressions.
 static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
25
26fn is_valid_xpath(expression: &str) -> bool {
28 match XPATH_FACTORY.build(expression) {
29 Ok(Some(_)) => true,
30 Ok(None) => false,
31 Err(_) => false,
32 }
33}
34
35pub async fn css_query_select_map_streamed<K>(
37 html: &str,
38 selectors: &DocumentSelectors<K>,
39) -> CSSQueryMap
40where
41 K: AsRef<str> + Eq + Hash + Sized,
42{
43 let mut map: CSSQueryMap = HashMap::new();
44
45 if !selectors.css.is_empty() {
46 let fragment = Box::new(Html::parse_document(html));
47
48 for selector in &selectors.css {
49 for s in selector.1 {
50 for element in fragment.select(s) {
51 process_selector::<K>(element, selector.0, &mut map);
52 }
53 }
54 }
55 }
56
57 if !selectors.xpath.is_empty() {
58 if let Ok(package) = parser::parse(html) {
59 let document = Box::new(package.as_document());
60
61 for selector in selectors.xpath.iter() {
62 for s in selector.1 {
63 if let Ok(value) = evaluate_xpath(&document, s) {
64 let text = value.into_string();
65
66 if !text.is_empty() {
67 match map.entry(selector.0.as_ref().to_string()) {
68 Entry::Occupied(mut entry) => entry.get_mut().push(text),
69 Entry::Vacant(entry) => {
70 entry.insert(vec![text]);
71 }
72 }
73 }
74 };
75 }
76 }
77 };
78 }
79
80 for items in map.values_mut() {
81 items.dedup();
82 }
83
84 map
85}
86
87pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
89where
90 K: AsRef<str> + Eq + Hash + Sized,
91{
92 let mut map: CSSQueryMap = HashMap::new();
93
94 if !selectors.css.is_empty() {
95 let fragment = Box::new(Html::parse_document(html));
96
97 for selector in selectors.css.iter() {
98 for s in selector.1 {
99 for element in fragment.select(s) {
100 process_selector::<K>(element, selector.0, &mut map);
101 }
102 }
103 }
104 }
105
106 if !selectors.xpath.is_empty() {
107 if let Ok(package) = parser::parse(html) {
108 let document = package.as_document();
109
110 for selector in selectors.xpath.iter() {
111 for s in selector.1 {
112 if let Ok(value) = evaluate_xpath(&document, s) {
113 let text = value.into_string();
114
115 if !text.is_empty() {
116 match map.entry(selector.0.as_ref().to_string()) {
117 Entry::Occupied(mut entry) => entry.get_mut().push(text),
118 Entry::Vacant(entry) => {
119 entry.insert(vec![text]);
120 }
121 }
122 }
123 };
124 }
125 }
126 };
127 }
128
129 map
130}
131
132fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
134where
135 K: AsRef<str> + Eq + Hash + Sized,
136{
137 let name = name.as_ref();
138 let element_name = element.value().name();
139
140 let text = if element_name == "meta" {
141 element.attr("content").unwrap_or_default().into()
142 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
143 match element.attr(if element_name == "link" {
144 "href"
145 } else {
146 "src"
147 }) {
148 Some(href) => href.into(),
149 _ => clean_element_text(&element),
150 }
151 } else if element_name == "img" || element_name == "source" {
152 let mut img_text = String::new();
153
154 if let Some(src) = element.attr("src") {
155 if !src.is_empty() {
156 img_text.push('[');
157 img_text.push_str(src.trim());
158 img_text.push(']');
159 }
160 }
161 if let Some(alt) = element.attr("alt") {
162 if !alt.is_empty() {
163 if img_text.is_empty() {
164 img_text.push_str(alt);
165 } else {
166 img_text.push('(');
167 img_text.push('"');
168 img_text.push_str(alt);
169 img_text.push('"');
170 img_text.push(')');
171 }
172 }
173 }
174
175 img_text
176 } else {
177 clean_element_text(&element)
178 };
179
180 if !text.is_empty() {
181 match map.entry(name.to_string()) {
182 Entry::Occupied(mut entry) => entry.get_mut().push(text),
183 Entry::Vacant(entry) => {
184 entry.insert(vec![text]);
185 }
186 }
187 }
188}
189
190pub fn clean_element_text(element: &ElementRef) -> String {
192 element.text().collect::<Vec<_>>().join(" ")
193}
194
195pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
197where
198 K: AsRef<str> + Eq + Hash + Clone + Debug,
199 V: AsRef<str> + Debug + AsRef<str>,
200 S: IntoIterator<Item = V>,
201{
202 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
203 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
204
205 for (key, selector_set) in selectors {
206 let mut selectors_vec = Vec::new();
207 let mut selectors_vec_xpath = Vec::new();
208
209 for selector_str in selector_set {
210 match Selector::parse(selector_str.as_ref()) {
211 Ok(selector) => selectors_vec.push(selector),
212 Err(err) => {
213 if is_valid_xpath(selector_str.as_ref()) {
214 selectors_vec_xpath.push(selector_str.as_ref().to_string())
215 } else {
216 warn!(
217 "Failed to parse selector '{}': {:?}",
218 selector_str.as_ref(),
219 err
220 )
221 }
222 }
223 }
224 }
225
226 let has_css_selectors = !selectors_vec.is_empty();
227 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
228
229 if has_css_selectors && !has_xpath_selectors {
230 valid_selectors.insert(key, selectors_vec);
231 } else if !has_css_selectors && has_xpath_selectors {
232 valid_selectors_xpath.insert(key, selectors_vec_xpath);
233 } else {
234 if has_css_selectors {
235 valid_selectors.insert(key.clone(), selectors_vec);
236 }
237 if has_xpath_selectors {
238 valid_selectors_xpath.insert(key, selectors_vec_xpath);
239 }
240 }
241 }
242
243 DocumentSelectors {
244 css: valid_selectors,
245 xpath: valid_selectors_xpath,
246 }
247}
248
/// Builds `DocumentSelectors` from a map of keys to `hashbrown::HashSet`s of selector strings.
///
/// Compiled only when the `indexset` feature is disabled. All work is forwarded to
/// `build_selectors_base`.
#[cfg(not(feature = "indexset"))]
pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
where
 K: AsRef<str> + Eq + Hash + Clone + Debug,
 V: AsRef<str> + Debug + AsRef<str>,
{
 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
}
258
/// Builds `DocumentSelectors` from a map of keys to `indexmap::IndexSet`s of selector strings.
///
/// Compiled only when the `indexset` feature is enabled. All work is forwarded to
/// `build_selectors_base`.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
 K: AsRef<str> + Eq + Hash + Clone + Debug,
 V: AsRef<str> + Debug + AsRef<str>,
{
 build_selectors_base::<K, V, indexmap::IndexSet<V>>(selectors)
}
268
269#[cfg(not(feature = "indexset"))]
270pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
271#[cfg(feature = "indexset")]
272pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
273#[cfg(not(feature = "indexset"))]
274pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
275#[cfg(feature = "indexset")]
276pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
277
/// The async extractor returns at least one entry when a class selector matches.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let compiled = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &compiled).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
291
/// The synchronous extractor returns at least one entry when a class selector matches.
#[test]
fn test_css_query_select_map() {
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let compiled = build_selectors(query);

    let extracted = css_query_select_map(html, &compiled);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
302
// Two class selectors share the key "list"; the document contains one list per selector.
// The test only checks that the async extractor returned something for this input.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
 let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
 let data = css_query_select_map_streamed(
 r#"<html>
 <body>
 <ul class="list"><li>First</li></ul>
 <ul class="sub-list"><li>Second</li></ul>
 </body>
 </html>"#,
 &build_selectors(map),
 )
 .await;

 assert!(!data.is_empty(), "CSS extraction failed");
}
320
/// XPath expressions given in place of CSS selectors still produce output from the async
/// extractor.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let selectors = build_selectors(QueryCSSMap::from([("list", expressions)]));

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}
337
338#[cfg(test)]
339mod tests {
340 use super::*;
341
342 #[test]
343 fn test_css_query_empty_html() {
344 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
345 let data = css_query_select_map("", &build_selectors(map));
346 assert!(data.is_empty());
347 }
348
349 #[test]
350 fn test_css_query_no_matches() {
351 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
352 let data = css_query_select_map(
353 r#"<html><body><p class="other">Hello</p></body></html>"#,
354 &build_selectors(map),
355 );
356 assert!(data.is_empty());
357 }
358
359 #[test]
360 fn test_build_selectors_invalid_css() {
361 let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
362 let selectors = build_selectors(map);
363 assert!(selectors.css.is_empty());
365 }
366
367 #[test]
368 fn test_build_selectors_mixed_css_xpath() {
369 let map = QueryCSSMap::from([(
370 "mixed",
371 QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
372 )]);
373 let selectors = build_selectors(map);
374 let has_css = selectors.css.contains_key("mixed");
376 let has_xpath = selectors.xpath.contains_key("mixed");
377 assert!(has_css || has_xpath);
378 }
379
380 #[test]
381 fn test_css_query_special_characters() {
382 let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
383 let data = css_query_select_map(
384 r#"<html><body><p>Hello & "world" <test></p></body></html>"#,
385 &build_selectors(map),
386 );
387 assert!(!data.is_empty());
388 let values = data.get("content").unwrap();
389 assert!(!values.is_empty());
390 }
391
392 #[test]
393 fn test_clean_element_text_basic() {
394 let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
395 let selector = Selector::parse("p").unwrap();
396 if let Some(element) = html.select(&selector).next() {
397 let text = clean_element_text(&element);
398 assert!(text.contains("Hello"));
399 assert!(text.contains("World"));
400 }
401 }
402
403 #[test]
404 fn test_process_selector_img_element() {
405 let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
406 let selector = Selector::parse("img").unwrap();
407 let mut map: HashMap<String, Vec<String>> = HashMap::new();
408
409 if let Some(element) = html.select(&selector).next() {
410 process_selector::<&str>(element, &"image", &mut map);
411 }
412 assert!(map.contains_key("image"));
413 let vals = &map["image"];
414 assert!(!vals.is_empty());
415 assert!(vals[0].contains("photo.jpg"));
417 }
418
419 #[test]
420 fn test_process_selector_meta_element() {
421 let html = Html::parse_document(
422 r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
423 );
424 let selector = Selector::parse("meta[name='description']").unwrap();
425 let mut map: HashMap<String, Vec<String>> = HashMap::new();
426
427 if let Some(element) = html.select(&selector).next() {
428 process_selector::<&str>(element, &"desc", &mut map);
429 }
430 assert!(map.contains_key("desc"));
431 assert_eq!(map["desc"][0], "Test description");
432 }
433}