1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10#[derive(Default, Debug, Clone)]
12pub struct DocumentSelectors<K> {
13 pub css: HashMap<K, Vec<Selector>>,
15 pub xpath: HashMap<K, Vec<String>>,
17}
18
19type CSSQueryMap = HashMap<String, Vec<String>>;
21
22lazy_static! {
23 static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
25}
26
27fn is_valid_xpath(expression: &str) -> bool {
29 match XPATH_FACTORY.build(expression) {
30 Ok(Some(_)) => true,
31 Ok(None) => false,
32 Err(_) => false,
33 }
34}
35
36pub async fn css_query_select_map_streamed<K>(
38 html: &str,
39 selectors: &DocumentSelectors<K>,
40) -> CSSQueryMap
41where
42 K: AsRef<str> + Eq + Hash + Sized,
43{
44 let mut map: CSSQueryMap = HashMap::new();
45
46 if !selectors.css.is_empty() {
47 let mut stream = tokio_stream::iter(&selectors.css);
48 let fragment = Box::new(Html::parse_document(html));
49
50 while let Some(selector) = stream.next().await {
51 for s in selector.1 {
52 for element in fragment.select(s) {
53 process_selector::<K>(element, selector.0, &mut map);
54 }
55 }
56 }
57 }
58
59 if !selectors.xpath.is_empty() {
60 if let Ok(package) = parser::parse(html) {
61 let document = Box::new(package.as_document());
62
63 for selector in selectors.xpath.iter() {
64 for s in selector.1 {
65 if let Ok(value) = evaluate_xpath(&document, s) {
66 let text = value.into_string();
67
68 if !text.is_empty() {
69 match map.entry(selector.0.as_ref().to_string()) {
70 Entry::Occupied(mut entry) => entry.get_mut().push(text),
71 Entry::Vacant(entry) => {
72 entry.insert(vec![text]);
73 }
74 }
75 }
76 };
77 }
78 }
79 };
80 }
81
82 for items in map.values_mut() {
83 items.dedup();
84 }
85
86 map
87}
88
89pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
91where
92 K: AsRef<str> + Eq + Hash + Sized,
93{
94 let mut map: CSSQueryMap = HashMap::new();
95
96 if !selectors.css.is_empty() {
97 let fragment = Box::new(Html::parse_document(html));
98
99 for selector in selectors.css.iter() {
100 for s in selector.1 {
101 for element in fragment.select(s) {
102 process_selector::<K>(element, selector.0, &mut map);
103 }
104 }
105 }
106 }
107
108 if !selectors.xpath.is_empty() {
109 if let Ok(package) = parser::parse(html) {
110 let document = package.as_document();
111
112 for selector in selectors.xpath.iter() {
113 for s in selector.1 {
114 if let Ok(value) = evaluate_xpath(&document, s) {
115 let text = value.into_string();
116
117 if !text.is_empty() {
118 match map.entry(selector.0.as_ref().to_string()) {
119 Entry::Occupied(mut entry) => entry.get_mut().push(text),
120 Entry::Vacant(entry) => {
121 entry.insert(vec![text]);
122 }
123 }
124 }
125 };
126 }
127 }
128 };
129 }
130
131 map
132}
133
134fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
136where
137 K: AsRef<str> + Eq + Hash + Sized,
138{
139 let name = name.as_ref();
140 let element_name = element.value().name();
141
142 let text = if element_name == "meta" {
143 element.attr("content").unwrap_or_default().into()
144 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
145 match element.attr(if element_name == "link" {
146 "href"
147 } else {
148 "src"
149 }) {
150 Some(href) => href.into(),
151 _ => clean_element_text(&element),
152 }
153 } else if element_name == "img" || element_name == "source" {
154 let mut img_text = String::new();
155
156 if let Some(src) = element.attr("src") {
157 if !src.is_empty() {
158 img_text.push('[');
159 img_text.push_str(src.trim());
160 img_text.push(']');
161 }
162 }
163 if let Some(alt) = element.attr("alt") {
164 if !alt.is_empty() {
165 if img_text.is_empty() {
166 img_text.push_str(alt);
167 } else {
168 img_text.push('(');
169 img_text.push('"');
170 img_text.push_str(alt);
171 img_text.push('"');
172 img_text.push(')');
173 }
174 }
175 }
176
177 img_text
178 } else {
179 clean_element_text(&element)
180 };
181
182 if !text.is_empty() {
183 match map.entry(name.to_string()) {
184 Entry::Occupied(mut entry) => entry.get_mut().push(text),
185 Entry::Vacant(entry) => {
186 entry.insert(vec![text]);
187 }
188 }
189 }
190}
191
192pub fn clean_element_text(element: &ElementRef) -> String {
194 element.text().collect::<Vec<_>>().join(" ")
195}
196
197pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
199where
200 K: AsRef<str> + Eq + Hash + Clone + Debug,
201 V: AsRef<str> + Debug + AsRef<str>,
202 S: IntoIterator<Item = V>,
203{
204 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
205 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
206
207 for (key, selector_set) in selectors {
208 let mut selectors_vec = Vec::new();
209 let mut selectors_vec_xpath = Vec::new();
210
211 for selector_str in selector_set {
212 match Selector::parse(selector_str.as_ref()) {
213 Ok(selector) => selectors_vec.push(selector),
214 Err(err) => {
215 if is_valid_xpath(selector_str.as_ref()) {
216 selectors_vec_xpath.push(selector_str.as_ref().to_string())
217 } else {
218 warn!(
219 "Failed to parse selector '{}': {:?}",
220 selector_str.as_ref(),
221 err
222 )
223 }
224 }
225 }
226 }
227
228 let has_css_selectors = !selectors_vec.is_empty();
229 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
230
231 if has_css_selectors && !has_xpath_selectors {
232 valid_selectors.insert(key, selectors_vec);
233 } else if !has_css_selectors && has_xpath_selectors {
234 valid_selectors_xpath.insert(key, selectors_vec_xpath);
235 } else {
236 if has_css_selectors {
237 valid_selectors.insert(key.clone(), selectors_vec);
238 }
239 if has_xpath_selectors {
240 valid_selectors_xpath.insert(key, selectors_vec_xpath);
241 }
242 }
243 }
244
245 DocumentSelectors {
246 css: valid_selectors,
247 xpath: valid_selectors_xpath,
248 }
249}
250
251#[cfg(not(feature = "indexset"))]
253pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
254where
255 K: AsRef<str> + Eq + Hash + Clone + Debug,
256 V: AsRef<str> + Debug + AsRef<str>,
257{
258 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
259}
260
/// Compiles a map of `indexmap::IndexSet` selector strings by delegating to
/// [`build_selectors_base`]. Built when the `indexset` feature is enabled.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug + AsRef<str>,
{
    // The set type and its item type are inferred from the argument.
    build_selectors_base(selectors)
}
270
271#[cfg(not(feature = "indexset"))]
272pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
273#[cfg(feature = "indexset")]
274pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
275#[cfg(not(feature = "indexset"))]
276pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
277#[cfg(feature = "indexset")]
278pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
279
/// The streamed query extracts the text of the single `.list` element.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches, and its sole text node is "Test".
    assert_eq!(data["list"], ["Test"], "unexpected CSS extraction result");
}
293
/// The synchronous query extracts the text of the single `.list` element.
#[cfg(test)]
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches, and its sole text node is "Test".
    assert_eq!(data["list"], ["Test"], "unexpected CSS extraction result");
}
304
/// Results of several selectors registered under one key are merged into that key.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");

    // Selector iteration order depends on the set type, so only membership is checked.
    let values = &data["list"];
    assert_eq!(values.len(), 2, "both selectors should contribute a value");
    assert!(values.iter().any(|value| value == "First"));
    assert!(values.iter().any(|value| value == "Second"));
}
322
/// XPath expressions are routed to the XPath evaluator and their string value is collected.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    assert!(!data.is_empty(), "Xpath extraction failed");
    // The second expression matches nothing, so its empty result is skipped.
    assert_eq!(data["list"], ["Test"], "unexpected Xpath extraction result");
}
339
340#[cfg(test)]
341mod tests {
342 use super::*;
343
344 #[test]
345 fn test_css_query_empty_html() {
346 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".item"]))]);
347 let data = css_query_select_map("", &build_selectors(map));
348 assert!(data.is_empty());
349 }
350
351 #[test]
352 fn test_css_query_no_matches() {
353 let map = QueryCSSMap::from([("item", QueryCSSSelectSet::from([".nonexistent"]))]);
354 let data = css_query_select_map(
355 r#"<html><body><p class="other">Hello</p></body></html>"#,
356 &build_selectors(map),
357 );
358 assert!(data.is_empty());
359 }
360
361 #[test]
362 fn test_build_selectors_invalid_css() {
363 let map = QueryCSSMap::from([("bad", QueryCSSSelectSet::from(["[[[invalid"]))]);
364 let selectors = build_selectors(map);
365 assert!(selectors.css.is_empty());
367 }
368
369 #[test]
370 fn test_build_selectors_mixed_css_xpath() {
371 let map = QueryCSSMap::from([(
372 "mixed",
373 QueryCSSSelectSet::from([".valid-css", "//*[@class='xpath']"]),
374 )]);
375 let selectors = build_selectors(map);
376 let has_css = selectors.css.contains_key("mixed");
378 let has_xpath = selectors.xpath.contains_key("mixed");
379 assert!(has_css || has_xpath);
380 }
381
382 #[test]
383 fn test_css_query_special_characters() {
384 let map = QueryCSSMap::from([("content", QueryCSSSelectSet::from(["p"]))]);
385 let data = css_query_select_map(
386 r#"<html><body><p>Hello & "world" <test></p></body></html>"#,
387 &build_selectors(map),
388 );
389 assert!(!data.is_empty());
390 let values = data.get("content").unwrap();
391 assert!(!values.is_empty());
392 }
393
394 #[test]
395 fn test_clean_element_text_basic() {
396 let html = Html::parse_fragment("<p>Hello <b>World</b></p>");
397 let selector = Selector::parse("p").unwrap();
398 if let Some(element) = html.select(&selector).next() {
399 let text = clean_element_text(&element);
400 assert!(text.contains("Hello"));
401 assert!(text.contains("World"));
402 }
403 }
404
405 #[test]
406 fn test_process_selector_img_element() {
407 let html = Html::parse_fragment(r#"<img src="photo.jpg" alt="A photo">"#);
408 let selector = Selector::parse("img").unwrap();
409 let mut map: HashMap<String, Vec<String>> = HashMap::new();
410
411 if let Some(element) = html.select(&selector).next() {
412 process_selector::<&str>(element, &"image", &mut map);
413 }
414 assert!(map.contains_key("image"));
415 let vals = &map["image"];
416 assert!(!vals.is_empty());
417 assert!(vals[0].contains("photo.jpg"));
419 }
420
421 #[test]
422 fn test_process_selector_meta_element() {
423 let html = Html::parse_document(
424 r#"<html><head><meta name="description" content="Test description"></head><body></body></html>"#,
425 );
426 let selector = Selector::parse("meta[name='description']").unwrap();
427 let mut map: HashMap<String, Vec<String>> = HashMap::new();
428
429 if let Some(element) = html.select(&selector).next() {
430 process_selector::<&str>(element, &"desc", &mut map);
431 }
432 assert!(map.contains_key("desc"));
433 assert_eq!(map["desc"][0], "Test description");
434 }
435}