1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10#[derive(Default, Debug, Clone)]
12pub struct DocumentSelectors<K> {
13 pub css: HashMap<K, Vec<Selector>>,
15 pub xpath: HashMap<K, Vec<String>>,
17}
18
19#[cfg(feature = "transformations")]
20pub use spider_transformations;
21
22type CSSQueryMap = HashMap<String, Vec<String>>;
24
25lazy_static! {
26 static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
28}
29
30fn is_valid_xpath(expression: &str) -> bool {
32 match XPATH_FACTORY.build(expression) {
33 Ok(Some(_)) => true,
34 Ok(None) => false,
35 Err(_) => false,
36 }
37}
38
39pub async fn css_query_select_map_streamed<K>(
41 html: &str,
42 selectors: &DocumentSelectors<K>,
43) -> CSSQueryMap
44where
45 K: AsRef<str> + Eq + Hash + Sized,
46{
47 let mut map: CSSQueryMap = HashMap::new();
48
49 if !selectors.css.is_empty() {
50 let mut stream = tokio_stream::iter(&selectors.css);
51 let fragment = Box::new(Html::parse_document(html));
52
53 while let Some(selector) = stream.next().await {
54 for s in selector.1 {
55 for element in fragment.select(s) {
56 process_selector::<K>(element, selector.0, &mut map);
57 }
58 }
59 }
60 }
61
62 if !selectors.xpath.is_empty() {
63 if let Ok(package) = parser::parse(html) {
64 let document = Box::new(package.as_document());
65
66 for selector in selectors.xpath.iter() {
67 for s in selector.1 {
68 if let Ok(value) = evaluate_xpath(&document, s) {
69 let text = value.into_string();
70
71 if !text.is_empty() {
72 match map.entry(selector.0.as_ref().to_string()) {
73 Entry::Occupied(mut entry) => entry.get_mut().push(text),
74 Entry::Vacant(entry) => {
75 entry.insert(vec![text]);
76 }
77 }
78 }
79 };
80 }
81 }
82 };
83 }
84
85 for items in map.values_mut() {
86 items.dedup();
87 }
88
89 map
90}
91
92pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
94where
95 K: AsRef<str> + Eq + Hash + Sized,
96{
97 let mut map: CSSQueryMap = HashMap::new();
98
99 if !selectors.css.is_empty() {
100 let fragment = Box::new(Html::parse_document(html));
101
102 for selector in selectors.css.iter() {
103 for s in selector.1 {
104 for element in fragment.select(s) {
105 process_selector::<K>(element, selector.0, &mut map);
106 }
107 }
108 }
109 }
110
111 if !selectors.xpath.is_empty() {
112 if let Ok(package) = parser::parse(html) {
113 let document = package.as_document();
114
115 for selector in selectors.xpath.iter() {
116 for s in selector.1 {
117 if let Ok(value) = evaluate_xpath(&document, s) {
118 let text = value.into_string();
119
120 if !text.is_empty() {
121 match map.entry(selector.0.as_ref().to_string()) {
122 Entry::Occupied(mut entry) => entry.get_mut().push(text),
123 Entry::Vacant(entry) => {
124 entry.insert(vec![text]);
125 }
126 }
127 }
128 };
129 }
130 }
131 };
132 }
133
134 map
135}
136
137fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
139where
140 K: AsRef<str> + Eq + Hash + Sized,
141{
142 let name = name.as_ref();
143 let element_name = element.value().name();
144
145 let text = if element_name == "meta" {
146 element.attr("content").unwrap_or_default().into()
147 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
148 match element.attr(if element_name == "link" {
149 "href"
150 } else {
151 "src"
152 }) {
153 Some(href) => href.into(),
154 _ => clean_element_text(&element),
155 }
156 } else if element_name == "img" || element_name == "source" {
157 let mut img_text = String::new();
158
159 if let Some(src) = element.attr("src") {
160 if !src.is_empty() {
161 img_text.push('[');
162 img_text.push_str(src.trim());
163 img_text.push(']');
164 }
165 }
166 if let Some(alt) = element.attr("alt") {
167 if !alt.is_empty() {
168 if img_text.is_empty() {
169 img_text.push_str(alt);
170 } else {
171 img_text.push('(');
172 img_text.push('"');
173 img_text.push_str(alt);
174 img_text.push('"');
175 img_text.push(')');
176 }
177 }
178 }
179
180 img_text
181 } else {
182 clean_element_text(&element)
183 };
184
185 if !text.is_empty() {
186 match map.entry(name.to_string()) {
187 Entry::Occupied(mut entry) => entry.get_mut().push(text),
188 Entry::Vacant(entry) => {
189 entry.insert(vec![text]);
190 }
191 }
192 }
193}
194
195pub fn clean_element_text(element: &ElementRef) -> String {
197 element.text().collect::<Vec<_>>().join(" ")
198}
199
200pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
202where
203 K: AsRef<str> + Eq + Hash + Clone + Debug,
204 V: AsRef<str> + Debug + AsRef<str>,
205 S: IntoIterator<Item = V>,
206{
207 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
208 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
209
210 for (key, selector_set) in selectors {
211 let mut selectors_vec = Vec::new();
212 let mut selectors_vec_xpath = Vec::new();
213
214 for selector_str in selector_set {
215 match Selector::parse(selector_str.as_ref()) {
216 Ok(selector) => selectors_vec.push(selector),
217 Err(err) => {
218 if is_valid_xpath(selector_str.as_ref()) {
219 selectors_vec_xpath.push(selector_str.as_ref().to_string())
220 } else {
221 warn!(
222 "{}",
223 format!(
224 "Failed to parse selector '{}': {:?}",
225 selector_str.as_ref(),
226 err
227 ),
228 )
229 }
230 }
231 }
232 }
233
234 let has_css_selectors = !selectors_vec.is_empty();
235 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
236
237 if has_css_selectors && !has_xpath_selectors {
238 valid_selectors.insert(key, selectors_vec);
239 } else if !has_css_selectors && has_xpath_selectors {
240 valid_selectors_xpath.insert(key, selectors_vec_xpath);
241 } else {
242 if has_css_selectors {
243 valid_selectors.insert(key.clone(), selectors_vec);
244 }
245 if has_xpath_selectors {
246 valid_selectors_xpath.insert(key, selectors_vec_xpath);
247 }
248 }
249 }
250
251 DocumentSelectors {
252 css: valid_selectors,
253 xpath: valid_selectors_xpath,
254 }
255}
256
257#[cfg(not(feature = "indexset"))]
259pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
260where
261 K: AsRef<str> + Eq + Hash + Clone + Debug,
262 V: AsRef<str> + Debug + AsRef<str>,
263{
264 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
265}
266
/// Build [`DocumentSelectors`] from a map of keys to `indexmap::IndexSet`s of selector strings.
///
/// Compiled when the `indexset` feature is enabled. Delegates to [`build_selectors_base`].
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, _>(selectors)
}
276
277#[cfg(not(feature = "indexset"))]
278pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
279#[cfg(feature = "indexset")]
280pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
281#[cfg(not(feature = "indexset"))]
282pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
283#[cfg(feature = "indexset")]
284pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
285
/// The streamed query returns the text of the single element matched by `.list`.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches; `.sub-list` contributes nothing.
    assert_eq!(
        data.get("list"),
        Some(&vec![String::from("Test")]),
        "unexpected values extracted for `list`"
    );
}
299
/// The synchronous query returns the text of the single element matched by `.list`.
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    assert!(!data.is_empty(), "CSS extraction failed");
    // Only `.list` matches; `.sub-list` contributes nothing.
    assert_eq!(
        data.get("list"),
        Some(&vec![String::from("Test")]),
        "unexpected values extracted for `list`"
    );
}
310
/// Matches from several selectors registered under one key are collected under that key.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    assert!(!data.is_empty(), "CSS extraction failed");

    // The selector set may iterate in any order, so compare the values after sorting.
    let mut values = data.get("list").cloned().unwrap_or_default();
    values.sort();
    assert_eq!(
        values,
        ["First", "Second"],
        "both selectors should contribute to `list`"
    );
}
328
/// XPath expressions are routed to the XPath evaluator and yield the matched node's text.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    assert!(!data.is_empty(), "Xpath extraction failed");
    // Only the first expression matches; the second evaluates to an empty string and is dropped.
    assert_eq!(
        data.get("list"),
        Some(&vec![String::from("Test")]),
        "unexpected values extracted for `list`"
    );
}