1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
/// Selectors grouped under caller-chosen keys, split by selector language.
///
/// A key may appear in `css`, in `xpath`, or in both maps.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
    /// Parsed CSS selectors for each key.
    pub css: HashMap<K, Vec<Selector>>,
    /// XPath expressions for each key, kept as source strings and evaluated
    /// at query time.
    pub xpath: HashMap<K, Vec<String>>,
}
18
19#[cfg(feature = "transformations")]
20pub use spider_transformations;
21
/// Extraction result: maps a selector key to the strings collected for it.
type CSSQueryMap = HashMap<String, Vec<String>>;
24
// Shared XPath factory, created lazily on first access and reused for every
// expression that needs to be compiled/validated.
lazy_static! {
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
28
29fn is_valid_xpath(expression: &str) -> bool {
31 match XPATH_FACTORY.build(expression) {
32 Ok(Some(_)) => true,
33 Ok(None) => false,
34 Err(_) => false,
35 }
36}
37
38pub async fn css_query_select_map_streamed<K>(
40 html: &str,
41 selectors: &DocumentSelectors<K>,
42) -> CSSQueryMap
43where
44 K: AsRef<str> + Eq + Hash + Sized,
45{
46 let mut map: CSSQueryMap = HashMap::new();
47
48 if !selectors.css.is_empty() {
49 let mut stream = tokio_stream::iter(&selectors.css);
50 let fragment = Box::new(Html::parse_document(html));
51
52 while let Some(selector) = stream.next().await {
53 for s in selector.1 {
54 for element in fragment.select(s) {
55 process_selector::<K>(element, selector.0, &mut map);
56 }
57 }
58 }
59 }
60
61 if !selectors.xpath.is_empty() {
62 if let Ok(package) = parser::parse(html) {
63 let document = Box::new(package.as_document());
64
65 for selector in selectors.xpath.iter() {
66 for s in selector.1 {
67 if let Ok(value) = evaluate_xpath(&document, s) {
68 let text = value.into_string();
69
70 if !text.is_empty() {
71 match map.entry(selector.0.as_ref().to_string()) {
72 Entry::Occupied(mut entry) => entry.get_mut().push(text),
73 Entry::Vacant(entry) => {
74 entry.insert(vec![text]);
75 }
76 }
77 }
78 };
79 }
80 }
81 };
82 }
83
84 for items in map.values_mut() {
85 items.dedup();
86 }
87
88 map
89}
90
91pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
93where
94 K: AsRef<str> + Eq + Hash + Sized,
95{
96 let mut map: CSSQueryMap = HashMap::new();
97
98 if !selectors.css.is_empty() {
99 let fragment = Box::new(Html::parse_document(html));
100
101 for selector in selectors.css.iter() {
102 for s in selector.1 {
103 for element in fragment.select(s) {
104 process_selector::<K>(element, selector.0, &mut map);
105 }
106 }
107 }
108 }
109
110 if !selectors.xpath.is_empty() {
111 if let Ok(package) = parser::parse(html) {
112 let document = package.as_document();
113
114 for selector in selectors.xpath.iter() {
115 for s in selector.1 {
116 if let Ok(value) = evaluate_xpath(&document, s) {
117 let text = value.into_string();
118
119 if !text.is_empty() {
120 match map.entry(selector.0.as_ref().to_string()) {
121 Entry::Occupied(mut entry) => entry.get_mut().push(text),
122 Entry::Vacant(entry) => {
123 entry.insert(vec![text]);
124 }
125 }
126 }
127 };
128 }
129 }
130 };
131 }
132
133 map
134}
135
136fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
138where
139 K: AsRef<str> + Eq + Hash + Sized,
140{
141 let name = name.as_ref();
142 let element_name = element.value().name();
143
144 let text = if element_name == "meta" {
145 element.attr("content").unwrap_or_default().into()
146 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
147 match element.attr(if element_name == "link" {
148 "href"
149 } else {
150 "src"
151 }) {
152 Some(href) => href.into(),
153 _ => clean_element_text(&element),
154 }
155 } else if element_name == "img" || element_name == "source" {
156 let mut img_text = String::new();
157
158 if let Some(src) = element.attr("src") {
159 if !src.is_empty() {
160 img_text.push('[');
161 img_text.push_str(src.trim());
162 img_text.push(']');
163 }
164 }
165 if let Some(alt) = element.attr("alt") {
166 if !alt.is_empty() {
167 if img_text.is_empty() {
168 img_text.push_str(alt);
169 } else {
170 img_text.push('(');
171 img_text.push('"');
172 img_text.push_str(alt);
173 img_text.push('"');
174 img_text.push(')');
175 }
176 }
177 }
178
179 img_text
180 } else {
181 clean_element_text(&element)
182 };
183
184 if !text.is_empty() {
185 match map.entry(name.to_string()) {
186 Entry::Occupied(mut entry) => entry.get_mut().push(text),
187 Entry::Vacant(entry) => {
188 entry.insert(vec![text]);
189 }
190 }
191 }
192}
193
194pub fn clean_element_text(element: &ElementRef) -> String {
196 element.text().collect::<Vec<_>>().join(" ")
197}
198
199pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
201where
202 K: AsRef<str> + Eq + Hash + Clone + Debug,
203 V: AsRef<str> + Debug + AsRef<str>,
204 S: IntoIterator<Item = V>,
205{
206 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
207 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
208
209 for (key, selector_set) in selectors {
210 let mut selectors_vec = Vec::new();
211 let mut selectors_vec_xpath = Vec::new();
212
213 for selector_str in selector_set {
214 match Selector::parse(selector_str.as_ref()) {
215 Ok(selector) => selectors_vec.push(selector),
216 Err(err) => {
217 if is_valid_xpath(selector_str.as_ref()) {
218 selectors_vec_xpath.push(selector_str.as_ref().to_string())
219 } else {
220 warn!(
221 "{}",
222 format!(
223 "Failed to parse selector '{}': {:?}",
224 selector_str.as_ref(),
225 err
226 ),
227 )
228 }
229 }
230 }
231 }
232
233 let has_css_selectors = !selectors_vec.is_empty();
234 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
235
236 if has_css_selectors && !has_xpath_selectors {
237 valid_selectors.insert(key, selectors_vec);
238 } else if !has_css_selectors && has_xpath_selectors {
239 valid_selectors_xpath.insert(key, selectors_vec_xpath);
240 } else {
241 if has_css_selectors {
242 valid_selectors.insert(key.clone(), selectors_vec);
243 }
244 if has_xpath_selectors {
245 valid_selectors_xpath.insert(key, selectors_vec_xpath);
246 }
247 }
248 }
249
250 DocumentSelectors {
251 css: valid_selectors,
252 xpath: valid_selectors_xpath,
253 }
254}
255
256#[cfg(not(feature = "indexset"))]
258pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
259where
260 K: AsRef<str> + Eq + Hash + Clone + Debug,
261 V: AsRef<str> + Debug + AsRef<str>,
262{
263 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
264}
265
/// Builds a [`DocumentSelectors`] from selector strings held in an
/// `indexmap::IndexSet` per key (used when the `indexset` feature is on).
///
/// Delegates to [`build_selectors_base`].
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug + AsRef<str>,
{
    build_selectors_base::<K, V, _>(selectors)
}
275
276#[cfg(not(feature = "indexset"))]
277pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
278#[cfg(feature = "indexset")]
279pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
280#[cfg(not(feature = "indexset"))]
281pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
282#[cfg(feature = "indexset")]
283pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
284
/// The streamed extractor returns the text of the element matched by `.list`;
/// the non-matching `.sub-list` selector contributes nothing.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    // Pin the extracted value instead of only checking for a non-empty map.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
298
/// The synchronous extractor returns the text of the element matched by
/// `.list`; the non-matching `.sub-list` selector contributes nothing.
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    // Pin the extracted value instead of only checking for a non-empty map.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "CSS extraction failed",
    );
}
309
/// Two selectors under the same key both contribute their matches to that
/// key's value list.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    // The selector set's iteration order is not fixed, so compare sorted.
    let mut values = data.get("list").cloned().unwrap_or_default();
    values.sort();

    assert_eq!(values, ["First", "Second"], "CSS extraction failed");
}
327
/// Selectors that are not valid CSS but are valid XPath are evaluated against
/// the document, and the string value of the match is returned.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    // Only the first expression matches; the second yields an empty string
    // and is dropped.
    assert_eq!(
        data.get("list"),
        Some(&vec!["Test".to_string()]),
        "Xpath extraction failed",
    );
}