1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
10#[derive(Default, Debug, Clone)]
12#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
13pub struct DocumentSelectors<K> {
14 pub css: HashMap<K, Vec<Selector>>,
16 pub xpath: HashMap<K, Vec<String>>,
18}
19
20#[cfg(feature = "transformations")]
21pub use spider_transformations;
22
23type CSSQueryMap = HashMap<String, Vec<String>>;
25
26lazy_static! {
27 static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
29}
30
31fn is_valid_xpath(expression: &str) -> bool {
33 match XPATH_FACTORY.build(expression) {
34 Ok(Some(_)) => true,
35 Ok(None) => false,
36 Err(_) => false,
37 }
38}
39
40pub async fn css_query_select_map_streamed<K>(
42 html: &str,
43 selectors: &DocumentSelectors<K>,
44) -> CSSQueryMap
45where
46 K: AsRef<str> + Eq + Hash + Sized,
47{
48 let mut map: CSSQueryMap = HashMap::new();
49
50 if !selectors.css.is_empty() {
51 let mut stream = tokio_stream::iter(&selectors.css);
52 let fragment = Box::new(Html::parse_document(html));
53
54 while let Some(selector) = stream.next().await {
55 for s in selector.1 {
56 for element in fragment.select(s) {
57 process_selector::<K>(element, selector.0, &mut map);
58 }
59 }
60 }
61 }
62
63 if !selectors.xpath.is_empty() {
64 if let Ok(package) = parser::parse(html) {
65 let document = Box::new(package.as_document());
66
67 for selector in selectors.xpath.iter() {
68 for s in selector.1 {
69 if let Ok(value) = evaluate_xpath(&document, s) {
70 let text = value.into_string();
71
72 if !text.is_empty() {
73 match map.entry(selector.0.as_ref().to_string()) {
74 Entry::Occupied(mut entry) => entry.get_mut().push(text),
75 Entry::Vacant(entry) => {
76 entry.insert(vec![text]);
77 }
78 }
79 }
80 };
81 }
82 }
83 };
84 }
85
86 for items in map.values_mut() {
87 items.dedup();
88 }
89
90 map
91}
92
93pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
95where
96 K: AsRef<str> + Eq + Hash + Sized,
97{
98 let mut map: CSSQueryMap = HashMap::new();
99
100 if !selectors.css.is_empty() {
101 let fragment = Box::new(Html::parse_document(html));
102
103 for selector in selectors.css.iter() {
104 for s in selector.1 {
105 for element in fragment.select(s) {
106 process_selector::<K>(element, selector.0, &mut map);
107 }
108 }
109 }
110 }
111
112 if !selectors.xpath.is_empty() {
113 if let Ok(package) = parser::parse(html) {
114 let document = package.as_document();
115
116 for selector in selectors.xpath.iter() {
117 for s in selector.1 {
118 if let Ok(value) = evaluate_xpath(&document, s) {
119 let text = value.into_string();
120
121 if !text.is_empty() {
122 match map.entry(selector.0.as_ref().to_string()) {
123 Entry::Occupied(mut entry) => entry.get_mut().push(text),
124 Entry::Vacant(entry) => {
125 entry.insert(vec![text]);
126 }
127 }
128 }
129 };
130 }
131 }
132 };
133 }
134
135 map
136}
137
138fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
140where
141 K: AsRef<str> + Eq + Hash + Sized,
142{
143 let name = name.as_ref();
144 let element_name = element.value().name();
145
146 let text = if element_name == "meta" {
147 element.attr("content").unwrap_or_default().into()
148 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
149 match element.attr(if element_name == "link" {
150 "href"
151 } else {
152 "src"
153 }) {
154 Some(href) => href.into(),
155 _ => clean_element_text(&element),
156 }
157 } else if element_name == "img" || element_name == "source" {
158 let mut img_text = String::new();
159
160 if let Some(src) = element.attr("src") {
161 if !src.is_empty() {
162 img_text.push('[');
163 img_text.push_str(src.trim());
164 img_text.push(']');
165 }
166 }
167 if let Some(alt) = element.attr("alt") {
168 if !alt.is_empty() {
169 if img_text.is_empty() {
170 img_text.push_str(alt);
171 } else {
172 img_text.push('(');
173 img_text.push('"');
174 img_text.push_str(alt);
175 img_text.push('"');
176 img_text.push(')');
177 }
178 }
179 }
180
181 img_text
182 } else {
183 clean_element_text(&element)
184 };
185
186 if !text.is_empty() {
187 match map.entry(name.to_string()) {
188 Entry::Occupied(mut entry) => entry.get_mut().push(text),
189 Entry::Vacant(entry) => {
190 entry.insert(vec![text]);
191 }
192 }
193 }
194}
195
196pub fn clean_element_text(element: &ElementRef) -> String {
198 element.text().collect::<Vec<_>>().join(" ")
199}
200
201pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
203where
204 K: AsRef<str> + Eq + Hash + Clone + Debug,
205 V: AsRef<str> + Debug + AsRef<str>,
206 S: IntoIterator<Item = V>,
207{
208 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
209 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
210
211 for (key, selector_set) in selectors {
212 let mut selectors_vec = Vec::new();
213 let mut selectors_vec_xpath = Vec::new();
214
215 for selector_str in selector_set {
216 match Selector::parse(selector_str.as_ref()) {
217 Ok(selector) => selectors_vec.push(selector),
218 Err(err) => {
219 if is_valid_xpath(selector_str.as_ref()) {
220 selectors_vec_xpath.push(selector_str.as_ref().to_string())
221 } else {
222 warn!(
223 "{}",
224 format!(
225 "Failed to parse selector '{}': {:?}",
226 selector_str.as_ref(),
227 err
228 ),
229 )
230 }
231 }
232 }
233 }
234
235 let has_css_selectors = !selectors_vec.is_empty();
236 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
237
238 if has_css_selectors && !has_xpath_selectors {
239 valid_selectors.insert(key, selectors_vec);
240 } else if !has_css_selectors && has_xpath_selectors {
241 valid_selectors_xpath.insert(key, selectors_vec_xpath);
242 } else {
243 if has_css_selectors {
244 valid_selectors.insert(key.clone(), selectors_vec);
245 }
246 if has_xpath_selectors {
247 valid_selectors_xpath.insert(key, selectors_vec_xpath);
248 }
249 }
250 }
251
252 DocumentSelectors {
253 css: valid_selectors,
254 xpath: valid_selectors_xpath,
255 }
256}
257
258#[cfg(not(feature = "indexset"))]
260pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
261where
262 K: AsRef<str> + Eq + Hash + Clone + Debug,
263 V: AsRef<str> + Debug + AsRef<str>,
264{
265 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
266}
267
/// Builds [`DocumentSelectors`] from a map of keys to `indexmap::IndexSet`s of selector
/// strings by delegating to [`build_selectors_base`].
///
/// Compiled when the `indexset` feature is enabled.
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug,
{
    build_selectors_base::<K, V, indexmap::IndexSet<V>>(selectors)
}
277
278#[cfg(not(feature = "indexset"))]
279pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
280#[cfg(feature = "indexset")]
281pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
282#[cfg(not(feature = "indexset"))]
283pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
284#[cfg(feature = "indexset")]
285pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
286
/// The streamed extractor returns at least one entry for a matching CSS class selector.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
300
/// The synchronous extractor returns at least one entry for a matching CSS class selector.
#[test]
fn test_css_query_select_map() {
    let html = r#"<html><body><ul class="list">Test</ul></body></html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map(html, &selectors);

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
311
/// The streamed extractor returns at least one entry when both selectors of a key match.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let html = r#"<html>
 <body>
 <ul class="list"><li>First</li></ul>
 <ul class="sub-list"><li>Second</li></ul>
 </body>
 </html>"#;
    let query = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "CSS extraction failed");
}
329
/// The streamed extractor returns at least one entry when the selectors are XPath expressions.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let html = r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#;
    let expressions = QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]);
    let query = QueryCSSMap::from([("list", expressions)]);
    let selectors = build_selectors(query);

    let extracted = css_query_select_map_streamed(html, &selectors).await;

    assert!(!extracted.is_empty(), "Xpath extraction failed");
}