1use hashbrown::{hash_map::Entry, HashMap};
2use lazy_static::lazy_static;
3use log::{self, warn};
4use scraper::{ElementRef, Html, Selector};
5use std::{fmt::Debug, hash::Hash};
6use sxd_document::parser;
7use sxd_xpath::evaluate_xpath;
8use tokio_stream::StreamExt;
9
/// CSS and XPath selectors grouped under caller-chosen keys.
///
/// A key may appear in `css`, in `xpath`, or in both maps.
#[derive(Default, Debug, Clone)]
pub struct DocumentSelectors<K> {
    /// Parsed CSS selectors for each key.
    pub css: HashMap<K, Vec<Selector>>,
    /// XPath expressions for each key, kept as strings.
    pub xpath: HashMap<K, Vec<String>>,
}
18
/// Extraction results: for each selector key (as a `String`), the text values collected for it.
type CSSQueryMap = HashMap<String, Vec<String>>;
21
lazy_static! {
    /// Shared XPath factory, created on first use and reused to test whether expressions build.
    static ref XPATH_FACTORY: sxd_xpath::Factory = sxd_xpath::Factory::new();
}
26
27fn is_valid_xpath(expression: &str) -> bool {
29 match XPATH_FACTORY.build(expression) {
30 Ok(Some(_)) => true,
31 Ok(None) => false,
32 Err(_) => false,
33 }
34}
35
36pub async fn css_query_select_map_streamed<K>(
38 html: &str,
39 selectors: &DocumentSelectors<K>,
40) -> CSSQueryMap
41where
42 K: AsRef<str> + Eq + Hash + Sized,
43{
44 let mut map: CSSQueryMap = HashMap::new();
45
46 if !selectors.css.is_empty() {
47 let mut stream = tokio_stream::iter(&selectors.css);
48 let fragment = Box::new(Html::parse_document(html));
49
50 while let Some(selector) = stream.next().await {
51 for s in selector.1 {
52 for element in fragment.select(s) {
53 process_selector::<K>(element, selector.0, &mut map);
54 }
55 }
56 }
57 }
58
59 if !selectors.xpath.is_empty() {
60 if let Ok(package) = parser::parse(html) {
61 let document = Box::new(package.as_document());
62
63 for selector in selectors.xpath.iter() {
64 for s in selector.1 {
65 if let Ok(value) = evaluate_xpath(&document, s) {
66 let text = value.into_string();
67
68 if !text.is_empty() {
69 match map.entry(selector.0.as_ref().to_string()) {
70 Entry::Occupied(mut entry) => entry.get_mut().push(text),
71 Entry::Vacant(entry) => {
72 entry.insert(vec![text]);
73 }
74 }
75 }
76 };
77 }
78 }
79 };
80 }
81
82 for items in map.values_mut() {
83 items.dedup();
84 }
85
86 map
87}
88
89pub fn css_query_select_map<K>(html: &str, selectors: &DocumentSelectors<K>) -> CSSQueryMap
91where
92 K: AsRef<str> + Eq + Hash + Sized,
93{
94 let mut map: CSSQueryMap = HashMap::new();
95
96 if !selectors.css.is_empty() {
97 let fragment = Box::new(Html::parse_document(html));
98
99 for selector in selectors.css.iter() {
100 for s in selector.1 {
101 for element in fragment.select(s) {
102 process_selector::<K>(element, selector.0, &mut map);
103 }
104 }
105 }
106 }
107
108 if !selectors.xpath.is_empty() {
109 if let Ok(package) = parser::parse(html) {
110 let document = package.as_document();
111
112 for selector in selectors.xpath.iter() {
113 for s in selector.1 {
114 if let Ok(value) = evaluate_xpath(&document, s) {
115 let text = value.into_string();
116
117 if !text.is_empty() {
118 match map.entry(selector.0.as_ref().to_string()) {
119 Entry::Occupied(mut entry) => entry.get_mut().push(text),
120 Entry::Vacant(entry) => {
121 entry.insert(vec![text]);
122 }
123 }
124 }
125 };
126 }
127 }
128 };
129 }
130
131 map
132}
133
134fn process_selector<K>(element: ElementRef, name: &K, map: &mut CSSQueryMap)
136where
137 K: AsRef<str> + Eq + Hash + Sized,
138{
139 let name = name.as_ref();
140 let element_name = element.value().name();
141
142 let text = if element_name == "meta" {
143 element.attr("content").unwrap_or_default().into()
144 } else if element_name == "link" || element_name == "script" || element_name == "styles" {
145 match element.attr(if element_name == "link" {
146 "href"
147 } else {
148 "src"
149 }) {
150 Some(href) => href.into(),
151 _ => clean_element_text(&element),
152 }
153 } else if element_name == "img" || element_name == "source" {
154 let mut img_text = String::new();
155
156 if let Some(src) = element.attr("src") {
157 if !src.is_empty() {
158 img_text.push('[');
159 img_text.push_str(src.trim());
160 img_text.push(']');
161 }
162 }
163 if let Some(alt) = element.attr("alt") {
164 if !alt.is_empty() {
165 if img_text.is_empty() {
166 img_text.push_str(alt);
167 } else {
168 img_text.push('(');
169 img_text.push('"');
170 img_text.push_str(alt);
171 img_text.push('"');
172 img_text.push(')');
173 }
174 }
175 }
176
177 img_text
178 } else {
179 clean_element_text(&element)
180 };
181
182 if !text.is_empty() {
183 match map.entry(name.to_string()) {
184 Entry::Occupied(mut entry) => entry.get_mut().push(text),
185 Entry::Vacant(entry) => {
186 entry.insert(vec![text]);
187 }
188 }
189 }
190}
191
192pub fn clean_element_text(element: &ElementRef) -> String {
194 element.text().collect::<Vec<_>>().join(" ")
195}
196
197pub fn build_selectors_base<K, V, S>(selectors: HashMap<K, S>) -> DocumentSelectors<K>
199where
200 K: AsRef<str> + Eq + Hash + Clone + Debug,
201 V: AsRef<str> + Debug + AsRef<str>,
202 S: IntoIterator<Item = V>,
203{
204 let mut valid_selectors: HashMap<K, Vec<Selector>> = HashMap::new();
205 let mut valid_selectors_xpath: HashMap<K, Vec<String>> = HashMap::new();
206
207 for (key, selector_set) in selectors {
208 let mut selectors_vec = Vec::new();
209 let mut selectors_vec_xpath = Vec::new();
210
211 for selector_str in selector_set {
212 match Selector::parse(selector_str.as_ref()) {
213 Ok(selector) => selectors_vec.push(selector),
214 Err(err) => {
215 if is_valid_xpath(selector_str.as_ref()) {
216 selectors_vec_xpath.push(selector_str.as_ref().to_string())
217 } else {
218 warn!(
219 "{}",
220 format!(
221 "Failed to parse selector '{}': {:?}",
222 selector_str.as_ref(),
223 err
224 ),
225 )
226 }
227 }
228 }
229 }
230
231 let has_css_selectors = !selectors_vec.is_empty();
232 let has_xpath_selectors = !selectors_vec_xpath.is_empty();
233
234 if has_css_selectors && !has_xpath_selectors {
235 valid_selectors.insert(key, selectors_vec);
236 } else if !has_css_selectors && has_xpath_selectors {
237 valid_selectors_xpath.insert(key, selectors_vec_xpath);
238 } else {
239 if has_css_selectors {
240 valid_selectors.insert(key.clone(), selectors_vec);
241 }
242 if has_xpath_selectors {
243 valid_selectors_xpath.insert(key, selectors_vec_xpath);
244 }
245 }
246 }
247
248 DocumentSelectors {
249 css: valid_selectors,
250 xpath: valid_selectors_xpath,
251 }
252}
253
254#[cfg(not(feature = "indexset"))]
256pub fn build_selectors<K, V>(selectors: HashMap<K, hashbrown::HashSet<V>>) -> DocumentSelectors<K>
257where
258 K: AsRef<str> + Eq + Hash + Clone + Debug,
259 V: AsRef<str> + Debug + AsRef<str>,
260{
261 build_selectors_base::<K, V, hashbrown::HashSet<V>>(selectors)
262}
263
/// Builds [`DocumentSelectors`] from a map of keys to `indexmap::IndexSet`s of selector
/// strings (used when the `indexset` feature is enabled).
///
/// Delegates to [`build_selectors_base`].
#[cfg(feature = "indexset")]
pub fn build_selectors<K, V>(selectors: HashMap<K, indexmap::IndexSet<V>>) -> DocumentSelectors<K>
where
    K: AsRef<str> + Eq + Hash + Clone + Debug,
    V: AsRef<str> + Debug + AsRef<str>,
{
    // The set type is fixed by the argument, so the third type parameter is inferred.
    build_selectors_base::<K, V, _>(selectors)
}
273
274#[cfg(not(feature = "indexset"))]
275pub type QueryCSSSelectSet<'a> = hashbrown::HashSet<&'a str>;
276#[cfg(feature = "indexset")]
277pub type QueryCSSSelectSet<'a> = indexmap::IndexSet<&'a str>;
278#[cfg(not(feature = "indexset"))]
279pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
280#[cfg(feature = "indexset")]
281pub type QueryCSSMap<'a> = HashMap<&'a str, QueryCSSSelectSet<'a>>;
282
/// The streamed extraction returns the text of the single element matched by `.list`.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);

    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &build_selectors(map),
    )
    .await;

    // `.sub-list` matches nothing, so only the `.list` text may be present.
    let expected = vec!["Test".to_string()];
    assert_eq!(data.get("list"), Some(&expected), "CSS extraction failed");
}
296
/// The synchronous extraction returns the text of the single element matched by `.list`.
#[test]
fn test_css_query_select_map() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map(
        r#"<html><body><ul class="list">Test</ul></body></html>"#,
        &build_selectors(map),
    );

    // `.sub-list` matches nothing, so only the `.list` text may be present.
    let expected = vec!["Test".to_string()];
    assert_eq!(data.get("list"), Some(&expected), "CSS extraction failed");
}
307
/// Results from several selectors registered under one key are joined into a single entry.
#[cfg(test)]
#[tokio::test]
async fn test_css_query_select_map_streamed_multi_join() {
    let map = QueryCSSMap::from([("list", QueryCSSSelectSet::from([".list", ".sub-list"]))]);
    let data = css_query_select_map_streamed(
        r#"<html>
            <body>
                <ul class="list"><li>First</li></ul>
                <ul class="sub-list"><li>Second</li></ul>
            </body>
        </html>"#,
        &build_selectors(map),
    )
    .await;

    // The selector set may iterate in any order, so compare the values sorted.
    let mut values = data.get("list").cloned().unwrap_or_default();
    values.sort();

    assert_eq!(values, ["First", "Second"], "CSS extraction failed");
}
325
/// Selectors that are not valid CSS but are valid XPath are evaluated as XPath.
#[cfg(test)]
#[tokio::test]
async fn test_xpath_query_select_map_streamed() {
    let map = QueryCSSMap::from([(
        "list",
        QueryCSSSelectSet::from(["//*[@class='list']", "//*[@class='sub-list']"]),
    )]);
    let selectors = build_selectors(map);
    let data = css_query_select_map_streamed(
        r#"<html><body><ul class="list"><li>Test</li></ul></body></html>"#,
        &selectors,
    )
    .await;

    // The `sub-list` expression selects nothing and its empty result is dropped.
    let expected = vec!["Test".to_string()];
    assert_eq!(data.get("list"), Some(&expected), "Xpath extraction failed");
}