1use lazy_static::lazy_static;
9use scraper::Selector;
10use std::collections::HashMap;
11use std::sync::RwLock;
12
13use crate::types::{ParserError, ParserResult};
14
15pub struct CachedSelectors {
21 pub title: Selector,
23 pub meta: Selector,
24 pub link: Selector,
25 pub base: Selector,
26 pub html: Selector,
27
28 pub body: Selector,
30 pub article: Selector,
31 pub main: Selector,
32 pub main_role: Selector,
33
34 pub h1: Selector,
36 pub h2: Selector,
37 pub h3: Selector,
38 pub h4: Selector,
39 pub h5: Selector,
40 pub h6: Selector,
41
42 pub p: Selector,
44 pub blockquote: Selector,
45 pub pre: Selector,
46 pub pre_code: Selector,
47 pub code: Selector,
48
49 pub ul: Selector,
51 pub ol: Selector,
52 pub li: Selector,
53 pub dl: Selector,
54 pub dt: Selector,
55 pub dd: Selector,
56
57 pub table: Selector,
59 pub thead: Selector,
60 pub tbody: Selector,
61 pub tfoot: Selector,
62 pub tr: Selector,
63 pub th: Selector,
64 pub td: Selector,
65 pub caption: Selector,
66
67 pub a: Selector,
69 pub img: Selector,
70 pub picture: Selector,
71 pub source: Selector,
72 pub figure: Selector,
73 pub figcaption: Selector,
74
75 pub script: Selector,
77 pub style: Selector,
78 pub noscript: Selector,
79
80 pub nav: Selector,
82 pub header: Selector,
83 pub footer: Selector,
84 pub aside: Selector,
85
86 pub json_ld: Selector,
88 pub microdata: Selector,
89}
90
91impl CachedSelectors {
92 fn new() -> Self {
94 Self {
95 title: Selector::parse("title").unwrap(),
97 meta: Selector::parse("meta").unwrap(),
98 link: Selector::parse("link").unwrap(),
99 base: Selector::parse("base").unwrap(),
100 html: Selector::parse("html").unwrap(),
101
102 body: Selector::parse("body").unwrap(),
104 article: Selector::parse("article").unwrap(),
105 main: Selector::parse("main").unwrap(),
106 main_role: Selector::parse("[role=main]").unwrap(),
107
108 h1: Selector::parse("h1").unwrap(),
110 h2: Selector::parse("h2").unwrap(),
111 h3: Selector::parse("h3").unwrap(),
112 h4: Selector::parse("h4").unwrap(),
113 h5: Selector::parse("h5").unwrap(),
114 h6: Selector::parse("h6").unwrap(),
115
116 p: Selector::parse("p").unwrap(),
118 blockquote: Selector::parse("blockquote").unwrap(),
119 pre: Selector::parse("pre").unwrap(),
120 pre_code: Selector::parse("pre code").unwrap(),
121 code: Selector::parse("code").unwrap(),
122
123 ul: Selector::parse("ul").unwrap(),
125 ol: Selector::parse("ol").unwrap(),
126 li: Selector::parse("li").unwrap(),
127 dl: Selector::parse("dl").unwrap(),
128 dt: Selector::parse("dt").unwrap(),
129 dd: Selector::parse("dd").unwrap(),
130
131 table: Selector::parse("table").unwrap(),
133 thead: Selector::parse("thead").unwrap(),
134 tbody: Selector::parse("tbody").unwrap(),
135 tfoot: Selector::parse("tfoot").unwrap(),
136 tr: Selector::parse("tr").unwrap(),
137 th: Selector::parse("th").unwrap(),
138 td: Selector::parse("td").unwrap(),
139 caption: Selector::parse("caption").unwrap(),
140
141 a: Selector::parse("a").unwrap(),
143 img: Selector::parse("img").unwrap(),
144 picture: Selector::parse("picture").unwrap(),
145 source: Selector::parse("source").unwrap(),
146 figure: Selector::parse("figure").unwrap(),
147 figcaption: Selector::parse("figcaption").unwrap(),
148
149 script: Selector::parse("script").unwrap(),
151 style: Selector::parse("style").unwrap(),
152 noscript: Selector::parse("noscript").unwrap(),
153
154 nav: Selector::parse("nav").unwrap(),
156 header: Selector::parse("header").unwrap(),
157 footer: Selector::parse("footer").unwrap(),
158 aside: Selector::parse("aside").unwrap(),
159
160 json_ld: Selector::parse("script[type='application/ld+json']").unwrap(),
162 microdata: Selector::parse("[itemscope]").unwrap(),
163 }
164 }
165}
166
167lazy_static! {
169 pub static ref SELECTORS: CachedSelectors = CachedSelectors::new();
170}
171
172lazy_static! {
178 static ref SELECTOR_CACHE: RwLock<HashMap<String, Selector>> =
179 RwLock::new(HashMap::new());
180}
181
182pub fn get_or_create_selector(selector_str: &str) -> ParserResult<Selector> {
184 {
186 let cache = SELECTOR_CACHE.read().unwrap();
187 if let Some(sel) = cache.get(selector_str) {
188 return Ok(sel.clone());
189 }
190 }
191
192 let selector = parse_selector(selector_str)?;
194
195 {
197 let mut cache = SELECTOR_CACHE.write().unwrap();
198 cache.insert(selector_str.to_string(), selector.clone());
199 }
200
201 Ok(selector)
202}
203
204pub fn parse_selector(selector_str: &str) -> ParserResult<Selector> {
206 Selector::parse(selector_str)
207 .map_err(|_| ParserError::SelectorError(selector_str.to_string()))
208}
209
210pub fn try_parse_selector(selector_str: &str) -> Option<Selector> {
212 Selector::parse(selector_str).ok()
213}
214
215pub fn heading_selector(level: u8) -> &'static Selector {
221 match level {
222 1 => &SELECTORS.h1,
223 2 => &SELECTORS.h2,
224 3 => &SELECTORS.h3,
225 4 => &SELECTORS.h4,
226 5 => &SELECTORS.h5,
227 6 => &SELECTORS.h6,
228 _ => &SELECTORS.h1,
229 }
230}
231
/// Selector strings for elements that commonly wrap a page's main content.
///
/// NOTE(review): the ordering looks like a priority order (semantic elements
/// first, then class names, then ids) — confirm against the callers.
pub const CONTENT_SELECTORS: &[&str] = &[
    // Semantic containers.
    "article", "main", "[role=main]",
    // Common class names.
    ".content", ".post-content", ".entry-content",
    ".article-content", ".post-body", ".article-body",
    // Common ids.
    "#content", "#main-content",
];
246
/// Selector strings for elements that are usually not part of the main
/// content (scripts, navigation, ads, comments, and similar).
///
/// NOTE(review): the `header`/`footer` entries use a descendant selector
/// inside `:not(...)`, which requires a selector engine that accepts complex
/// selectors there — confirm with the scraper version in use.
pub const BOILERPLATE_SELECTORS: &[&str] = &[
    // Non-rendered or embedded content.
    "script", "style", "noscript", "iframe", "object", "embed",
    // Structural page chrome; headers and footers inside an article are kept.
    "nav",
    "header:not(article header)",
    "footer:not(article footer)",
    "aside",
    // Navigation-related class names.
    ".sidebar", ".navigation", ".nav", ".menu",
    // Advertising.
    ".advertisement", ".ad", ".ads",
    // Social widgets.
    ".social-share", ".social-buttons",
    // Comments.
    ".comments", ".comment-form",
    // Cross-promotion.
    ".related-posts", ".recommended",
    // ARIA landmark roles and hidden elements.
    "[role=navigation]", "[role=banner]", "[role=contentinfo]", "[role=complementary]",
    "[aria-hidden=true]",
];
278
/// Tag names this module classifies as inline elements.
pub const INLINE_ELEMENTS: &[&str] = &[
    "a", "span",
    "em", "strong", "b", "i", "u", "s",
    "mark", "small", "sub", "sup",
    "code", "kbd", "samp", "var",
    "abbr", "cite", "dfn", "time", "q",
];
285
/// Tag names this module classifies as block-level elements.
///
/// Note that `br` is listed here as well.
pub const BLOCK_ELEMENTS: &[&str] = &[
    "p", "div",
    // Headings.
    "h1", "h2", "h3", "h4", "h5", "h6",
    "blockquote", "pre",
    // Lists.
    "ul", "ol", "li", "dl", "dt", "dd",
    // Tables.
    "table", "tr", "th", "td",
    // Sectioning and landmarks.
    "article", "section", "aside", "header", "footer", "nav", "main",
    "figure", "figcaption",
    "address", "hr", "br",
];
294
/// Builds the attribute-equality selector string `element[attr='value']`.
///
/// `element` and `attr` are inserted verbatim. Backslashes and single quotes
/// in `value` are backslash-escaped so the value cannot end the quoted
/// string early; values without those characters are emitted unchanged.
pub fn attr_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}='{}']", element, attr, quoted)
}
303
/// Builds the substring-match attribute selector string
/// `element[attr*='value']`.
///
/// `element` and `attr` are inserted verbatim. Backslashes and single quotes
/// in `value` are backslash-escaped so the value cannot end the quoted
/// string early; values without those characters are emitted unchanged.
pub fn attr_contains_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}*='{}']", element, attr, quoted)
}
308
/// Builds the prefix-match attribute selector string
/// `element[attr^='value']`.
///
/// `element` and `attr` are inserted verbatim. Backslashes and single quotes
/// in `value` are backslash-escaped so the value cannot end the quoted
/// string early; values without those characters are emitted unchanged.
pub fn attr_starts_with_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}^='{}']", element, attr, quoted)
}
313
/// Builds the selector string `element.class`.
///
/// Both parts are concatenated verbatim, with no escaping.
pub fn class_selector(element: &str, class: &str) -> String {
    [element, ".", class].concat()
}
318
/// Builds the selector string `element#id`.
///
/// Both parts are concatenated verbatim, with no escaping.
pub fn id_selector(element: &str, id: &str) -> String {
    [element, "#", id].concat()
}
323
/// Builds the descendant-combinator selector string `ancestor descendant`
/// (the two parts joined by a single space).
pub fn descendant_selector(ancestor: &str, descendant: &str) -> String {
    [ancestor, " ", descendant].concat()
}
328
/// Builds the child-combinator selector string `parent > child`.
pub fn child_selector(parent: &str, child: &str) -> String {
    [parent, " > ", child].concat()
}
333
/// Combines several selector strings into one selector list, separating the
/// parts with `", "`.
///
/// An empty slice produces an empty string.
pub fn multi_selector(selectors: &[&str]) -> String {
    let mut combined = String::new();
    for (index, part) in selectors.iter().enumerate() {
        if index > 0 {
            combined.push_str(", ");
        }
        combined.push_str(part);
    }
    combined
}
338
/// Builds the selector string `meta[name='<name>']`.
///
/// Backslashes and single quotes in `name` are backslash-escaped so the
/// value cannot end the quoted string early; names without those characters
/// are emitted unchanged.
pub fn meta_name_selector(name: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = name.replace('\\', "\\\\").replace('\'', "\\'");
    format!("meta[name='{}']", quoted)
}
347
/// Builds the selector string `meta[property='<property>']`.
///
/// Backslashes and single quotes in `property` are backslash-escaped so the
/// value cannot end the quoted string early; values without those characters
/// (such as `og:title`) are emitted unchanged.
pub fn meta_property_selector(property: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = property.replace('\\', "\\\\").replace('\'', "\\'");
    format!("meta[property='{}']", quoted)
}
352
/// Builds the selector string `link[rel='<rel>']`.
///
/// Backslashes and single quotes in `rel` are backslash-escaped so the value
/// cannot end the quoted string early; values without those characters are
/// emitted unchanged.
pub fn link_rel_selector(rel: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let quoted = rel.replace('\\', "\\\\").replace('\'', "\\'");
    format!("link[rel='{}']", quoted)
}
357
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    /// Returns true when `selector` matches at least one element of `markup`.
    fn matches_any(markup: &str, selector: &Selector) -> bool {
        Html::parse_document(markup).select(selector).next().is_some()
    }

    /// Returns every entry of `selectors` that fails to parse.
    fn unparseable<'a>(selectors: &[&'a str]) -> Vec<&'a str> {
        selectors
            .iter()
            .copied()
            .filter(|css| try_parse_selector(css).is_none())
            .collect()
    }

    #[test]
    fn test_cached_selectors_exist() {
        // Dereferencing forces the lazy table to be built.
        let _ = &SELECTORS.title;
        let _ = &SELECTORS.body;
        let _ = &SELECTORS.h1;
    }

    #[test]
    fn test_cached_selectors_work() {
        assert!(matches_any(
            "<html><body><h1>Test</h1></body></html>",
            &SELECTORS.h1
        ));
    }

    #[test]
    fn test_parse_selector_success() {
        let selector = parse_selector("div.class").unwrap();
        assert!(matches_any("<div class='class'>Test</div>", &selector));
    }

    #[test]
    fn test_parse_selector_failure() {
        let result = parse_selector("div[[[invalid");
        assert!(result.is_err());
        // Require the specific variant and payload, so that a different
        // error variant fails the test instead of skipping the check.
        assert!(matches!(
            &result,
            Err(ParserError::SelectorError(text)) if text.contains("invalid")
        ));
    }

    #[test]
    fn test_try_parse_selector() {
        assert!(try_parse_selector("div").is_some());
        assert!(try_parse_selector("div[[[").is_none());
    }

    #[test]
    fn test_get_or_create_selector() {
        // The first call parses and caches; the second is served from the cache.
        let first = get_or_create_selector("div.test-class").unwrap();
        let second = get_or_create_selector("div.test-class").unwrap();

        let markup = "<div class='test-class'>Hello</div>";
        assert!(matches_any(markup, &first));
        assert!(matches_any(markup, &second));
    }

    #[test]
    fn test_heading_selector() {
        assert!(std::ptr::eq(heading_selector(1), &SELECTORS.h1));
        assert!(std::ptr::eq(heading_selector(2), &SELECTORS.h2));
        assert!(std::ptr::eq(heading_selector(6), &SELECTORS.h6));
        // Out-of-range levels fall back to h1.
        assert!(std::ptr::eq(heading_selector(99), &SELECTORS.h1));
    }

    #[test]
    fn test_attr_selector() {
        let css = attr_selector("input", "type", "text");
        assert_eq!(css, "input[type='text']");

        let selector = parse_selector(&css).unwrap();
        assert!(matches_any("<input type='text'>", &selector));
    }

    #[test]
    fn test_attr_contains_selector() {
        assert_eq!(
            attr_contains_selector("a", "href", "example"),
            "a[href*='example']"
        );
    }

    #[test]
    fn test_attr_starts_with_selector() {
        assert_eq!(
            attr_starts_with_selector("a", "href", "https"),
            "a[href^='https']"
        );
    }

    #[test]
    fn test_class_selector() {
        assert_eq!(class_selector("div", "container"), "div.container");
    }

    #[test]
    fn test_id_selector() {
        assert_eq!(id_selector("div", "main"), "div#main");
    }

    #[test]
    fn test_descendant_selector() {
        assert_eq!(descendant_selector("article", "p"), "article p");
    }

    #[test]
    fn test_child_selector() {
        assert_eq!(child_selector("ul", "li"), "ul > li");
    }

    #[test]
    fn test_multi_selector() {
        assert_eq!(multi_selector(&["h1", "h2", "h3"]), "h1, h2, h3");
    }

    #[test]
    fn test_meta_name_selector() {
        let css = meta_name_selector("description");
        assert_eq!(css, "meta[name='description']");

        let selector = parse_selector(&css).unwrap();
        assert!(matches_any(
            "<meta name='description' content='Test'>",
            &selector
        ));
    }

    #[test]
    fn test_meta_property_selector() {
        assert_eq!(
            meta_property_selector("og:title"),
            "meta[property='og:title']"
        );
    }

    #[test]
    fn test_link_rel_selector() {
        assert_eq!(link_rel_selector("canonical"), "link[rel='canonical']");
    }

    #[test]
    fn test_boilerplate_selectors_valid() {
        // Report every bad entry at once rather than stopping at the first.
        let invalid = unparseable(BOILERPLATE_SELECTORS);
        assert!(invalid.is_empty(), "Invalid selectors: {:?}", invalid);
    }

    #[test]
    fn test_content_selectors_valid() {
        let invalid = unparseable(CONTENT_SELECTORS);
        assert!(invalid.is_empty(), "Invalid selectors: {:?}", invalid);
    }
}