halldyll_parser/
selector.rs

//! CSS selector utilities for halldyll-parser.
//!
//! This module provides:
//! - Pre-compiled, cached CSS selectors
//! - Safe selector parsing with error handling
//! - Common selector patterns for web scraping
7
8use lazy_static::lazy_static;
9use scraper::Selector;
10use std::collections::HashMap;
11use std::sync::RwLock;
12
13use crate::types::{ParserError, ParserResult};
14
15// ============================================================================
16// CACHED SELECTORS (PRE-COMPILED)
17// ============================================================================
18
19/// Pre-compiled selectors for common elements
20pub struct CachedSelectors {
21    // Metadata selectors
22    pub title: Selector,
23    pub meta: Selector,
24    pub link: Selector,
25    pub base: Selector,
26    pub html: Selector,
27    
28    // Content selectors
29    pub body: Selector,
30    pub article: Selector,
31    pub main: Selector,
32    pub main_role: Selector,
33    
34    // Heading selectors
35    pub h1: Selector,
36    pub h2: Selector,
37    pub h3: Selector,
38    pub h4: Selector,
39    pub h5: Selector,
40    pub h6: Selector,
41    
42    // Text content selectors
43    pub p: Selector,
44    pub blockquote: Selector,
45    pub pre: Selector,
46    pub pre_code: Selector,
47    pub code: Selector,
48    
49    // List selectors
50    pub ul: Selector,
51    pub ol: Selector,
52    pub li: Selector,
53    pub dl: Selector,
54    pub dt: Selector,
55    pub dd: Selector,
56    
57    // Table selectors
58    pub table: Selector,
59    pub thead: Selector,
60    pub tbody: Selector,
61    pub tfoot: Selector,
62    pub tr: Selector,
63    pub th: Selector,
64    pub td: Selector,
65    pub caption: Selector,
66    
67    // Link and image selectors
68    pub a: Selector,
69    pub img: Selector,
70    pub picture: Selector,
71    pub source: Selector,
72    pub figure: Selector,
73    pub figcaption: Selector,
74    
75    // Script/style (for removal)
76    pub script: Selector,
77    pub style: Selector,
78    pub noscript: Selector,
79    
80    // Structural
81    pub nav: Selector,
82    pub header: Selector,
83    pub footer: Selector,
84    pub aside: Selector,
85    
86    // Structured data
87    pub json_ld: Selector,
88    pub microdata: Selector,
89}
90
91impl CachedSelectors {
92    /// Create all cached selectors
93    fn new() -> Self {
94        Self {
95            // Metadata
96            title: Selector::parse("title").unwrap(),
97            meta: Selector::parse("meta").unwrap(),
98            link: Selector::parse("link").unwrap(),
99            base: Selector::parse("base").unwrap(),
100            html: Selector::parse("html").unwrap(),
101            
102            // Content
103            body: Selector::parse("body").unwrap(),
104            article: Selector::parse("article").unwrap(),
105            main: Selector::parse("main").unwrap(),
106            main_role: Selector::parse("[role=main]").unwrap(),
107            
108            // Headings
109            h1: Selector::parse("h1").unwrap(),
110            h2: Selector::parse("h2").unwrap(),
111            h3: Selector::parse("h3").unwrap(),
112            h4: Selector::parse("h4").unwrap(),
113            h5: Selector::parse("h5").unwrap(),
114            h6: Selector::parse("h6").unwrap(),
115            
116            // Text
117            p: Selector::parse("p").unwrap(),
118            blockquote: Selector::parse("blockquote").unwrap(),
119            pre: Selector::parse("pre").unwrap(),
120            pre_code: Selector::parse("pre code").unwrap(),
121            code: Selector::parse("code").unwrap(),
122            
123            // Lists
124            ul: Selector::parse("ul").unwrap(),
125            ol: Selector::parse("ol").unwrap(),
126            li: Selector::parse("li").unwrap(),
127            dl: Selector::parse("dl").unwrap(),
128            dt: Selector::parse("dt").unwrap(),
129            dd: Selector::parse("dd").unwrap(),
130            
131            // Tables
132            table: Selector::parse("table").unwrap(),
133            thead: Selector::parse("thead").unwrap(),
134            tbody: Selector::parse("tbody").unwrap(),
135            tfoot: Selector::parse("tfoot").unwrap(),
136            tr: Selector::parse("tr").unwrap(),
137            th: Selector::parse("th").unwrap(),
138            td: Selector::parse("td").unwrap(),
139            caption: Selector::parse("caption").unwrap(),
140            
141            // Links and images
142            a: Selector::parse("a").unwrap(),
143            img: Selector::parse("img").unwrap(),
144            picture: Selector::parse("picture").unwrap(),
145            source: Selector::parse("source").unwrap(),
146            figure: Selector::parse("figure").unwrap(),
147            figcaption: Selector::parse("figcaption").unwrap(),
148            
149            // Script/style
150            script: Selector::parse("script").unwrap(),
151            style: Selector::parse("style").unwrap(),
152            noscript: Selector::parse("noscript").unwrap(),
153            
154            // Structural
155            nav: Selector::parse("nav").unwrap(),
156            header: Selector::parse("header").unwrap(),
157            footer: Selector::parse("footer").unwrap(),
158            aside: Selector::parse("aside").unwrap(),
159            
160            // Structured data
161            json_ld: Selector::parse("script[type='application/ld+json']").unwrap(),
162            microdata: Selector::parse("[itemscope]").unwrap(),
163        }
164    }
165}
166
// Global cached selectors instance.
lazy_static! {
    /// Shared table of pre-compiled selectors, built by `CachedSelectors::new`
    /// the first time the static is accessed.
    pub static ref SELECTORS: CachedSelectors = CachedSelectors::new();
}
171
172// ============================================================================
173// DYNAMIC SELECTOR CACHE
174// ============================================================================
175
176// Cache for dynamically created selectors
177lazy_static! {
178    static ref SELECTOR_CACHE: RwLock<HashMap<String, Selector>> = 
179        RwLock::new(HashMap::new());
180}
181
182/// Get or create a selector from the cache
183pub fn get_or_create_selector(selector_str: &str) -> ParserResult<Selector> {
184    // Check read cache first
185    {
186        let cache = SELECTOR_CACHE.read().unwrap();
187        if let Some(sel) = cache.get(selector_str) {
188            return Ok(sel.clone());
189        }
190    }
191    
192    // Parse and cache
193    let selector = parse_selector(selector_str)?;
194    
195    // Store in cache
196    {
197        let mut cache = SELECTOR_CACHE.write().unwrap();
198        cache.insert(selector_str.to_string(), selector.clone());
199    }
200    
201    Ok(selector)
202}
203
204/// Parse a CSS selector with proper error handling
205pub fn parse_selector(selector_str: &str) -> ParserResult<Selector> {
206    Selector::parse(selector_str)
207        .map_err(|_| ParserError::SelectorError(selector_str.to_string()))
208}
209
210/// Try to parse a selector, returning None on failure
211pub fn try_parse_selector(selector_str: &str) -> Option<Selector> {
212    Selector::parse(selector_str).ok()
213}
214
215// ============================================================================
216// SELECTOR UTILITIES
217// ============================================================================
218
219/// Build a selector for heading level
220pub fn heading_selector(level: u8) -> &'static Selector {
221    match level {
222        1 => &SELECTORS.h1,
223        2 => &SELECTORS.h2,
224        3 => &SELECTORS.h3,
225        4 => &SELECTORS.h4,
226        5 => &SELECTORS.h5,
227        6 => &SELECTORS.h6,
228        _ => &SELECTORS.h1,
229    }
230}
231
/// CSS selectors for common main-content containers.
///
/// Entries are listed as semantic elements first, then class names, then ids.
pub const CONTENT_SELECTORS: &[&str] = &[
    // Semantic elements and ARIA role
    "article", "main", "[role=main]",
    // Class names
    ".content", ".post-content", ".entry-content",
    ".article-content", ".post-body", ".article-body",
    // Ids
    "#content", "#main-content",
];
246
/// CSS selectors for boilerplate elements that should be removed.
pub const BOILERPLATE_SELECTORS: &[&str] = &[
    // Scripts, styles and embedded content
    "script", "style", "noscript", "iframe", "object", "embed",
    // Structural elements (headers/footers inside an article are kept)
    "nav",
    "header:not(article header)",
    "footer:not(article footer)",
    "aside",
    // Class names: navigation
    ".sidebar", ".navigation", ".nav", ".menu",
    // Class names: advertising
    ".advertisement", ".ad", ".ads",
    // Class names: social, comments, recommendations
    ".social-share", ".social-buttons",
    ".comments", ".comment-form",
    ".related-posts", ".recommended",
    // ARIA roles and hidden content
    "[role=navigation]", "[role=banner]",
    "[role=contentinfo]", "[role=complementary]",
    "[aria-hidden=true]",
];
278
/// Inline elements whose text should be preserved, given as bare tag names.
pub const INLINE_ELEMENTS: &[&str] = &[
    "a", "span",
    "em", "strong", "b", "i", "u", "s",
    "mark", "small", "sub", "sup",
    "code", "kbd", "samp", "var",
    "abbr", "cite", "dfn", "time", "q",
];
285
/// Block elements that should add line breaks, given as bare tag names.
pub const BLOCK_ELEMENTS: &[&str] = &[
    "p", "div",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "blockquote", "pre",
    "ul", "ol", "li", "dl", "dt", "dd",
    "table", "tr", "th", "td",
    "article", "section", "aside", "header", "footer", "nav", "main",
    "figure", "figcaption",
    "address", "hr", "br",
];
294
295// ============================================================================
296// SELECTOR BUILDERS
297// ============================================================================
298
/// Build an attribute-equality selector such as `input[type='text']`.
///
/// `value` is placed inside a single-quoted CSS string, so every backslash
/// and single quote in it is backslash-escaped. Without that, a value such as
/// `it's` would end the string early and produce an invalid (or unintended)
/// selector. Values containing neither character are emitted unchanged.
///
/// `element` and `attr` are inserted verbatim and are expected to already be
/// valid CSS.
pub fn attr_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}='{}']", element, attr, escaped)
}
303
/// Build an attribute substring selector such as `a[href*='example']`.
///
/// `value` is placed inside a single-quoted CSS string, so every backslash
/// and single quote in it is backslash-escaped to keep the selector
/// well-formed. Values containing neither character are emitted unchanged.
///
/// `element` and `attr` are inserted verbatim and are expected to already be
/// valid CSS.
pub fn attr_contains_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}*='{}']", element, attr, escaped)
}
308
/// Build an attribute prefix selector such as `a[href^='https']`.
///
/// `value` is placed inside a single-quoted CSS string, so every backslash
/// and single quote in it is backslash-escaped to keep the selector
/// well-formed. Values containing neither character are emitted unchanged.
///
/// `element` and `attr` are inserted verbatim and are expected to already be
/// valid CSS.
pub fn attr_starts_with_selector(element: &str, attr: &str, value: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = value.replace('\\', "\\\\").replace('\'', "\\'");
    format!("{}[{}^='{}']", element, attr, escaped)
}
313
/// Build a class selector of the form `element.class`.
///
/// Both parts are joined verbatim; no escaping is applied.
pub fn class_selector(element: &str, class: &str) -> String {
    [element, ".", class].concat()
}
318
/// Build an ID selector of the form `element#id`.
///
/// Both parts are joined verbatim; no escaping is applied.
pub fn id_selector(element: &str, id: &str) -> String {
    [element, "#", id].concat()
}
323
/// Build a descendant selector: `ancestor`, a single space, then `descendant`.
pub fn descendant_selector(ancestor: &str, descendant: &str) -> String {
    [ancestor, " ", descendant].concat()
}
328
/// Build a direct-child selector of the form `parent > child`.
pub fn child_selector(parent: &str, child: &str) -> String {
    [parent, " > ", child].concat()
}
333
/// Combine several selectors into one selector list (logical OR).
///
/// The parts are joined with `", "`; an empty slice yields an empty string.
pub fn multi_selector(selectors: &[&str]) -> String {
    const SEPARATOR: &str = ", ";
    selectors.join(SEPARATOR)
}
338
339// ============================================================================
340// META TAG SELECTORS
341// ============================================================================
342
/// Create a selector for a `meta` tag with the given `name` attribute,
/// e.g. `meta[name='description']`.
///
/// `name` is placed inside a single-quoted CSS string, so every backslash and
/// single quote in it is backslash-escaped to keep the selector well-formed.
/// Names containing neither character are emitted unchanged.
pub fn meta_name_selector(name: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = name.replace('\\', "\\\\").replace('\'', "\\'");
    format!("meta[name='{}']", escaped)
}
347
/// Create a selector for a `meta` tag with the given `property` attribute
/// (as used by Open Graph / Twitter tags), e.g. `meta[property='og:title']`.
///
/// `property` is placed inside a single-quoted CSS string, so every backslash
/// and single quote in it is backslash-escaped to keep the selector
/// well-formed. Values containing neither character are emitted unchanged.
pub fn meta_property_selector(property: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = property.replace('\\', "\\\\").replace('\'', "\\'");
    format!("meta[property='{}']", escaped)
}
352
/// Create a selector for a `link` tag with the given `rel` attribute,
/// e.g. `link[rel='canonical']`.
///
/// `rel` is placed inside a single-quoted CSS string, so every backslash and
/// single quote in it is backslash-escaped to keep the selector well-formed.
/// Values containing neither character are emitted unchanged.
pub fn link_rel_selector(rel: &str) -> String {
    // Escape backslashes first so the ones added for quotes are not doubled.
    let escaped = rel.replace('\\', "\\\\").replace('\'', "\\'");
    format!("link[rel='{}']", escaped)
}
357
358// ============================================================================
359// TESTS
360// ============================================================================
361
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::Html;

    /// Accessing the global table must not panic (forces lazy initialisation).
    #[test]
    fn test_cached_selectors_exist() {
        let _ = &SELECTORS.title;
        let _ = &SELECTORS.body;
        let _ = &SELECTORS.h1;
    }

    #[test]
    fn test_cached_selectors_work() {
        let html = Html::parse_document("<html><body><h1>Test</h1></body></html>");
        let h1 = html.select(&SELECTORS.h1).next();
        assert!(h1.is_some());
    }

    #[test]
    fn test_parse_selector_success() {
        let sel = parse_selector("div.class").unwrap();
        let html = Html::parse_document("<div class='class'>Test</div>");
        assert!(html.select(&sel).next().is_some());
    }

    /// An invalid selector must yield `SelectorError` carrying the input text.
    /// Any other outcome fails the test instead of being silently skipped.
    #[test]
    fn test_parse_selector_failure() {
        match parse_selector("div[[[invalid") {
            Err(ParserError::SelectorError(s)) => assert!(s.contains("invalid")),
            other => panic!(
                "expected ParserError::SelectorError, got a different outcome (is_ok = {})",
                other.is_ok()
            ),
        }
    }

    #[test]
    fn test_try_parse_selector() {
        assert!(try_parse_selector("div").is_some());
        assert!(try_parse_selector("div[[[").is_none());
    }

    #[test]
    fn test_get_or_create_selector() {
        // First call parses and stores; second call is served from the cache.
        let sel1 = get_or_create_selector("div.test-class").unwrap();
        let sel2 = get_or_create_selector("div.test-class").unwrap();

        // Both should work identically.
        let html = Html::parse_document("<div class='test-class'>Hello</div>");
        assert!(html.select(&sel1).next().is_some());
        assert!(html.select(&sel2).next().is_some());
    }

    #[test]
    fn test_heading_selector() {
        assert!(std::ptr::eq(heading_selector(1), &SELECTORS.h1));
        assert!(std::ptr::eq(heading_selector(2), &SELECTORS.h2));
        assert!(std::ptr::eq(heading_selector(6), &SELECTORS.h6));
        // Out-of-range levels fall back to h1.
        assert!(std::ptr::eq(heading_selector(99), &SELECTORS.h1));
    }

    #[test]
    fn test_attr_selector() {
        let sel = attr_selector("input", "type", "text");
        assert_eq!(sel, "input[type='text']");

        let selector = parse_selector(&sel).unwrap();
        let html = Html::parse_document("<input type='text'>");
        assert!(html.select(&selector).next().is_some());
    }

    #[test]
    fn test_attr_contains_selector() {
        let sel = attr_contains_selector("a", "href", "example");
        assert_eq!(sel, "a[href*='example']");
    }

    #[test]
    fn test_attr_starts_with_selector() {
        let sel = attr_starts_with_selector("a", "href", "https");
        assert_eq!(sel, "a[href^='https']");
    }

    #[test]
    fn test_class_selector() {
        let sel = class_selector("div", "container");
        assert_eq!(sel, "div.container");
    }

    #[test]
    fn test_id_selector() {
        let sel = id_selector("div", "main");
        assert_eq!(sel, "div#main");
    }

    #[test]
    fn test_descendant_selector() {
        let sel = descendant_selector("article", "p");
        assert_eq!(sel, "article p");
    }

    #[test]
    fn test_child_selector() {
        let sel = child_selector("ul", "li");
        assert_eq!(sel, "ul > li");
    }

    #[test]
    fn test_multi_selector() {
        let sel = multi_selector(&["h1", "h2", "h3"]);
        assert_eq!(sel, "h1, h2, h3");
    }

    #[test]
    fn test_meta_name_selector() {
        let sel = meta_name_selector("description");
        assert_eq!(sel, "meta[name='description']");

        let selector = parse_selector(&sel).unwrap();
        let html = Html::parse_document("<meta name='description' content='Test'>");
        assert!(html.select(&selector).next().is_some());
    }

    #[test]
    fn test_meta_property_selector() {
        let sel = meta_property_selector("og:title");
        assert_eq!(sel, "meta[property='og:title']");
    }

    #[test]
    fn test_link_rel_selector() {
        let sel = link_rel_selector("canonical");
        assert_eq!(sel, "link[rel='canonical']");
    }

    /// Every boilerplate selector must be valid CSS.
    #[test]
    fn test_boilerplate_selectors_valid() {
        for sel_str in BOILERPLATE_SELECTORS {
            assert!(
                try_parse_selector(sel_str).is_some(),
                "Invalid selector: {}", sel_str
            );
        }
    }

    /// Every content selector must be valid CSS.
    #[test]
    fn test_content_selectors_valid() {
        for sel_str in CONTENT_SELECTORS {
            assert!(
                try_parse_selector(sel_str).is_some(),
                "Invalid selector: {}", sel_str
            );
        }
    }
}