Skip to main content

spider_util/
selector_cache.rs

1//! # Selector Cache Module
2//!
3//! Provides a global cache for compiled CSS selectors to improve parsing performance.
4//!
5//! ## Overview
6//!
7//! The selector cache module implements a global caching mechanism for compiled
8//! CSS selectors used in HTML parsing. Since selector compilation can be expensive,
9//! especially when the same selectors are used repeatedly during crawling,
10//! this module caches compiled selectors to avoid repeated compilation overhead.
11//! The cache uses a thread-safe approach to allow concurrent access from multiple
12//! crawler threads.
13//!
14//! ## Key Components
15//!
16//! - **SELECTOR_CACHE**: Global static cache using Lazy initialization
17//! - **get_cached_selector**: Main function to retrieve or compile selectors
18//! - **prewarm_cache**: Function to pre-populate the cache with common selectors
19//! - **Thread Safety**: Uses RwLock for concurrent read/write access
20//!
21//! ## Performance Benefits
22//!
23//! The selector cache provides significant performance improvements when processing
24//! many pages with similar HTML structures. By caching compiled selectors,
25//! the system avoids the computational cost of parsing the same CSS selector
26//! expressions repeatedly. The cache uses a read-write lock to allow multiple
27//! concurrent readers while ensuring thread safety during cache updates.
28//!
29//! ## Example
30//!
31//! ```rust,ignore
32//! use spider_util::selector_cache::get_cached_selector;
33//!
34//! // Get a cached selector (compiles and caches if not already present)
35//! if let Some(selector) = get_cached_selector("div.content > p") {
36//!     // Use the selector for parsing HTML
37//!     // The selector is now cached for future use
38//! }
39//!
40//! // Pre-warm the cache with commonly used selectors
41//! spider_util::selector_cache::prewarm_cache();
42//! ```
43
44use once_cell::sync::Lazy;
45use parking_lot::RwLock;
46use scraper::Selector;
47use std::collections::HashMap;
48
49// Global selector cache to avoid repeated compilation
50static SELECTOR_CACHE: Lazy<RwLock<HashMap<String, Selector>>> =
51    Lazy::new(|| RwLock::new(HashMap::new()));
52
53/// Get a compiled selector from the cache or compile and store it if not present
54pub fn get_cached_selector(selector_str: &str) -> Option<Selector> {
55    {
56        let cache = SELECTOR_CACHE.read();
57        if let Some(cached) = cache.get(selector_str) {
58            return Some(cached.clone());
59        }
60    }
61
62    match Selector::parse(selector_str) {
63        Ok(selector) => {
64            {
65                let mut cache = SELECTOR_CACHE.write();
66                if let Some(cached) = cache.get(selector_str) {
67                    return Some(cached.clone());
68                }
69                cache.insert(selector_str.to_string(), selector.clone());
70            }
71            Some(selector)
72        }
73        Err(_) => None,
74    }
75}
76
77/// Pre-warm the selector cache with commonly used selectors
78pub fn prewarm_cache() {
79    let common_selectors = vec![
80        "a[href]",
81        "link[href]",
82        "script[src]",
83        "img[src]",
84        "audio[src]",
85        "video[src]",
86        "source[src]",
87        "form[action]",
88        "iframe[src]",
89        "frame[src]",
90        "embed[src]",
91        "object[data]",
92    ];
93
94    for selector_str in common_selectors {
95        get_cached_selector(selector_str);
96    }
97}
98