spider_util/selector_cache.rs
1//! # Selector Cache Module
2//!
3//! Provides a global cache for compiled CSS selectors to improve parsing performance.
4//!
5//! ## Overview
6//!
7//! The selector cache module implements a global caching mechanism for compiled
8//! CSS selectors used in HTML parsing. Since selector compilation can be expensive,
9//! especially when the same selectors are used repeatedly during crawling,
10//! this module caches compiled selectors to avoid repeated compilation overhead.
11//! The cache uses a thread-safe approach to allow concurrent access from multiple
12//! crawler threads.
13//!
14//! ## Key Components
15//!
16//! - **SELECTOR_CACHE**: Global static cache using Lazy initialization
17//! - **get_cached_selector**: Main function to retrieve or compile selectors
18//! - **prewarm_cache**: Function to pre-populate the cache with common selectors
19//! - **Thread Safety**: Uses RwLock for concurrent read/write access
20//!
21//! ## Performance Benefits
22//!
23//! The selector cache provides significant performance improvements when processing
24//! many pages with similar HTML structures. By caching compiled selectors,
25//! the system avoids the computational cost of parsing the same CSS selector
26//! expressions repeatedly. The cache uses a read-write lock to allow multiple
27//! concurrent readers while ensuring thread safety during cache updates.
28//!
29//! ## Example
30//!
31//! ```rust,ignore
32//! use spider_util::selector_cache::get_cached_selector;
33//!
34//! // Get a cached selector (compiles and caches if not already present)
35//! if let Some(selector) = get_cached_selector("div.content > p") {
36//! // Use the selector for parsing HTML
37//! // The selector is now cached for future use
38//! }
39//!
40//! // Pre-warm the cache with commonly used selectors
41//! spider_util::selector_cache::prewarm_cache();
42//! ```
43
44use once_cell::sync::Lazy;
45use parking_lot::RwLock;
46use scraper::Selector;
47use std::collections::HashMap;
48
49// Global selector cache to avoid repeated compilation
50static SELECTOR_CACHE: Lazy<RwLock<HashMap<String, Selector>>> =
51 Lazy::new(|| RwLock::new(HashMap::new()));
52
53/// Get a compiled selector from the cache or compile and store it if not present
54pub fn get_cached_selector(selector_str: &str) -> Option<Selector> {
55 {
56 let cache = SELECTOR_CACHE.read();
57 if let Some(cached) = cache.get(selector_str) {
58 return Some(cached.clone());
59 }
60 }
61
62 match Selector::parse(selector_str) {
63 Ok(selector) => {
64 {
65 let mut cache = SELECTOR_CACHE.write();
66 if let Some(cached) = cache.get(selector_str) {
67 return Some(cached.clone());
68 }
69 cache.insert(selector_str.to_string(), selector.clone());
70 }
71 Some(selector)
72 }
73 Err(_) => None,
74 }
75}
76
77/// Pre-warm the selector cache with commonly used selectors
78pub fn prewarm_cache() {
79 let common_selectors = vec![
80 "a[href]",
81 "link[href]",
82 "script[src]",
83 "img[src]",
84 "audio[src]",
85 "video[src]",
86 "source[src]",
87 "form[action]",
88 "iframe[src]",
89 "frame[src]",
90 "embed[src]",
91 "object[data]",
92 ];
93
94 for selector_str in common_selectors {
95 get_cached_selector(selector_str);
96 }
97}
98