halldyll_robots/
cache.rs

1//! Cache - Robots.txt caching with TTL and optional persistence
2
3use crate::types::{RobotsCacheKey, RobotsPolicy};
4use dashmap::DashMap;
5use std::path::Path;
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::Arc;
8use std::time::Duration;
9use tokio::fs;
10use tokio::io::{AsyncReadExt, AsyncWriteExt};
11use tracing::{debug, info};
12
/// Upper bound applied to every cache TTL in this module: 24 hours, the
/// limit RFC 9309 recommends for reusing a cached robots.txt.
pub const MAX_CACHE_TTL: Duration = Duration::from_secs(60 * 60 * 24);

/// TTL used by [`RobotsCache::default`]: 1 hour.
pub const DEFAULT_CACHE_TTL: Duration = Duration::from_secs(3_600);
18
/// Live counters describing cache activity.
///
/// Every field is an [`AtomicU64`], so the counters can be updated through a
/// shared reference.
#[derive(Debug, Default)]
pub struct CacheStats {
    /// Lookups that returned a policy.
    pub hits: AtomicU64,
    /// Lookups that returned nothing.
    pub misses: AtomicU64,
    /// Entries that were dropped from the cache.
    pub evictions: AtomicU64,
    /// Entries currently accounted for in the cache.
    pub entries: AtomicU64,
}
31
32impl CacheStats {
33    /// Record a cache hit
34    pub fn record_hit(&self) {
35        self.hits.fetch_add(1, Ordering::Relaxed);
36    }
37
38    /// Record a cache miss
39    pub fn record_miss(&self) {
40        self.misses.fetch_add(1, Ordering::Relaxed);
41    }
42
43    /// Record a cache eviction
44    pub fn record_eviction(&self) {
45        self.evictions.fetch_add(1, Ordering::Relaxed);
46    }
47
48    /// Get hit rate
49    pub fn hit_rate(&self) -> f64 {
50        let hits = self.hits.load(Ordering::Relaxed);
51        let misses = self.misses.load(Ordering::Relaxed);
52        let total = hits + misses;
53        if total == 0 {
54            0.0
55        } else {
56            hits as f64 / total as f64
57        }
58    }
59
60    /// Get snapshot of stats
61    pub fn snapshot(&self) -> CacheStatsSnapshot {
62        CacheStatsSnapshot {
63            hits: self.hits.load(Ordering::Relaxed),
64            misses: self.misses.load(Ordering::Relaxed),
65            evictions: self.evictions.load(Ordering::Relaxed),
66            entries: self.entries.load(Ordering::Relaxed),
67        }
68    }
69}
70
/// Plain-value copy of [`CacheStats`] as produced by `CacheStats::snapshot`.
#[derive(Debug, Clone)]
pub struct CacheStatsSnapshot {
    /// Lookups that returned a policy.
    pub hits: u64,
    /// Lookups that returned nothing.
    pub misses: u64,
    /// Entries that were dropped from the cache.
    pub evictions: u64,
    /// Entries accounted for in the cache when the snapshot was taken.
    pub entries: u64,
}
83
84/// Robots.txt cache
85pub struct RobotsCache {
86    /// In-memory cache
87    cache: Arc<DashMap<RobotsCacheKey, RobotsPolicy>>,
88    /// Default TTL for new entries
89    default_ttl: Duration,
90    /// Optional persistence directory
91    persist_dir: Option<String>,
92    /// Cache statistics
93    stats: Arc<CacheStats>,
94}
95
96impl Default for RobotsCache {
97    fn default() -> Self {
98        Self::new(DEFAULT_CACHE_TTL)
99    }
100}
101
102impl RobotsCache {
103    /// Create a new in-memory cache
104    pub fn new(default_ttl: Duration) -> Self {
105        // Enforce max TTL
106        let default_ttl = default_ttl.min(MAX_CACHE_TTL);
107        
108        Self {
109            cache: Arc::new(DashMap::new()),
110            default_ttl,
111            persist_dir: None,
112            stats: Arc::new(CacheStats::default()),
113        }
114    }
115
116    /// Create a cache with file persistence
117    pub fn with_persistence(default_ttl: Duration, persist_dir: &str) -> Self {
118        let default_ttl = default_ttl.min(MAX_CACHE_TTL);
119        
120        Self {
121            cache: Arc::new(DashMap::new()),
122            default_ttl,
123            persist_dir: Some(persist_dir.to_string()),
124            stats: Arc::new(CacheStats::default()),
125        }
126    }
127
128    /// Get the default TTL
129    pub fn default_ttl(&self) -> Duration {
130        self.default_ttl
131    }
132
133    /// Get cache statistics
134    pub fn stats(&self) -> Arc<CacheStats> {
135        self.stats.clone()
136    }
137
138    /// Get a policy from cache if not expired
139    pub fn get(&self, key: &RobotsCacheKey) -> Option<RobotsPolicy> {
140        if let Some(entry) = self.cache.get(key) {
141            if !entry.is_expired() {
142                self.stats.record_hit();
143                debug!("Cache hit for {}", key.robots_url());
144                return Some(entry.clone());
145            }
146            // Expired, remove it
147            drop(entry);
148            self.cache.remove(key);
149            self.stats.record_eviction();
150        }
151        
152        self.stats.record_miss();
153        debug!("Cache miss for {}", key.robots_url());
154        None
155    }
156
157    /// Insert a policy into cache
158    pub fn insert(&self, key: RobotsCacheKey, policy: RobotsPolicy) {
159        let old = self.cache.insert(key.clone(), policy);
160        if old.is_none() {
161            self.stats.entries.fetch_add(1, Ordering::Relaxed);
162        }
163        debug!("Cached robots.txt for {}", key.robots_url());
164    }
165
166    /// Remove a policy from cache
167    pub fn remove(&self, key: &RobotsCacheKey) -> Option<RobotsPolicy> {
168        let removed = self.cache.remove(key).map(|(_, v)| v);
169        if removed.is_some() {
170            self.stats.entries.fetch_sub(1, Ordering::Relaxed);
171            self.stats.record_eviction();
172        }
173        removed
174    }
175
176    /// Clear expired entries
177    pub fn evict_expired(&self) -> usize {
178        let mut evicted = 0;
179        self.cache.retain(|_, policy| {
180            if policy.is_expired() {
181                evicted += 1;
182                false
183            } else {
184                true
185            }
186        });
187        
188        if evicted > 0 {
189            self.stats.entries.fetch_sub(evicted as u64, Ordering::Relaxed);
190            self.stats.evictions.fetch_add(evicted as u64, Ordering::Relaxed);
191            info!("Evicted {} expired robots.txt entries", evicted);
192        }
193        
194        evicted
195    }
196
197    /// Clear all entries
198    pub fn clear(&self) {
199        let count = self.cache.len();
200        self.cache.clear();
201        self.stats.entries.store(0, Ordering::Relaxed);
202        info!("Cleared {} robots.txt cache entries", count);
203    }
204
205    /// Get number of entries
206    pub fn len(&self) -> usize {
207        self.cache.len()
208    }
209
210    /// Check if cache is empty
211    pub fn is_empty(&self) -> bool {
212        self.cache.is_empty()
213    }
214
215    /// List all cached domains
216    pub fn domains(&self) -> Vec<String> {
217        self.cache
218            .iter()
219            .map(|entry| entry.key().authority.clone())
220            .collect()
221    }
222
223    /// Save cache to disk (if persistence enabled)
224    pub async fn save_to_disk(&self) -> std::io::Result<usize> {
225        let persist_dir = match &self.persist_dir {
226            Some(dir) => dir,
227            None => return Ok(0),
228        };
229
230        // Create directory if needed
231        fs::create_dir_all(persist_dir).await?;
232
233        let mut saved = 0;
234        for entry in self.cache.iter() {
235            let key = entry.key();
236            let policy = entry.value();
237            
238            // Skip expired entries
239            if policy.is_expired() {
240                continue;
241            }
242
243            let filename = self.cache_filename(key);
244            let filepath = Path::new(persist_dir).join(&filename);
245            
246            // Serialize to JSON
247            if let Ok(json) = serde_json::to_string_pretty(&CacheEntry {
248                key: key.clone(),
249                groups: policy.groups.clone(),
250                sitemaps: policy.sitemaps.clone(),
251                content_size: policy.content_size,
252                ttl_secs: policy.ttl().as_secs(),
253            }) {
254                if let Ok(mut file) = fs::File::create(&filepath).await {
255                    if file.write_all(json.as_bytes()).await.is_ok() {
256                        saved += 1;
257                    }
258                }
259            }
260        }
261
262        info!("Saved {} robots.txt entries to disk", saved);
263        Ok(saved)
264    }
265
266    /// Load cache from disk (if persistence enabled)
267    pub async fn load_from_disk(&self) -> std::io::Result<usize> {
268        let persist_dir = match &self.persist_dir {
269            Some(dir) => dir,
270            None => return Ok(0),
271        };
272
273        let path = Path::new(persist_dir);
274        if !path.exists() {
275            return Ok(0);
276        }
277
278        let mut loaded = 0;
279        let mut entries = fs::read_dir(persist_dir).await?;
280        
281        while let Some(entry) = entries.next_entry().await? {
282            let filepath = entry.path();
283            if filepath.extension().is_some_and(|ext| ext == "json") {
284                if let Ok(mut file) = fs::File::open(&filepath).await {
285                    let mut content = String::new();
286                    if file.read_to_string(&mut content).await.is_ok() {
287                                if let Ok(cache_entry) = serde_json::from_str::<CacheEntry>(&content) {
288                            // Reconstruct policy
289                            let ttl = Duration::from_secs(cache_entry.ttl_secs);
290                            if ttl > Duration::ZERO {
291                                let now = std::time::SystemTime::now()
292                                    .duration_since(std::time::UNIX_EPOCH)
293                                    .unwrap_or_default()
294                                    .as_millis() as u64;
295                                let policy = RobotsPolicy {
296                                    fetched_at_ms: now,
297                                    expires_at_ms: now + ttl.as_millis() as u64,
298                                    fetch_status: crate::types::FetchStatus::Success,
299                                    groups: cache_entry.groups,
300                                    sitemaps: cache_entry.sitemaps,
301                                    content_size: cache_entry.content_size,
302                                    etag: None,
303                                    last_modified: None,
304                                };
305                                self.insert(cache_entry.key, policy);
306                                loaded += 1;
307                            }
308                        }
309                    }
310                }
311            }
312        }
313
314        info!("Loaded {} robots.txt entries from disk", loaded);
315        Ok(loaded)
316    }
317
318    /// Generate a filename for a cache key
319    fn cache_filename(&self, key: &RobotsCacheKey) -> String {
320        // Use base64 encoding for safe filenames
321        let combined = format!("{}_{}", key.scheme, key.authority);
322        let encoded = base64_encode(&combined);
323        format!("{}.json", encoded)
324    }
325}
326
327/// Entry for disk persistence
328#[derive(serde::Serialize, serde::Deserialize)]
329struct CacheEntry {
330    key: RobotsCacheKey,
331    groups: Vec<crate::types::Group>,
332    sitemaps: Vec<String>,
333    content_size: usize,
334    ttl_secs: u64,
335}
336
/// Encode `s` as lowercase hexadecimal, two characters per UTF-8 byte.
///
/// Despite the historical name this is hex, not base64: the output contains
/// only `0-9` and `a-f`, which makes it safe to use as a file name. An empty
/// input yields an empty string.
fn base64_encode(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // One allocation of the exact final size instead of a temporary
    // `String` per input byte.
    let mut encoded = String::with_capacity(s.len() * 2);
    for byte in s.bytes() {
        encoded.push(HEX_DIGITS[usize::from(byte >> 4)] as char);
        encoded.push(HEX_DIGITS[usize::from(byte & 0x0f)] as char);
    }
    encoded
}
344
345/// Parse Cache-Control header for TTL
346pub fn parse_cache_control(header: &str) -> Option<Duration> {
347    for directive in header.split(',') {
348        let directive = directive.trim();
349        if let Some(value) = directive.strip_prefix("max-age=") {
350            if let Ok(secs) = value.trim().parse::<u64>() {
351                return Some(Duration::from_secs(secs).min(MAX_CACHE_TTL));
352            }
353        }
354    }
355    None
356}
357
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;

    /// TTL used for every cache and policy built in these tests.
    const ONE_HOUR: Duration = Duration::from_secs(3600);

    /// Key for `https://example.com`, shared by all tests.
    fn example_key() -> RobotsCacheKey {
        RobotsCacheKey {
            scheme: "https".to_string(),
            authority: "example.com".to_string(),
        }
    }

    /// Parse a minimal robots.txt into a policy with a one-hour TTL.
    fn create_test_policy() -> RobotsPolicy {
        RobotsParser::new().parse("User-agent: *\nDisallow: /admin", ONE_HOUR)
    }

    /// An inserted policy can be read back and is counted by `len`.
    #[test]
    fn test_cache_insert_get() {
        let cache = RobotsCache::new(ONE_HOUR);
        let key = example_key();

        cache.insert(key.clone(), create_test_policy());

        assert!(cache.get(&key).is_some());
        assert_eq!(cache.len(), 1);
    }

    /// Looking up an unknown key yields nothing.
    #[test]
    fn test_cache_miss() {
        let cache = RobotsCache::new(ONE_HOUR);

        assert!(cache.get(&example_key()).is_none());
    }

    /// One miss followed by one hit shows up in the statistics.
    #[test]
    fn test_cache_stats() {
        let cache = RobotsCache::new(ONE_HOUR);
        let key = example_key();

        // Miss
        cache.get(&key);

        // Insert and hit
        cache.insert(key.clone(), create_test_policy());
        cache.get(&key);

        let snapshot = cache.stats().snapshot();
        assert_eq!(snapshot.hits, 1);
        assert_eq!(snapshot.misses, 1);
    }

    /// `clear` empties a populated cache.
    #[test]
    fn test_cache_clear() {
        let cache = RobotsCache::new(ONE_HOUR);

        cache.insert(example_key(), create_test_policy());
        assert_eq!(cache.len(), 1);

        cache.clear();
        assert!(cache.is_empty());
    }

    /// `max-age` is extracted, absent directives give `None`, and oversized
    /// values are clamped to 24h.
    #[test]
    fn test_parse_cache_control() {
        assert_eq!(
            parse_cache_control("max-age=3600"),
            Some(Duration::from_secs(3600))
        );
        assert_eq!(
            parse_cache_control("public, max-age=7200"),
            Some(Duration::from_secs(7200))
        );
        assert_eq!(parse_cache_control("no-cache"), None);
        assert_eq!(parse_cache_control("max-age=999999"), Some(MAX_CACHE_TTL));
    }

    /// A requested default TTL above 24h is clamped by the constructor.
    #[test]
    fn test_max_ttl_enforcement() {
        let cache = RobotsCache::new(Duration::from_secs(100000));
        assert_eq!(cache.default_ttl(), MAX_CACHE_TTL);
    }
}