1use crate::types::{RobotsCacheKey, RobotsPolicy};
4use dashmap::DashMap;
5use std::path::Path;
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::sync::Arc;
8use std::time::Duration;
9use tokio::fs;
10use tokio::io::{AsyncReadExt, AsyncWriteExt};
11use tracing::{debug, info};
12
/// Hard ceiling on how long a robots.txt policy may be cached (24 h),
/// regardless of what the origin's Cache-Control header asks for.
pub const MAX_CACHE_TTL: Duration = Duration::from_secs(24 * 60 * 60);

/// TTL used when the caller supplies none explicitly (1 h); see
/// `RobotsCache::default()`.
pub const DEFAULT_CACHE_TTL: Duration = Duration::from_secs(60 * 60);
/// Lock-free counters describing cache effectiveness.
///
/// All counters use relaxed atomics: they are monotone tallies read for
/// reporting, so no ordering with other memory operations is required.
#[derive(Debug, Default)]
pub struct CacheStats {
    // Lookups answered from a fresh cached entry.
    pub hits: AtomicU64,
    // Lookups for absent or expired entries.
    pub misses: AtomicU64,
    // Entries dropped — expired, replaced-by-sweep, or explicitly removed.
    pub evictions: AtomicU64,
    // Gauge of entries currently resident in the cache.
    pub entries: AtomicU64,
}

impl CacheStats {
    /// Counts one successful lookup.
    pub fn record_hit(&self) {
        self.hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one failed lookup.
    pub fn record_miss(&self) {
        self.misses.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one dropped entry.
    pub fn record_eviction(&self) {
        self.evictions.fetch_add(1, Ordering::Relaxed);
    }

    /// Fraction of lookups that hit, in `[0.0, 1.0]`; `0.0` before any
    /// lookup has been recorded.
    pub fn hit_rate(&self) -> f64 {
        let hits = self.hits.load(Ordering::Relaxed);
        let total = hits + self.misses.load(Ordering::Relaxed);
        match total {
            0 => 0.0,
            t => hits as f64 / t as f64,
        }
    }

    /// Captures a point-in-time copy of all counters.
    ///
    /// Each counter is loaded independently, so the snapshot is not a
    /// single atomic cut across all four values.
    pub fn snapshot(&self) -> CacheStatsSnapshot {
        let read = |c: &AtomicU64| c.load(Ordering::Relaxed);
        CacheStatsSnapshot {
            hits: read(&self.hits),
            misses: read(&self.misses),
            evictions: read(&self.evictions),
            entries: read(&self.entries),
        }
    }
}

/// Plain-value copy of [`CacheStats`] taken by [`CacheStats::snapshot`].
#[derive(Debug, Clone)]
pub struct CacheStatsSnapshot {
    pub hits: u64,
    pub misses: u64,
    pub evictions: u64,
    pub entries: u64,
}
83
/// Concurrent, TTL-based cache of parsed robots.txt policies with
/// optional JSON persistence to a directory on disk.
pub struct RobotsCache {
    // Concurrent map from (scheme, authority) key to parsed policy.
    cache: Arc<DashMap<RobotsCacheKey, RobotsPolicy>>,
    // TTL for new entries; clamped to MAX_CACHE_TTL at construction.
    default_ttl: Duration,
    // Directory used by save_to_disk/load_from_disk; None disables
    // persistence (both become no-ops returning Ok(0)).
    persist_dir: Option<String>,
    // Shared hit/miss/eviction counters plus entry-count gauge.
    stats: Arc<CacheStats>,
}
95
96impl Default for RobotsCache {
97 fn default() -> Self {
98 Self::new(DEFAULT_CACHE_TTL)
99 }
100}
101
102impl RobotsCache {
103 pub fn new(default_ttl: Duration) -> Self {
105 let default_ttl = default_ttl.min(MAX_CACHE_TTL);
107
108 Self {
109 cache: Arc::new(DashMap::new()),
110 default_ttl,
111 persist_dir: None,
112 stats: Arc::new(CacheStats::default()),
113 }
114 }
115
116 pub fn with_persistence(default_ttl: Duration, persist_dir: &str) -> Self {
118 let default_ttl = default_ttl.min(MAX_CACHE_TTL);
119
120 Self {
121 cache: Arc::new(DashMap::new()),
122 default_ttl,
123 persist_dir: Some(persist_dir.to_string()),
124 stats: Arc::new(CacheStats::default()),
125 }
126 }
127
128 pub fn default_ttl(&self) -> Duration {
130 self.default_ttl
131 }
132
133 pub fn stats(&self) -> Arc<CacheStats> {
135 self.stats.clone()
136 }
137
138 pub fn get(&self, key: &RobotsCacheKey) -> Option<RobotsPolicy> {
140 if let Some(entry) = self.cache.get(key) {
141 if !entry.is_expired() {
142 self.stats.record_hit();
143 debug!("Cache hit for {}", key.robots_url());
144 return Some(entry.clone());
145 }
146 drop(entry);
148 self.cache.remove(key);
149 self.stats.record_eviction();
150 }
151
152 self.stats.record_miss();
153 debug!("Cache miss for {}", key.robots_url());
154 None
155 }
156
157 pub fn insert(&self, key: RobotsCacheKey, policy: RobotsPolicy) {
159 let old = self.cache.insert(key.clone(), policy);
160 if old.is_none() {
161 self.stats.entries.fetch_add(1, Ordering::Relaxed);
162 }
163 debug!("Cached robots.txt for {}", key.robots_url());
164 }
165
166 pub fn remove(&self, key: &RobotsCacheKey) -> Option<RobotsPolicy> {
168 let removed = self.cache.remove(key).map(|(_, v)| v);
169 if removed.is_some() {
170 self.stats.entries.fetch_sub(1, Ordering::Relaxed);
171 self.stats.record_eviction();
172 }
173 removed
174 }
175
176 pub fn evict_expired(&self) -> usize {
178 let mut evicted = 0;
179 self.cache.retain(|_, policy| {
180 if policy.is_expired() {
181 evicted += 1;
182 false
183 } else {
184 true
185 }
186 });
187
188 if evicted > 0 {
189 self.stats.entries.fetch_sub(evicted as u64, Ordering::Relaxed);
190 self.stats.evictions.fetch_add(evicted as u64, Ordering::Relaxed);
191 info!("Evicted {} expired robots.txt entries", evicted);
192 }
193
194 evicted
195 }
196
197 pub fn clear(&self) {
199 let count = self.cache.len();
200 self.cache.clear();
201 self.stats.entries.store(0, Ordering::Relaxed);
202 info!("Cleared {} robots.txt cache entries", count);
203 }
204
205 pub fn len(&self) -> usize {
207 self.cache.len()
208 }
209
210 pub fn is_empty(&self) -> bool {
212 self.cache.is_empty()
213 }
214
215 pub fn domains(&self) -> Vec<String> {
217 self.cache
218 .iter()
219 .map(|entry| entry.key().authority.clone())
220 .collect()
221 }
222
223 pub async fn save_to_disk(&self) -> std::io::Result<usize> {
225 let persist_dir = match &self.persist_dir {
226 Some(dir) => dir,
227 None => return Ok(0),
228 };
229
230 fs::create_dir_all(persist_dir).await?;
232
233 let mut saved = 0;
234 for entry in self.cache.iter() {
235 let key = entry.key();
236 let policy = entry.value();
237
238 if policy.is_expired() {
240 continue;
241 }
242
243 let filename = self.cache_filename(key);
244 let filepath = Path::new(persist_dir).join(&filename);
245
246 if let Ok(json) = serde_json::to_string_pretty(&CacheEntry {
248 key: key.clone(),
249 groups: policy.groups.clone(),
250 sitemaps: policy.sitemaps.clone(),
251 content_size: policy.content_size,
252 ttl_secs: policy.ttl().as_secs(),
253 }) {
254 if let Ok(mut file) = fs::File::create(&filepath).await {
255 if file.write_all(json.as_bytes()).await.is_ok() {
256 saved += 1;
257 }
258 }
259 }
260 }
261
262 info!("Saved {} robots.txt entries to disk", saved);
263 Ok(saved)
264 }
265
266 pub async fn load_from_disk(&self) -> std::io::Result<usize> {
268 let persist_dir = match &self.persist_dir {
269 Some(dir) => dir,
270 None => return Ok(0),
271 };
272
273 let path = Path::new(persist_dir);
274 if !path.exists() {
275 return Ok(0);
276 }
277
278 let mut loaded = 0;
279 let mut entries = fs::read_dir(persist_dir).await?;
280
281 while let Some(entry) = entries.next_entry().await? {
282 let filepath = entry.path();
283 if filepath.extension().is_some_and(|ext| ext == "json") {
284 if let Ok(mut file) = fs::File::open(&filepath).await {
285 let mut content = String::new();
286 if file.read_to_string(&mut content).await.is_ok() {
287 if let Ok(cache_entry) = serde_json::from_str::<CacheEntry>(&content) {
288 let ttl = Duration::from_secs(cache_entry.ttl_secs);
290 if ttl > Duration::ZERO {
291 let now = std::time::SystemTime::now()
292 .duration_since(std::time::UNIX_EPOCH)
293 .unwrap_or_default()
294 .as_millis() as u64;
295 let policy = RobotsPolicy {
296 fetched_at_ms: now,
297 expires_at_ms: now + ttl.as_millis() as u64,
298 fetch_status: crate::types::FetchStatus::Success,
299 groups: cache_entry.groups,
300 sitemaps: cache_entry.sitemaps,
301 content_size: cache_entry.content_size,
302 etag: None,
303 last_modified: None,
304 };
305 self.insert(cache_entry.key, policy);
306 loaded += 1;
307 }
308 }
309 }
310 }
311 }
312 }
313
314 info!("Loaded {} robots.txt entries from disk", loaded);
315 Ok(loaded)
316 }
317
318 fn cache_filename(&self, key: &RobotsCacheKey) -> String {
320 let combined = format!("{}_{}", key.scheme, key.authority);
322 let encoded = base64_encode(&combined);
323 format!("{}.json", encoded)
324 }
325}
326
/// On-disk JSON representation of one cached policy.
///
/// Only the TTL duration is stored — not the original fetch or expiry
/// timestamps — so `load_from_disk` restarts the expiry clock when the
/// entry is read back.
#[derive(serde::Serialize, serde::Deserialize)]
struct CacheEntry {
    // Cache key (scheme + authority) the policy was stored under.
    key: RobotsCacheKey,
    // Parsed user-agent rule groups.
    groups: Vec<crate::types::Group>,
    // Sitemap URLs declared in the robots.txt body.
    sitemaps: Vec<String>,
    // Size in bytes of the original robots.txt content.
    content_size: usize,
    // Policy TTL in whole seconds; entries with 0 are skipped on load.
    ttl_secs: u64,
}
336
/// Encodes `s`'s UTF-8 bytes as lowercase hexadecimal ASCII.
///
/// NOTE(review): despite the name, this has always produced hex, not
/// base64. The name is kept because the output format determines the
/// on-disk cache filenames (see `cache_filename`); consider renaming to
/// `hex_encode` in a coordinated change.
fn base64_encode(s: &str) -> String {
    use std::fmt::Write;

    // Each input byte expands to exactly two hex digits, so pre-sizing
    // avoids reallocation — and the single buffer replaces the previous
    // `.map(format!).collect()` chain, which allocated a fresh String
    // for every byte.
    let mut out = String::with_capacity(s.len() * 2);
    for b in s.bytes() {
        // `write!` into a String cannot fail.
        let _ = write!(out, "{b:02x}");
    }
    out
}
344
345pub fn parse_cache_control(header: &str) -> Option<Duration> {
347 for directive in header.split(',') {
348 let directive = directive.trim();
349 if let Some(value) = directive.strip_prefix("max-age=") {
350 if let Ok(secs) = value.trim().parse::<u64>() {
351 return Some(Duration::from_secs(secs).min(MAX_CACHE_TTL));
352 }
353 }
354 }
355 None
356}
357
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::RobotsParser;

    // Builds a minimal parsed policy ("Disallow: /admin" for all agents)
    // with a one-hour TTL, used as the value in the cache tests below.
    fn create_test_policy() -> RobotsPolicy {
        let parser = RobotsParser::new();
        parser.parse("User-agent: *\nDisallow: /admin", Duration::from_secs(3600))
    }

    // An inserted entry is retrievable and reflected in len().
    #[test]
    fn test_cache_insert_get() {
        let cache = RobotsCache::new(Duration::from_secs(3600));
        let key = RobotsCacheKey {
            scheme: "https".to_string(),
            authority: "example.com".to_string(),
        };
        let policy = create_test_policy();

        cache.insert(key.clone(), policy);

        assert!(cache.get(&key).is_some());
        assert_eq!(cache.len(), 1);
    }

    // Looking up a never-inserted key yields None.
    #[test]
    fn test_cache_miss() {
        let cache = RobotsCache::new(Duration::from_secs(3600));
        let key = RobotsCacheKey {
            scheme: "https".to_string(),
            authority: "example.com".to_string(),
        };

        assert!(cache.get(&key).is_none());
    }

    // One miss (empty cache) then one hit (after insert) are both
    // recorded in the stats snapshot.
    #[test]
    fn test_cache_stats() {
        let cache = RobotsCache::new(Duration::from_secs(3600));
        let key = RobotsCacheKey {
            scheme: "https".to_string(),
            authority: "example.com".to_string(),
        };

        // First lookup misses: nothing has been inserted yet.
        cache.get(&key);

        cache.insert(key.clone(), create_test_policy());
        // Second lookup hits the freshly inserted entry.
        cache.get(&key);

        let stats = cache.stats().snapshot();
        assert_eq!(stats.hits, 1);
        assert_eq!(stats.misses, 1);
    }

    // clear() empties the cache entirely.
    #[test]
    fn test_cache_clear() {
        let cache = RobotsCache::new(Duration::from_secs(3600));
        let key = RobotsCacheKey {
            scheme: "https".to_string(),
            authority: "example.com".to_string(),
        };

        cache.insert(key, create_test_policy());
        assert_eq!(cache.len(), 1);

        cache.clear();
        assert!(cache.is_empty());
    }

    // max-age is extracted (with or without other directives), absent
    // max-age yields None, and oversized values clamp to MAX_CACHE_TTL.
    #[test]
    fn test_parse_cache_control() {
        assert_eq!(
            parse_cache_control("max-age=3600"),
            Some(Duration::from_secs(3600))
        );
        assert_eq!(
            parse_cache_control("public, max-age=7200"),
            Some(Duration::from_secs(7200))
        );
        assert_eq!(
            parse_cache_control("no-cache"),
            None
        );
        assert_eq!(
            parse_cache_control("max-age=999999"),
            Some(MAX_CACHE_TTL)
        );
    }

    // A constructor TTL above the ceiling is clamped to MAX_CACHE_TTL.
    #[test]
    fn test_max_ttl_enforcement() {
        let cache = RobotsCache::new(Duration::from_secs(100000));
        assert_eq!(cache.default_ttl(), MAX_CACHE_TTL);
    }
}