// essence/cache/mod.rs

//! Smart caching layer with content, robots.txt, and redirect caching.
//!
//! This module provides an in-memory caching system built on the `moka` crate,
//! with a separate TTL per cache and a cache-first lookup strategy.
5
6use blake3::Hasher;
7use moka::future::Cache;
8use serde::{Deserialize, Serialize};
9use std::sync::Arc;
10use std::time::Duration;
11use tracing::{debug, info};
12
/// Tunable settings for the cache layer.
///
/// All time-to-live values are expressed in whole seconds.
#[derive(Clone, Debug)]
pub struct CacheConfig {
    /// Time-to-live for cached page content, in seconds (default: 1 hour).
    pub content_ttl_secs: u64,
    /// Time-to-live for cached robots.txt files, in seconds (default: 24 hours).
    pub robots_ttl_secs: u64,
    /// Time-to-live for cached redirect mappings, in seconds (default: 1 hour).
    pub redirect_ttl_secs: u64,
    /// Maximum number of cache entries (default: 10,000).
    pub max_capacity: u64,
}

impl Default for CacheConfig {
    /// Builds the stock configuration: one-hour content and redirect TTLs,
    /// a 24-hour robots.txt TTL, and room for 10,000 entries.
    fn default() -> Self {
        const HOUR_SECS: u64 = 60 * 60;

        CacheConfig {
            content_ttl_secs: HOUR_SECS,
            robots_ttl_secs: 24 * HOUR_SECS,
            redirect_ttl_secs: HOUR_SECS,
            max_capacity: 10_000,
        }
    }
}
36
37/// Cached content with metadata
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct CachedContent {
40    pub html: String,
41    pub status_code: u16,
42    pub content_type: Option<String>,
43    pub headers: Vec<(String, String)>,
44    pub cached_at: u64,
45}
46
47/// Cached robots.txt content
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct CachedRobots {
50    pub content: String,
51    pub cached_at: u64,
52}
53
54/// Cached redirect mapping
55#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct CachedRedirect {
57    pub target_url: String,
58    pub status_code: u16,
59    pub cached_at: u64,
60}
61
/// Hit and miss counters for each of the three caches.
///
/// Every counter starts at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct CacheMetrics {
    /// Content lookups answered from the cache.
    pub content_hits: u64,
    /// Content lookups that had to invoke the fetch function.
    pub content_misses: u64,
    /// robots.txt lookups answered from the cache.
    pub robots_hits: u64,
    /// robots.txt lookups that had to invoke the fetch function.
    pub robots_misses: u64,
    /// Redirect lookups answered from the cache.
    pub redirect_hits: u64,
    /// Redirect lookups that had to invoke the fetch function.
    pub redirect_misses: u64,
}
72
73/// Main cache layer with three specialized caches
74pub struct CacheLayer {
75    /// Content cache: maps URL+headers -> cached HTML/content
76    content_cache: Cache<String, CachedContent>,
77    /// Robots.txt cache: maps domain -> robots.txt content
78    robots_cache: Cache<String, CachedRobots>,
79    /// Redirect cache: maps URL -> target URL
80    redirect_cache: Cache<String, CachedRedirect>,
81    /// Cache metrics
82    metrics: Arc<tokio::sync::RwLock<CacheMetrics>>,
83    /// Configuration
84    #[allow(dead_code)]
85    config: CacheConfig,
86}
87
88impl CacheLayer {
89    /// Create a new cache layer with default configuration
90    pub fn new() -> Self {
91        Self::with_config(CacheConfig::default())
92    }
93
94    /// Create a new cache layer with custom configuration
95    pub fn with_config(config: CacheConfig) -> Self {
96        info!(
97            "Initializing cache layer: content_ttl={}s, robots_ttl={}s, redirect_ttl={}s, max_capacity={}",
98            config.content_ttl_secs, config.robots_ttl_secs, config.redirect_ttl_secs, config.max_capacity
99        );
100
101        let content_cache = Cache::builder()
102            .max_capacity(config.max_capacity)
103            .time_to_live(Duration::from_secs(config.content_ttl_secs))
104            .build();
105
106        let robots_cache = Cache::builder()
107            .max_capacity(config.max_capacity / 10) // Fewer robots.txt entries
108            .time_to_live(Duration::from_secs(config.robots_ttl_secs))
109            .build();
110
111        let redirect_cache = Cache::builder()
112            .max_capacity(config.max_capacity / 10) // Fewer redirect entries
113            .time_to_live(Duration::from_secs(config.redirect_ttl_secs))
114            .build();
115
116        Self {
117            content_cache,
118            robots_cache,
119            redirect_cache,
120            metrics: Arc::new(tokio::sync::RwLock::new(CacheMetrics::default())),
121            config,
122        }
123    }
124
125    /// Generate a cache key from URL and optional headers
126    ///
127    /// Uses BLAKE3 hashing for fast, collision-resistant cache keys.
128    ///
129    /// # Example
130    /// ```ignore
131    /// let key = CacheLayer::generate_cache_key("https://example.com", None);
132    /// ```
133    pub fn generate_cache_key(url: &str, headers: Option<&[(String, String)]>) -> String {
134        let mut hasher = Hasher::new();
135        hasher.update(url.as_bytes());
136        
137        if let Some(headers) = headers {
138            for (key, value) in headers {
139                hasher.update(key.as_bytes());
140                hasher.update(value.as_bytes());
141            }
142        }
143        
144        hasher.finalize().to_hex().to_string()
145    }
146
147    /// Get cached content or fetch it using the provided async function
148    ///
149    /// This implements a cache-first strategy where the cache is checked first,
150    /// and only if there's a miss does it call the fetch function.
151    ///
152    /// # Example
153    /// ```ignore
154    /// let content = cache.get_or_fetch_content(
155    ///     "https://example.com",
156    ///     None,
157    ///     || async {
158    ///         // Fetch logic here
159    ///         Ok(CachedContent { ... })
160    ///     }
161    /// ).await?;
162    /// ```
163    pub async fn get_or_fetch_content<F, Fut>(
164        &self,
165        url: &str,
166        headers: Option<&[(String, String)]>,
167        fetch_fn: F,
168    ) -> Result<CachedContent, crate::error::ScrapeError>
169    where
170        F: FnOnce() -> Fut,
171        Fut: std::future::Future<Output = Result<CachedContent, crate::error::ScrapeError>>,
172    {
173        let cache_key = Self::generate_cache_key(url, headers);
174
175        // Try to get from cache
176        if let Some(cached) = self.content_cache.get(&cache_key).await {
177            debug!("Cache hit for URL: {}", url);
178            let mut metrics_data = self.metrics.write().await;
179            metrics_data.content_hits += 1;
180            drop(metrics_data);
181
182
183            return Ok(cached);
184        }
185
186        // Cache miss - fetch content
187        debug!("Cache miss for URL: {}", url);
188        let mut metrics_data = self.metrics.write().await;
189        metrics_data.content_misses += 1;
190        drop(metrics_data);
191
192
193        let content = fetch_fn().await?;
194        
195        // Store in cache
196        self.content_cache.insert(cache_key, content.clone()).await;
197        
198        Ok(content)
199    }
200
201    /// Get cached robots.txt or fetch it
202    pub async fn get_or_fetch_robots<F, Fut>(
203        &self,
204        domain: &str,
205        fetch_fn: F,
206    ) -> Result<CachedRobots, crate::error::ScrapeError>
207    where
208        F: FnOnce() -> Fut,
209        Fut: std::future::Future<Output = Result<CachedRobots, crate::error::ScrapeError>>,
210    {
211        // Try to get from cache
212        if let Some(cached) = self.robots_cache.get(domain).await {
213            debug!("Cache hit for robots.txt: {}", domain);
214            let mut metrics_data = self.metrics.write().await;
215            metrics_data.robots_hits += 1;
216            drop(metrics_data);
217
218
219            return Ok(cached);
220        }
221
222        // Cache miss - fetch robots.txt
223        debug!("Cache miss for robots.txt: {}", domain);
224        let mut metrics_data = self.metrics.write().await;
225        metrics_data.robots_misses += 1;
226        drop(metrics_data);
227
228
229        let robots = fetch_fn().await?;
230        
231        // Store in cache
232        self.robots_cache.insert(domain.to_string(), robots.clone()).await;
233        
234        Ok(robots)
235    }
236
237    /// Get cached redirect or fetch it
238    pub async fn get_or_fetch_redirect<F, Fut>(
239        &self,
240        url: &str,
241        fetch_fn: F,
242    ) -> Result<Option<CachedRedirect>, crate::error::ScrapeError>
243    where
244        F: FnOnce() -> Fut,
245        Fut: std::future::Future<Output = Result<Option<CachedRedirect>, crate::error::ScrapeError>>,
246    {
247        // Try to get from cache
248        if let Some(cached) = self.redirect_cache.get(url).await {
249            debug!("Cache hit for redirect: {}", url);
250            let mut metrics_data = self.metrics.write().await;
251            metrics_data.redirect_hits += 1;
252            drop(metrics_data);
253
254
255            return Ok(Some(cached));
256        }
257
258        // Cache miss - fetch redirect
259        debug!("Cache miss for redirect: {}", url);
260        let mut metrics_data = self.metrics.write().await;
261        metrics_data.redirect_misses += 1;
262        drop(metrics_data);
263
264
265        let redirect = fetch_fn().await?;
266        
267        // Store in cache if redirect exists
268        if let Some(ref redir) = redirect {
269            self.redirect_cache.insert(url.to_string(), redir.clone()).await;
270        }
271        
272        Ok(redirect)
273    }
274
275    /// Get current cache metrics
276    pub async fn get_metrics(&self) -> CacheMetrics {
277        self.metrics.read().await.clone()
278    }
279
280    /// Get cache statistics
281    pub async fn get_stats(&self) -> CacheStats {
282        let metrics_data = self.metrics.read().await;
283
284        let content_size = self.content_cache.entry_count();
285        let robots_size = self.robots_cache.entry_count();
286        let redirect_size = self.redirect_cache.entry_count();
287
288        CacheStats {
289            content_size,
290            robots_size,
291            redirect_size,
292            content_hits: metrics_data.content_hits,
293            content_misses: metrics_data.content_misses,
294            robots_hits: metrics_data.robots_hits,
295            robots_misses: metrics_data.robots_misses,
296            redirect_hits: metrics_data.redirect_hits,
297            redirect_misses: metrics_data.redirect_misses,
298        }
299    }
300
301    /// Clear all caches
302    pub async fn clear_all(&self) {
303        self.content_cache.invalidate_all();
304        self.robots_cache.invalidate_all();
305        self.redirect_cache.invalidate_all();
306        info!("All caches cleared");
307    }
308
309    /// Clear content cache only
310    pub async fn clear_content(&self) {
311        self.content_cache.invalidate_all();
312        info!("Content cache cleared");
313    }
314
315    /// Clear robots cache only
316    pub async fn clear_robots(&self) {
317        self.robots_cache.invalidate_all();
318        info!("Robots cache cleared");
319    }
320
321    /// Clear redirect cache only
322    pub async fn clear_redirect(&self) {
323        self.redirect_cache.invalidate_all();
324        info!("Redirect cache cleared");
325    }
326}
327
328impl Default for CacheLayer {
329    fn default() -> Self {
330        Self::new()
331    }
332}
333
334/// Cache statistics
335#[derive(Debug, Clone, Serialize)]
336pub struct CacheStats {
337    pub content_size: u64,
338    pub robots_size: u64,
339    pub redirect_size: u64,
340    pub content_hits: u64,
341    pub content_misses: u64,
342    pub robots_hits: u64,
343    pub robots_misses: u64,
344    pub redirect_hits: u64,
345    pub redirect_misses: u64,
346}
347
348impl CacheStats {
349    /// Calculate content cache hit rate
350    pub fn content_hit_rate(&self) -> f64 {
351        let total = self.content_hits + self.content_misses;
352        if total == 0 {
353            0.0
354        } else {
355            self.content_hits as f64 / total as f64
356        }
357    }
358
359    /// Calculate robots cache hit rate
360    pub fn robots_hit_rate(&self) -> f64 {
361        let total = self.robots_hits + self.robots_misses;
362        if total == 0 {
363            0.0
364        } else {
365            self.robots_hits as f64 / total as f64
366        }
367    }
368
369    /// Calculate redirect cache hit rate
370    pub fn redirect_hit_rate(&self) -> f64 {
371        let total = self.redirect_hits + self.redirect_misses;
372        if total == 0 {
373            0.0
374        } else {
375            self.redirect_hits as f64 / total as f64
376        }
377    }
378}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal successful page used as the fetch result in the caching test.
    fn sample_page() -> CachedContent {
        CachedContent {
            html: "<html></html>".to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            headers: vec![],
            cached_at: 0,
        }
    }

    /// Keys must be stable for equal inputs and differ when the headers differ.
    #[test]
    fn test_cache_key_generation() {
        let url = "https://example.com";
        let user_agent = |value: &str| vec![("User-Agent".to_string(), value.to_string())];

        let first = user_agent("test");
        let same = user_agent("test");
        let other = user_agent("different");

        let first_key = CacheLayer::generate_cache_key(url, Some(&first));
        let same_key = CacheLayer::generate_cache_key(url, Some(&same));
        let other_key = CacheLayer::generate_cache_key(url, Some(&other));
        let bare_key = CacheLayer::generate_cache_key(url, None);

        assert_eq!(first_key, same_key, "Same URL and headers should produce same key");
        assert_ne!(first_key, other_key, "Different headers should produce different keys");
        assert_ne!(first_key, bare_key, "With and without headers should differ");
    }

    /// A freshly built layer reports three empty caches.
    #[tokio::test]
    async fn test_cache_layer_creation() {
        let layer = CacheLayer::new();
        let snapshot = layer.get_stats().await;

        assert_eq!(snapshot.content_size, 0);
        assert_eq!(snapshot.robots_size, 0);
        assert_eq!(snapshot.redirect_size, 0);
    }

    /// The fetch function runs on the first lookup only; the second is a hit.
    #[tokio::test]
    async fn test_content_caching() {
        let layer = CacheLayer::new();
        let url = "https://example.com";

        let mut fetch_calls = 0;

        // First lookup: nothing cached yet, so the fetch function must run.
        let _first = layer
            .get_or_fetch_content(url, None, || async {
                fetch_calls += 1;
                Ok(sample_page())
            })
            .await
            .unwrap();

        assert_eq!(fetch_calls, 1);

        // Second lookup: served from the cache, the fetch function stays idle.
        let _second = layer
            .get_or_fetch_content(url, None, || async {
                fetch_calls += 1;
                Ok(sample_page())
            })
            .await
            .unwrap();

        assert_eq!(fetch_calls, 1, "Fetch function should not be called on cache hit");

        let snapshot = layer.get_stats().await;
        assert_eq!(snapshot.content_hits, 1);
        assert_eq!(snapshot.content_misses, 1);
    }
}
450}