1use blake3::Hasher;
7use moka::future::Cache;
8use serde::{Deserialize, Serialize};
9use std::sync::Arc;
10use std::time::Duration;
11use tracing::{debug, info};
12
/// Tunable settings for the cache layer.
///
/// All time-to-live values are expressed in whole seconds.
#[derive(Clone, Debug)]
pub struct CacheConfig {
    /// Time-to-live, in seconds, for cached page content.
    pub content_ttl_secs: u64,
    /// Time-to-live, in seconds, for cached robots.txt bodies.
    pub robots_ttl_secs: u64,
    /// Time-to-live, in seconds, for cached redirect resolutions.
    pub redirect_ttl_secs: u64,
    /// Maximum capacity of the content cache. The robots and redirect caches
    /// are sized from this value as well (roughly one tenth of it).
    pub max_capacity: u64,
}

impl Default for CacheConfig {
    /// Returns the stock configuration: one hour for content and redirects,
    /// one day for robots.txt, and a content capacity of 10 000.
    fn default() -> Self {
        const HOUR_SECS: u64 = 60 * 60;
        const DAY_SECS: u64 = 24 * HOUR_SECS;

        CacheConfig {
            content_ttl_secs: HOUR_SECS,
            robots_ttl_secs: DAY_SECS,
            redirect_ttl_secs: HOUR_SECS,
            max_capacity: 10_000,
        }
    }
}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct CachedContent {
40 pub html: String,
41 pub status_code: u16,
42 pub content_type: Option<String>,
43 pub headers: Vec<(String, String)>,
44 pub cached_at: u64,
45}
46
47#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct CachedRobots {
50 pub content: String,
51 pub cached_at: u64,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
56pub struct CachedRedirect {
57 pub target_url: String,
58 pub status_code: u16,
59 pub cached_at: u64,
60}
61
/// Running hit and miss counters for each of the three caches.
///
/// Every counter starts at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct CacheMetrics {
    /// Content lookups answered from the cache.
    pub content_hits: u64,
    /// Content lookups that found nothing cached.
    pub content_misses: u64,
    /// robots.txt lookups answered from the cache.
    pub robots_hits: u64,
    /// robots.txt lookups that found nothing cached.
    pub robots_misses: u64,
    /// Redirect lookups answered from the cache.
    pub redirect_hits: u64,
    /// Redirect lookups that found nothing cached.
    pub redirect_misses: u64,
}
72
73pub struct CacheLayer {
75 content_cache: Cache<String, CachedContent>,
77 robots_cache: Cache<String, CachedRobots>,
79 redirect_cache: Cache<String, CachedRedirect>,
81 metrics: Arc<tokio::sync::RwLock<CacheMetrics>>,
83 #[allow(dead_code)]
85 config: CacheConfig,
86}
87
88impl CacheLayer {
89 pub fn new() -> Self {
91 Self::with_config(CacheConfig::default())
92 }
93
94 pub fn with_config(config: CacheConfig) -> Self {
96 info!(
97 "Initializing cache layer: content_ttl={}s, robots_ttl={}s, redirect_ttl={}s, max_capacity={}",
98 config.content_ttl_secs, config.robots_ttl_secs, config.redirect_ttl_secs, config.max_capacity
99 );
100
101 let content_cache = Cache::builder()
102 .max_capacity(config.max_capacity)
103 .time_to_live(Duration::from_secs(config.content_ttl_secs))
104 .build();
105
106 let robots_cache = Cache::builder()
107 .max_capacity(config.max_capacity / 10) .time_to_live(Duration::from_secs(config.robots_ttl_secs))
109 .build();
110
111 let redirect_cache = Cache::builder()
112 .max_capacity(config.max_capacity / 10) .time_to_live(Duration::from_secs(config.redirect_ttl_secs))
114 .build();
115
116 Self {
117 content_cache,
118 robots_cache,
119 redirect_cache,
120 metrics: Arc::new(tokio::sync::RwLock::new(CacheMetrics::default())),
121 config,
122 }
123 }
124
125 pub fn generate_cache_key(url: &str, headers: Option<&[(String, String)]>) -> String {
134 let mut hasher = Hasher::new();
135 hasher.update(url.as_bytes());
136
137 if let Some(headers) = headers {
138 for (key, value) in headers {
139 hasher.update(key.as_bytes());
140 hasher.update(value.as_bytes());
141 }
142 }
143
144 hasher.finalize().to_hex().to_string()
145 }
146
147 pub async fn get_or_fetch_content<F, Fut>(
164 &self,
165 url: &str,
166 headers: Option<&[(String, String)]>,
167 fetch_fn: F,
168 ) -> Result<CachedContent, crate::error::ScrapeError>
169 where
170 F: FnOnce() -> Fut,
171 Fut: std::future::Future<Output = Result<CachedContent, crate::error::ScrapeError>>,
172 {
173 let cache_key = Self::generate_cache_key(url, headers);
174
175 if let Some(cached) = self.content_cache.get(&cache_key).await {
177 debug!("Cache hit for URL: {}", url);
178 let mut metrics_data = self.metrics.write().await;
179 metrics_data.content_hits += 1;
180 drop(metrics_data);
181
182
183 return Ok(cached);
184 }
185
186 debug!("Cache miss for URL: {}", url);
188 let mut metrics_data = self.metrics.write().await;
189 metrics_data.content_misses += 1;
190 drop(metrics_data);
191
192
193 let content = fetch_fn().await?;
194
195 self.content_cache.insert(cache_key, content.clone()).await;
197
198 Ok(content)
199 }
200
201 pub async fn get_or_fetch_robots<F, Fut>(
203 &self,
204 domain: &str,
205 fetch_fn: F,
206 ) -> Result<CachedRobots, crate::error::ScrapeError>
207 where
208 F: FnOnce() -> Fut,
209 Fut: std::future::Future<Output = Result<CachedRobots, crate::error::ScrapeError>>,
210 {
211 if let Some(cached) = self.robots_cache.get(domain).await {
213 debug!("Cache hit for robots.txt: {}", domain);
214 let mut metrics_data = self.metrics.write().await;
215 metrics_data.robots_hits += 1;
216 drop(metrics_data);
217
218
219 return Ok(cached);
220 }
221
222 debug!("Cache miss for robots.txt: {}", domain);
224 let mut metrics_data = self.metrics.write().await;
225 metrics_data.robots_misses += 1;
226 drop(metrics_data);
227
228
229 let robots = fetch_fn().await?;
230
231 self.robots_cache.insert(domain.to_string(), robots.clone()).await;
233
234 Ok(robots)
235 }
236
237 pub async fn get_or_fetch_redirect<F, Fut>(
239 &self,
240 url: &str,
241 fetch_fn: F,
242 ) -> Result<Option<CachedRedirect>, crate::error::ScrapeError>
243 where
244 F: FnOnce() -> Fut,
245 Fut: std::future::Future<Output = Result<Option<CachedRedirect>, crate::error::ScrapeError>>,
246 {
247 if let Some(cached) = self.redirect_cache.get(url).await {
249 debug!("Cache hit for redirect: {}", url);
250 let mut metrics_data = self.metrics.write().await;
251 metrics_data.redirect_hits += 1;
252 drop(metrics_data);
253
254
255 return Ok(Some(cached));
256 }
257
258 debug!("Cache miss for redirect: {}", url);
260 let mut metrics_data = self.metrics.write().await;
261 metrics_data.redirect_misses += 1;
262 drop(metrics_data);
263
264
265 let redirect = fetch_fn().await?;
266
267 if let Some(ref redir) = redirect {
269 self.redirect_cache.insert(url.to_string(), redir.clone()).await;
270 }
271
272 Ok(redirect)
273 }
274
275 pub async fn get_metrics(&self) -> CacheMetrics {
277 self.metrics.read().await.clone()
278 }
279
280 pub async fn get_stats(&self) -> CacheStats {
282 let metrics_data = self.metrics.read().await;
283
284 let content_size = self.content_cache.entry_count();
285 let robots_size = self.robots_cache.entry_count();
286 let redirect_size = self.redirect_cache.entry_count();
287
288 CacheStats {
289 content_size,
290 robots_size,
291 redirect_size,
292 content_hits: metrics_data.content_hits,
293 content_misses: metrics_data.content_misses,
294 robots_hits: metrics_data.robots_hits,
295 robots_misses: metrics_data.robots_misses,
296 redirect_hits: metrics_data.redirect_hits,
297 redirect_misses: metrics_data.redirect_misses,
298 }
299 }
300
301 pub async fn clear_all(&self) {
303 self.content_cache.invalidate_all();
304 self.robots_cache.invalidate_all();
305 self.redirect_cache.invalidate_all();
306 info!("All caches cleared");
307 }
308
309 pub async fn clear_content(&self) {
311 self.content_cache.invalidate_all();
312 info!("Content cache cleared");
313 }
314
315 pub async fn clear_robots(&self) {
317 self.robots_cache.invalidate_all();
318 info!("Robots cache cleared");
319 }
320
321 pub async fn clear_redirect(&self) {
323 self.redirect_cache.invalidate_all();
324 info!("Redirect cache cleared");
325 }
326}
327
328impl Default for CacheLayer {
329 fn default() -> Self {
330 Self::new()
331 }
332}
333
334#[derive(Debug, Clone, Serialize)]
336pub struct CacheStats {
337 pub content_size: u64,
338 pub robots_size: u64,
339 pub redirect_size: u64,
340 pub content_hits: u64,
341 pub content_misses: u64,
342 pub robots_hits: u64,
343 pub robots_misses: u64,
344 pub redirect_hits: u64,
345 pub redirect_misses: u64,
346}
347
348impl CacheStats {
349 pub fn content_hit_rate(&self) -> f64 {
351 let total = self.content_hits + self.content_misses;
352 if total == 0 {
353 0.0
354 } else {
355 self.content_hits as f64 / total as f64
356 }
357 }
358
359 pub fn robots_hit_rate(&self) -> f64 {
361 let total = self.robots_hits + self.robots_misses;
362 if total == 0 {
363 0.0
364 } else {
365 self.robots_hits as f64 / total as f64
366 }
367 }
368
369 pub fn redirect_hit_rate(&self) -> f64 {
371 let total = self.redirect_hits + self.redirect_misses;
372 if total == 0 {
373 0.0
374 } else {
375 self.redirect_hits as f64 / total as f64
376 }
377 }
378}
379
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal successful HTML response used as the fetch result in the
    /// content-caching test.
    fn sample_page() -> CachedContent {
        CachedContent {
            html: "<html></html>".to_string(),
            status_code: 200,
            content_type: Some("text/html".to_string()),
            headers: vec![],
            cached_at: 0,
        }
    }

    /// Keys must be stable for equal inputs and must change when a header
    /// value changes or when headers are present versus absent.
    #[test]
    fn test_cache_key_generation() {
        let url = "https://example.com";
        let user_agent = |value: &str| vec![("User-Agent".to_string(), value.to_string())];

        let headers1 = user_agent("test");
        let headers2 = user_agent("test");
        let headers3 = user_agent("different");

        let key1 = CacheLayer::generate_cache_key(url, Some(&headers1));
        let key2 = CacheLayer::generate_cache_key(url, Some(&headers2));
        let key3 = CacheLayer::generate_cache_key(url, Some(&headers3));
        let key_no_headers = CacheLayer::generate_cache_key(url, None);

        assert_eq!(key1, key2, "Same URL and headers should produce same key");
        assert_ne!(key1, key3, "Different headers should produce different keys");
        assert_ne!(key1, key_no_headers, "With and without headers should differ");
    }

    /// A freshly built layer reports three empty caches.
    #[tokio::test]
    async fn test_cache_layer_creation() {
        let stats = CacheLayer::new().get_stats().await;

        assert_eq!(stats.content_size, 0);
        assert_eq!(stats.robots_size, 0);
        assert_eq!(stats.redirect_size, 0);
    }

    /// The fetch closure runs on the first lookup only; the second lookup is
    /// served from the cache and recorded as a hit.
    #[tokio::test]
    async fn test_content_caching() {
        let cache = CacheLayer::new();
        let url = "https://example.com";

        let mut fetch_count = 0;

        // First lookup: nothing is cached yet, so the closure must run.
        let _content1 = cache
            .get_or_fetch_content(url, None, || async {
                fetch_count += 1;
                Ok(sample_page())
            })
            .await
            .unwrap();

        assert_eq!(fetch_count, 1);

        // Second lookup: same URL and headers, so the closure must be skipped.
        let _content2 = cache
            .get_or_fetch_content(url, None, || async {
                fetch_count += 1;
                Ok(sample_page())
            })
            .await
            .unwrap();

        assert_eq!(fetch_count, 1, "Fetch function should not be called on cache hit");

        let stats = cache.get_stats().await;
        assert_eq!(stats.content_hits, 1);
        assert_eq!(stats.content_misses, 1);
    }
}