Skip to main content

research_master/utils/
cache.rs

1//! Local caching for search results and other API responses.
2//!
3//! This module provides a file-based cache for storing search results,
4//! citation lookups, and other API responses to reduce network calls.
5//!
6//! # Cache Structure
7//!
//! ```text
//! ~/.cache/research-master/
//!   searches/
//!     <hash>
//!   citations/
//!     <hash>
//! ```
//!
//! Each cached item is a file named after its cache key (a hex MD5 digest, no
//! file extension) whose content is JSON: the cached data plus metadata.
17
18use crate::config::{CacheConfig, Config};
19use crate::models::{SearchQuery, SearchResponse};
20use serde::{Deserialize, Serialize};
21use std::fs;
22use std::path::{Path, PathBuf};
23use std::time::{Duration, SystemTime};
24
/// Bookkeeping stored next to every cached payload.
///
/// The struct is (de)serialized with serde, so field names are part of the
/// on-disk JSON format of existing cache files.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CacheMetadata {
    /// Time the entry was written, in seconds since the Unix epoch.
    cached_at: u64,

    /// Time from which the entry counts as expired, in seconds since the Unix
    /// epoch. Writers set it to the write time plus the configured TTL.
    expires_at: u64,

    /// Source ID that provided this data.
    source: String,

    /// Human-readable description of what was requested: the search query
    /// text, or `"citations for <paper_id>"` for citation lookups.
    query: String,
}
40
/// On-disk envelope for a cached [`SearchResponse`].
///
/// The same envelope is used for both search results and citation lookups.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CachedSearchResponse {
    /// Bookkeeping (timestamps, source, query description) for this entry.
    metadata: CacheMetadata,

    /// The response that was cached.
    response: SearchResponse,
}
50
/// Outcome of a cache lookup.
///
/// `Debug`, `Clone`, `PartialEq` and `Eq` are derived (each available when `T`
/// implements the same trait) so results can be logged and compared directly,
/// e.g. in `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CacheResult<T> {
    /// Item was found and is still valid; carries the cached value.
    Hit(T),

    /// Item was not found.
    Miss,

    /// Item was found but has expired.
    Expired,
}
62
/// File-based cache for search and citation responses.
///
/// All paths are derived once at construction time from the configured (or
/// default) base directory.
#[derive(Debug, Clone)]
pub struct CacheService {
    /// Root directory of the cache.
    base_dir: PathBuf,

    /// Directory holding cached search responses (`<base_dir>/searches`).
    search_dir: PathBuf,

    /// Directory holding cached citation lookups (`<base_dir>/citations`).
    citation_dir: PathBuf,

    /// Cache configuration: enabled flag, optional directory override and TTLs.
    config: CacheConfig,
}
78
79impl CacheService {
80    /// Create a new cache service with default config
81    pub fn new() -> Self {
82        Self::from_config(Config::default().cache)
83    }
84
85    /// Create a new cache service with the given config
86    pub fn from_config(config: CacheConfig) -> Self {
87        let base_dir = config
88            .directory
89            .clone()
90            .unwrap_or_else(crate::config::default_cache_dir);
91
92        let search_dir = base_dir.join("searches");
93        let citation_dir = base_dir.join("citations");
94
95        Self {
96            base_dir,
97            search_dir,
98            citation_dir,
99            config,
100        }
101    }
102
103    /// Initialize the cache directories
104    pub fn initialize(&self) -> std::io::Result<()> {
105        if self.config.enabled {
106            fs::create_dir_all(&self.search_dir)?;
107            fs::create_dir_all(&self.citation_dir)?;
108            tracing::info!("Cache initialized at: {}", self.base_dir.display());
109        } else {
110            tracing::debug!("Cache is disabled");
111        }
112        Ok(())
113    }
114
115    /// Check if caching is enabled
116    pub fn is_enabled(&self) -> bool {
117        self.config.enabled
118    }
119
120    /// Get the cache directory
121    pub fn cache_dir(&self) -> &PathBuf {
122        &self.base_dir
123    }
124
125    /// Generate a cache key for a search query
126    fn search_cache_key(
127        &self,
128        query: &str,
129        source: &str,
130        max_results: usize,
131        year: Option<&str>,
132        author: Option<&str>,
133        category: Option<&str>,
134    ) -> String {
135        let input = format!(
136            "{}|{}|{}|{}|{}|{}",
137            query,
138            source,
139            max_results,
140            year.unwrap_or_default(),
141            author.unwrap_or_default(),
142            category.unwrap_or_default()
143        );
144
145        let digest = md5::compute(input.as_bytes());
146        format!("{:x}", digest)
147    }
148
149    /// Generate a cache key for a citation lookup
150    fn citation_cache_key(&self, paper_id: &str, source: &str, max_results: usize) -> String {
151        let input = format!("{}|{}|{}", paper_id, source, max_results);
152        let digest = md5::compute(input.as_bytes());
153        format!("{:x}", digest)
154    }
155
156    /// Check if a cache entry is expired
157    fn is_expired(&self, expires_at: u64) -> bool {
158        let now = SystemTime::now()
159            .duration_since(SystemTime::UNIX_EPOCH)
160            .unwrap_or_default()
161            .as_secs();
162        now >= expires_at
163    }
164
165    /// Read a cached search response
166    pub fn get_search(&self, query: &SearchQuery, source: &str) -> CacheResult<SearchResponse> {
167        if !self.is_enabled() {
168            return CacheResult::Miss;
169        }
170
171        let key = self.search_cache_key(
172            &query.query,
173            source,
174            query.max_results,
175            query.year.as_deref(),
176            query.author.as_deref(),
177            query.category.as_deref(),
178        );
179
180        let cache_path = self.search_dir.join(&key);
181
182        match self.read_cache_file::<CachedSearchResponse>(&cache_path) {
183            Ok(cached) => {
184                if self.is_expired(cached.metadata.expires_at) {
185                    tracing::debug!("Cache expired for search: {}", key);
186                    CacheResult::Expired
187                } else {
188                    tracing::debug!("Cache HIT for search: {}", key);
189                    CacheResult::Hit(cached.response)
190                }
191            }
192            Err(_) => {
193                tracing::debug!("Cache MISS for search: {}", key);
194                CacheResult::Miss
195            }
196        }
197    }
198
199    /// Cache a search response
200    pub fn set_search(&self, source: &str, query: &SearchQuery, response: &SearchResponse) {
201        if !self.is_enabled() {
202            return;
203        }
204
205        let key = self.search_cache_key(
206            &query.query,
207            source,
208            query.max_results,
209            query.year.as_deref(),
210            query.author.as_deref(),
211            query.category.as_deref(),
212        );
213        let cache_path = self.search_dir.join(&key);
214
215        let cached = CachedSearchResponse {
216            metadata: CacheMetadata {
217                cached_at: SystemTime::now()
218                    .duration_since(SystemTime::UNIX_EPOCH)
219                    .unwrap_or_default()
220                    .as_secs(),
221                expires_at: SystemTime::now()
222                    .duration_since(SystemTime::UNIX_EPOCH)
223                    .unwrap_or_default()
224                    .as_secs()
225                    + self.config.search_ttl_seconds,
226                source: source.to_string(),
227                query: query.query.clone(),
228            },
229            response: response.clone(),
230        };
231
232        if let Err(e) = self.write_cache_file(&cache_path, &cached) {
233            tracing::warn!("Failed to cache search result: {}", e);
234        } else {
235            tracing::debug!("Cached search result: {}", key);
236        }
237    }
238
239    /// Read a cached citation lookup
240    pub fn get_citations(
241        &self,
242        paper_id: &str,
243        source: &str,
244        max_results: usize,
245    ) -> CacheResult<SearchResponse> {
246        if !self.is_enabled() {
247            return CacheResult::Miss;
248        }
249
250        let key = self.citation_cache_key(paper_id, source, max_results);
251        let cache_path = self.citation_dir.join(&key);
252
253        match self.read_cache_file::<CachedSearchResponse>(&cache_path) {
254            Ok(cached) => {
255                if self.is_expired(cached.metadata.expires_at) {
256                    tracing::debug!("Cache expired for citations: {}", key);
257                    CacheResult::Expired
258                } else {
259                    tracing::debug!("Cache HIT for citations: {}", key);
260                    CacheResult::Hit(cached.response)
261                }
262            }
263            Err(_) => {
264                tracing::debug!("Cache MISS for citations: {}", key);
265                CacheResult::Miss
266            }
267        }
268    }
269
270    /// Cache a citation lookup response
271    pub fn set_citations(&self, source: &str, paper_id: &str, response: &SearchResponse) {
272        if !self.is_enabled() {
273            return;
274        }
275
276        let key = self.citation_cache_key(paper_id, source, response.papers.len());
277        let cache_path = self.citation_dir.join(&key);
278
279        let cached = CachedSearchResponse {
280            metadata: CacheMetadata {
281                cached_at: SystemTime::now()
282                    .duration_since(SystemTime::UNIX_EPOCH)
283                    .unwrap_or_default()
284                    .as_secs(),
285                expires_at: SystemTime::now()
286                    .duration_since(SystemTime::UNIX_EPOCH)
287                    .unwrap_or_default()
288                    .as_secs()
289                    + self.config.citation_ttl_seconds,
290                source: source.to_string(),
291                query: format!("citations for {}", paper_id),
292            },
293            response: response.clone(),
294        };
295
296        if let Err(e) = self.write_cache_file(&cache_path, &cached) {
297            tracing::warn!("Failed to cache citations: {}", e);
298        } else {
299            tracing::debug!("Cached citations: {}", key);
300        }
301    }
302
303    /// Read a cached file and deserialize it
304    fn read_cache_file<T: for<'de> Deserialize<'de>>(
305        &self,
306        path: &Path,
307    ) -> Result<T, std::io::Error> {
308        let content = fs::read_to_string(path)?;
309        serde_json::from_str(&content)
310            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))
311    }
312
313    /// Serialize and write a cached file
314    fn write_cache_file<T: Serialize>(&self, path: &Path, data: &T) -> Result<(), std::io::Error> {
315        let content = serde_json::to_string_pretty(data)?;
316        fs::write(path, content)
317    }
318
319    /// Clear all cached data
320    pub fn clear_all(&self) -> std::io::Result<()> {
321        if !self.is_enabled() {
322            return Ok(());
323        }
324
325        let _ = fs::remove_dir_all(&self.base_dir);
326        self.initialize()?;
327        tracing::info!("Cache cleared");
328        Ok(())
329    }
330
331    /// Clear only search cache
332    pub fn clear_searches(&self) -> std::io::Result<()> {
333        if !self.is_enabled() {
334            return Ok(());
335        }
336
337        let _ = fs::remove_dir_all(&self.search_dir);
338        fs::create_dir_all(&self.search_dir)?;
339        tracing::info!("Search cache cleared");
340        Ok(())
341    }
342
343    /// Clear only citation cache
344    pub fn clear_citations(&self) -> std::io::Result<()> {
345        if !self.is_enabled() {
346            return Ok(());
347        }
348
349        let _ = fs::remove_dir_all(&self.citation_dir);
350        fs::create_dir_all(&self.citation_dir)?;
351        tracing::info!("Citation cache cleared");
352        Ok(())
353    }
354
355    /// Get cache statistics
356    pub fn stats(&self) -> CacheStats {
357        if !self.is_enabled() {
358            return CacheStats::disabled();
359        }
360
361        let search_count = self.search_dir.read_dir().map(|e| e.count()).unwrap_or(0);
362        let citation_count = self.citation_dir.read_dir().map(|e| e.count()).unwrap_or(0);
363
364        let search_size = self
365            .dir_size(&self.search_dir)
366            .map(|s| s / 1024)
367            .unwrap_or(0); // KB
368        let citation_size = self
369            .dir_size(&self.citation_dir)
370            .map(|s| s / 1024)
371            .unwrap_or(0); // KB
372
373        CacheStats {
374            enabled: true,
375            cache_dir: self.base_dir.clone(),
376            search_count,
377            citation_count,
378            search_size_kb: search_size,
379            citation_size_kb: citation_size,
380            total_size_kb: search_size + citation_size,
381            ttl_search: Duration::from_secs(self.config.search_ttl_seconds),
382            ttl_citations: Duration::from_secs(self.config.citation_ttl_seconds),
383        }
384    }
385
386    /// Calculate the total size of a directory
387    #[allow(clippy::only_used_in_recursion)]
388    fn dir_size(&self, path: &Path) -> Result<u64, std::io::Error> {
389        let mut size = 0;
390        if let Ok(entries) = path.read_dir() {
391            for entry in entries.flatten() {
392                size += if entry.path().is_dir() {
393                    self.dir_size(&entry.path()).unwrap_or(0)
394                } else {
395                    entry.metadata().map(|m| m.len()).unwrap_or(0)
396                };
397            }
398        }
399        Ok(size)
400    }
401}
402
403impl Default for CacheService {
404    fn default() -> Self {
405        Self::new()
406    }
407}
408
/// Snapshot of cache usage and settings, as produced by `CacheService::stats`.
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Whether caching is enabled. When `false`, every other field holds an
    /// empty or zero placeholder value.
    pub enabled: bool,

    /// Root cache directory path.
    pub cache_dir: PathBuf,

    /// Number of entries in the search cache directory.
    pub search_count: usize,

    /// Number of entries in the citation cache directory.
    pub citation_count: usize,

    /// Size of the search cache in KB (bytes / 1024, rounded down).
    pub search_size_kb: u64,

    /// Size of the citation cache in KB (bytes / 1024, rounded down).
    pub citation_size_kb: u64,

    /// Total size in KB: the sum of the two per-directory KB figures.
    pub total_size_kb: u64,

    /// Time-to-live applied to cached search results.
    pub ttl_search: Duration,

    /// Time-to-live applied to cached citation results.
    pub ttl_citations: Duration,
}
439
440impl CacheStats {
441    /// Return stats indicating cache is disabled
442    fn disabled() -> Self {
443        Self {
444            enabled: false,
445            cache_dir: PathBuf::new(),
446            search_count: 0,
447            citation_count: 0,
448            search_size_kb: 0,
449            citation_size_kb: 0,
450            total_size_kb: 0,
451            ttl_search: Duration::ZERO,
452            ttl_citations: Duration::ZERO,
453        }
454    }
455}
456
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Baseline configuration for the tests: caching enabled, no directory
    /// override, short TTLs.
    fn test_cache_config() -> CacheConfig {
        CacheConfig {
            enabled: true,
            directory: None,
            search_ttl_seconds: 60, // 1 minute for tests
            citation_ttl_seconds: 30,
            max_size_mb: 10,
        }
    }

    /// A stored search is returned for the same query and missed for another.
    #[tokio::test]
    async fn test_cache_search() {
        let temp_dir = TempDir::new().unwrap();
        let cache = CacheService::from_config(CacheConfig {
            directory: Some(temp_dir.path().to_path_buf()),
            ..test_cache_config()
        });
        cache.initialize().unwrap();

        // The same query value is used for both storing and looking up.
        let query = SearchQuery::new("test query");
        let response =
            SearchResponse::new(vec![], "test_source".to_string(), "test query".to_string());

        cache.set_search("test_source", &query, &response);

        // Same query and source: must be a hit carrying the stored response.
        if let CacheResult::Hit(hit) = cache.get_search(&query, "test_source") {
            assert_eq!(hit.source, "test_source");
            assert_eq!(hit.query, "test query");
        } else {
            panic!("Expected cache hit");
        }

        // A different query hashes to a different key: must be a miss.
        let other_query = SearchQuery::new("different query");
        assert!(
            matches!(
                cache.get_search(&other_query, "test_source"),
                CacheResult::Miss
            ),
            "Expected cache miss for different query"
        );

        cache.clear_all().unwrap();
    }

    /// With caching disabled, stores are ignored and lookups always miss.
    #[tokio::test]
    async fn test_cache_disabled() {
        let temp_dir = TempDir::new().unwrap();
        let cache = CacheService::from_config(CacheConfig {
            enabled: false,
            directory: Some(temp_dir.path().to_path_buf()),
            ..test_cache_config()
        });

        let query = SearchQuery::new("test query");
        let response =
            SearchResponse::new(vec![], "test_source".to_string(), "test query".to_string());

        // The store must be a no-op while the cache is disabled.
        cache.set_search("test_source", &query, &response);

        assert!(
            matches!(cache.get_search(&query, "test_source"), CacheResult::Miss),
            "Expected cache miss when disabled"
        );
    }

    /// A zero TTL makes an entry expire as soon as it is written.
    #[tokio::test]
    async fn test_cache_expiration() {
        let temp_dir = TempDir::new().unwrap();
        let cache = CacheService::from_config(CacheConfig {
            directory: Some(temp_dir.path().to_path_buf()),
            search_ttl_seconds: 0, // Immediate expiration for testing
            citation_ttl_seconds: 0,
            ..test_cache_config()
        });
        cache.initialize().unwrap();

        let query = SearchQuery::new("test query");
        let response =
            SearchResponse::new(vec![], "test_source".to_string(), "test query".to_string());

        cache.set_search("test_source", &query, &response);

        assert!(
            matches!(
                cache.get_search(&query, "test_source"),
                CacheResult::Expired
            ),
            "Expected cache expired"
        );
    }
}
560}