project_rag/
cache.rs

1use anyhow::{Context, Result};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::time::{SystemTime, UNIX_EPOCH};
7
8/// Information about a dirty (in-progress) indexing operation
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct DirtyInfo {
11    /// Unix timestamp when the dirty flag was set
12    pub timestamp: u64,
13    /// Number of files that were expected to be indexed (if known)
14    pub expected_files: Option<usize>,
15}
16
17impl DirtyInfo {
18    /// Create a new dirty info with the current timestamp
19    pub fn new() -> Self {
20        let timestamp = SystemTime::now()
21            .duration_since(UNIX_EPOCH)
22            .map(|d| d.as_secs())
23            .unwrap_or(0);
24        Self {
25            timestamp,
26            expected_files: None,
27        }
28    }
29
30    /// Create a new dirty info with expected file count
31    pub fn with_expected_files(expected_files: usize) -> Self {
32        let mut info = Self::new();
33        info.expected_files = Some(expected_files);
34        info
35    }
36
37    /// Check if this dirty flag is stale (older than or equal to the given duration in seconds)
38    pub fn is_stale(&self, max_age_secs: u64) -> bool {
39        let now = SystemTime::now()
40            .duration_since(UNIX_EPOCH)
41            .map(|d| d.as_secs())
42            .unwrap_or(0);
43        now.saturating_sub(self.timestamp) >= max_age_secs
44    }
45
46    /// Get the age of this dirty flag in seconds
47    pub fn age_secs(&self) -> u64 {
48        let now = SystemTime::now()
49            .duration_since(UNIX_EPOCH)
50            .map(|d| d.as_secs())
51            .unwrap_or(0);
52        now.saturating_sub(self.timestamp)
53    }
54}
55
56impl Default for DirtyInfo {
57    fn default() -> Self {
58        Self::new()
59    }
60}
61
62/// Cache for file hashes to support incremental updates
63#[derive(Debug, Clone, Serialize, Deserialize, Default)]
64pub struct HashCache {
65    /// Map of root path -> (file path -> hash)
66    pub roots: HashMap<String, HashMap<String, String>>,
67    /// Map of root paths that are currently being indexed (dirty state) with metadata
68    /// If a root is in this map, its index may be incomplete/corrupted
69    #[serde(default)]
70    pub dirty_roots: HashMap<String, DirtyInfo>,
71}
72
73/// Legacy cache format for migration (dirty_roots was a HashSet)
74#[derive(Debug, Deserialize)]
75struct LegacyHashCache {
76    roots: HashMap<String, HashMap<String, String>>,
77    #[serde(default)]
78    dirty_roots: std::collections::HashSet<String>,
79}
80
81impl HashCache {
82    /// Load cache from disk
83    /// Handles migration from old format (dirty_roots as HashSet) to new format (dirty_roots as HashMap with DirtyInfo)
84    pub fn load(cache_path: &Path) -> Result<Self> {
85        if !cache_path.exists() {
86            tracing::debug!("Cache file not found, starting with empty cache");
87            return Ok(Self::default());
88        }
89
90        let content = fs::read_to_string(cache_path).context("Failed to read cache file")?;
91
92        // Try to parse as new format first
93        if let Ok(cache) = serde_json::from_str::<HashCache>(&content) {
94            tracing::info!("Loaded cache with {} indexed roots", cache.roots.len());
95            return Ok(cache);
96        }
97
98        // Try to parse as legacy format and migrate
99        if let Ok(legacy) = serde_json::from_str::<LegacyHashCache>(&content) {
100            tracing::info!(
101                "Migrating cache from legacy format ({} roots, {} dirty roots)",
102                legacy.roots.len(),
103                legacy.dirty_roots.len()
104            );
105
106            // Migrate dirty_roots from HashSet to HashMap with default DirtyInfo
107            let dirty_roots: HashMap<String, DirtyInfo> = legacy
108                .dirty_roots
109                .into_iter()
110                .map(|root| (root, DirtyInfo::new()))
111                .collect();
112
113            let cache = HashCache {
114                roots: legacy.roots,
115                dirty_roots,
116            };
117
118            // Save the migrated cache immediately
119            if let Err(e) = cache.save(cache_path) {
120                tracing::warn!("Failed to save migrated cache: {}", e);
121            } else {
122                tracing::info!("Successfully migrated cache to new format");
123            }
124
125            return Ok(cache);
126        }
127
128        // Neither format worked
129        anyhow::bail!("Failed to parse cache file as either new or legacy format")
130    }
131
132    /// Save cache to disk
133    pub fn save(&self, cache_path: &Path) -> Result<()> {
134        // Create parent directory if it doesn't exist
135        if let Some(parent) = cache_path.parent() {
136            fs::create_dir_all(parent).context("Failed to create cache directory")?;
137        }
138
139        let content = serde_json::to_string_pretty(self).context("Failed to serialize cache")?;
140
141        fs::write(cache_path, content).context("Failed to write cache file")?;
142
143        tracing::debug!("Saved cache to {:?}", cache_path);
144        Ok(())
145    }
146
147    /// Get file hashes for a root path
148    pub fn get_root(&self, root: &str) -> Option<&HashMap<String, String>> {
149        self.roots.get(root)
150    }
151
152    /// Update file hashes for a root path
153    pub fn update_root(&mut self, root: String, hashes: HashMap<String, String>) {
154        self.roots.insert(root, hashes);
155    }
156
157    /// Remove a root path from the cache
158    pub fn remove_root(&mut self, root: &str) {
159        self.roots.remove(root);
160        self.dirty_roots.remove(root);
161    }
162
163    /// Mark a root path as dirty (indexing in progress)
164    /// This should be called BEFORE indexing starts and the cache saved immediately
165    pub fn mark_dirty(&mut self, root: &str) {
166        self.dirty_roots.insert(root.to_string(), DirtyInfo::new());
167    }
168
169    /// Mark a root path as dirty with expected file count
170    pub fn mark_dirty_with_info(&mut self, root: &str, expected_files: usize) {
171        self.dirty_roots.insert(
172            root.to_string(),
173            DirtyInfo::with_expected_files(expected_files),
174        );
175    }
176
177    /// Clear the dirty flag for a root path (indexing completed successfully)
178    /// This should be called AFTER indexing completes and the cache saved immediately
179    pub fn clear_dirty(&mut self, root: &str) {
180        self.dirty_roots.remove(root);
181    }
182
183    /// Check if a root path is marked as dirty
184    pub fn is_dirty(&self, root: &str) -> bool {
185        self.dirty_roots.contains_key(root)
186    }
187
188    /// Get dirty info for a root path
189    pub fn get_dirty_info(&self, root: &str) -> Option<&DirtyInfo> {
190        self.dirty_roots.get(root)
191    }
192
193    /// Get all dirty root paths
194    pub fn get_dirty_roots(&self) -> &HashMap<String, DirtyInfo> {
195        &self.dirty_roots
196    }
197
198    /// Check if any roots are dirty
199    pub fn has_dirty_roots(&self) -> bool {
200        !self.dirty_roots.is_empty()
201    }
202
203    /// Check if a dirty flag is stale (older than the given duration)
204    /// Returns true if the root is dirty AND the flag is older than max_age_secs
205    pub fn is_dirty_stale(&self, root: &str, max_age_secs: u64) -> bool {
206        self.dirty_roots
207            .get(root)
208            .is_some_and(|info| info.is_stale(max_age_secs))
209    }
210
211    /// Get the age of a dirty flag in seconds, or None if not dirty
212    pub fn dirty_age_secs(&self, root: &str) -> Option<u64> {
213        self.dirty_roots.get(root).map(|info| info.age_secs())
214    }
215
216    /// Clear stale dirty flags (older than max_age_secs)
217    /// Returns the number of stale flags cleared
218    pub fn clear_stale_dirty_flags(&mut self, max_age_secs: u64) -> usize {
219        let stale_roots: Vec<String> = self
220            .dirty_roots
221            .iter()
222            .filter(|(_, info)| info.is_stale(max_age_secs))
223            .map(|(root, _)| root.clone())
224            .collect();
225
226        let count = stale_roots.len();
227        for root in stale_roots {
228            tracing::info!(
229                "Clearing stale dirty flag for '{}' (age: {} seconds)",
230                root,
231                self.dirty_roots
232                    .get(&root)
233                    .map(|i| i.age_secs())
234                    .unwrap_or(0)
235            );
236            self.dirty_roots.remove(&root);
237        }
238        count
239    }
240
241    /// Get default cache path (in user's cache directory)
242    pub fn default_path() -> PathBuf {
243        crate::paths::PlatformPaths::default_hash_cache_path()
244    }
245}
246
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// Build a hash table from `(file, hash)` pairs.
    fn hashes_of(entries: &[(&str, &str)]) -> HashMap<String, String> {
        entries
            .iter()
            .map(|&(file, hash)| (file.to_string(), hash.to_string()))
            .collect()
    }

    #[test]
    fn test_cache_serialization() {
        let mut cache = HashCache::default();
        cache.update_root(
            "/test/path".to_string(),
            hashes_of(&[("file1.rs", "hash1"), ("file2.rs", "hash2")]),
        );

        // Round-trip through JSON in memory.
        let json = serde_json::to_string(&cache).unwrap();
        let deserialized: HashCache = serde_json::from_str(&json).unwrap();

        assert_eq!(cache.roots.len(), deserialized.roots.len());
        assert_eq!(
            cache.roots.get("/test/path"),
            deserialized.roots.get("/test/path")
        );
    }

    #[test]
    fn test_cache_save_load() {
        let temp_file = NamedTempFile::new().unwrap();
        let cache_path = temp_file.path().to_path_buf();

        // Create and save cache
        let mut cache = HashCache::default();
        cache.update_root("/test/path".to_string(), hashes_of(&[("file1.rs", "hash1")]));
        cache.save(&cache_path).unwrap();

        // Load cache
        let loaded = HashCache::load(&cache_path).unwrap();
        assert_eq!(cache.roots.len(), loaded.roots.len());
        assert_eq!(
            cache.roots.get("/test/path"),
            loaded.roots.get("/test/path")
        );
    }

    #[test]
    fn test_cache_operations() {
        let mut cache = HashCache::default();

        // Update root
        cache.update_root("/test/path".to_string(), hashes_of(&[("file1.rs", "hash1")]));

        // Get root
        assert!(cache.get_root("/test/path").is_some());
        assert!(cache.get_root("/nonexistent").is_none());

        // Remove root
        cache.remove_root("/test/path");
        assert!(cache.get_root("/test/path").is_none());
    }

    #[test]
    fn test_load_nonexistent_cache() {
        // A missing file yields an empty cache rather than an error.
        let result = HashCache::load(Path::new("/nonexistent/path/cache.json"));
        assert!(result.is_ok());
        assert_eq!(result.unwrap().roots.len(), 0);
    }

    #[test]
    fn test_load_corrupted_cache() {
        let temp_file = NamedTempFile::new().unwrap();
        let cache_path = temp_file.path().to_path_buf();

        // Write invalid JSON
        fs::write(&cache_path, "{ invalid json }").unwrap();

        let result = HashCache::load(&cache_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_save_creates_parent_directory() {
        let temp_dir = tempfile::tempdir().unwrap();
        let cache_path = temp_dir.path().join("subdir").join("cache.json");

        let cache = HashCache::default();
        cache.save(&cache_path).unwrap();

        assert!(cache_path.exists());
    }

    #[test]
    fn test_default_path() {
        let path = HashCache::default_path();
        let rendered = path.to_string_lossy();
        assert!(rendered.contains("project-rag"));
        assert!(rendered.contains("hash_cache.json"));
    }

    #[test]
    fn test_update_root_replaces_existing() {
        let mut cache = HashCache::default();

        // Add first set of hashes
        cache.update_root("/test/path".to_string(), hashes_of(&[("file1.rs", "hash1")]));

        // Replace with new set
        cache.update_root("/test/path".to_string(), hashes_of(&[("file2.rs", "hash2")]));

        let root_hashes = cache.get_root("/test/path").unwrap();
        assert_eq!(root_hashes.len(), 1);
        assert!(root_hashes.contains_key("file2.rs"));
        assert!(!root_hashes.contains_key("file1.rs"));
    }

    #[test]
    fn test_multiple_roots() {
        let mut cache = HashCache::default();

        cache.update_root("/path1".to_string(), hashes_of(&[("file1.rs", "hash1")]));
        cache.update_root("/path2".to_string(), hashes_of(&[("file2.rs", "hash2")]));

        assert_eq!(cache.roots.len(), 2);
        assert!(cache.get_root("/path1").is_some());
        assert!(cache.get_root("/path2").is_some());
    }

    #[test]
    fn test_empty_cache_operations() {
        let cache = HashCache::default();
        assert!(cache.get_root("/any/path").is_none());
        assert_eq!(cache.roots.len(), 0);
    }

    #[test]
    fn test_remove_root_nonexistent() {
        let mut cache = HashCache::default();
        cache.remove_root("/nonexistent");
        // Should not panic
        assert_eq!(cache.roots.len(), 0);
    }

    #[test]
    fn test_dirty_flag_operations() {
        let mut cache = HashCache::default();

        // Initially not dirty
        assert!(!cache.is_dirty("/test/path"));
        assert!(!cache.has_dirty_roots());
        assert!(cache.get_dirty_roots().is_empty());

        // Mark as dirty
        cache.mark_dirty("/test/path");
        assert!(cache.is_dirty("/test/path"));
        assert!(cache.has_dirty_roots());
        assert!(cache.get_dirty_roots().contains_key("/test/path"));

        // Verify dirty info has timestamp
        let info = cache.get_dirty_info("/test/path").unwrap();
        assert!(info.timestamp > 0);
        assert!(info.expected_files.is_none());

        // Clear dirty flag
        cache.clear_dirty("/test/path");
        assert!(!cache.is_dirty("/test/path"));
        assert!(!cache.has_dirty_roots());
    }

    #[test]
    fn test_dirty_flag_with_expected_files() {
        let mut cache = HashCache::default();

        cache.mark_dirty_with_info("/test/path", 100);
        assert!(cache.is_dirty("/test/path"));

        let info = cache.get_dirty_info("/test/path").unwrap();
        assert_eq!(info.expected_files, Some(100));
    }

    #[test]
    fn test_dirty_flag_staleness() {
        let mut cache = HashCache::default();

        cache.mark_dirty("/test/path");

        // A fresh dirty flag should not be stale (with a 1 hour timeout)
        assert!(!cache.is_dirty_stale("/test/path", 3600));

        // Get the age (should be very small, just created)
        let age = cache.dirty_age_secs("/test/path").unwrap();
        assert!(age < 5); // Should be less than 5 seconds

        // With a 0 second timeout, it should be considered stale
        assert!(cache.is_dirty_stale("/test/path", 0));
    }

    #[test]
    fn test_clear_stale_dirty_flags() {
        let mut cache = HashCache::default();

        cache.mark_dirty("/path1");
        cache.mark_dirty("/path2");

        // With 0 second timeout, all should be cleared
        let cleared = cache.clear_stale_dirty_flags(0);
        assert_eq!(cleared, 2);
        assert!(!cache.has_dirty_roots());
    }

    #[test]
    fn test_dirty_flag_persistence() {
        let temp_file = NamedTempFile::new().unwrap();
        let cache_path = temp_file.path().to_path_buf();

        // Create cache with dirty flag
        let mut cache = HashCache::default();
        cache.mark_dirty("/test/path");
        cache.save(&cache_path).unwrap();

        // Load and verify dirty flag persisted
        let loaded = HashCache::load(&cache_path).unwrap();
        assert!(loaded.is_dirty("/test/path"));
        assert!(loaded.has_dirty_roots());
    }

    #[test]
    fn test_remove_root_clears_dirty() {
        let mut cache = HashCache::default();

        // Add root with files and mark as dirty
        cache.update_root("/test/path".to_string(), hashes_of(&[("file1.rs", "hash1")]));
        cache.mark_dirty("/test/path");

        assert!(cache.is_dirty("/test/path"));
        assert!(cache.get_root("/test/path").is_some());

        // Remove root - should also clear dirty
        cache.remove_root("/test/path");
        assert!(!cache.is_dirty("/test/path"));
        assert!(cache.get_root("/test/path").is_none());
    }

    #[test]
    fn test_multiple_dirty_roots() {
        let mut cache = HashCache::default();

        for root in ["/path1", "/path2", "/path3"] {
            cache.mark_dirty(root);
        }

        assert!(cache.is_dirty("/path1"));
        assert!(cache.is_dirty("/path2"));
        assert!(cache.is_dirty("/path3"));
        assert_eq!(cache.get_dirty_roots().len(), 3);

        // Clearing one root must leave the others untouched.
        cache.clear_dirty("/path2");
        assert!(cache.is_dirty("/path1"));
        assert!(!cache.is_dirty("/path2"));
        assert!(cache.is_dirty("/path3"));
        assert_eq!(cache.get_dirty_roots().len(), 2);
    }

    #[test]
    fn test_dirty_flag_idempotent() {
        let mut cache = HashCache::default();

        // Marking same path multiple times should be idempotent
        cache.mark_dirty("/test/path");
        cache.mark_dirty("/test/path");
        cache.mark_dirty("/test/path");
        assert_eq!(cache.get_dirty_roots().len(), 1);

        // Clearing same path multiple times should be safe
        cache.clear_dirty("/test/path");
        cache.clear_dirty("/test/path");
        assert!(!cache.is_dirty("/test/path"));
    }

    #[test]
    fn test_dirty_flag_with_old_cache_format() {
        // Test that loading a cache without dirty_roots field works (backwards compatibility)
        let temp_file = NamedTempFile::new().unwrap();
        let cache_path = temp_file.path().to_path_buf();

        // Write old format JSON (without dirty_roots)
        let old_format = r#"{"roots":{"/test/path":{"file1.rs":"hash1"}}}"#;
        fs::write(&cache_path, old_format).unwrap();

        // Load should succeed with empty dirty_roots
        let loaded = HashCache::load(&cache_path).unwrap();
        assert!(loaded.get_root("/test/path").is_some());
        assert!(!loaded.has_dirty_roots());
        assert!(!loaded.is_dirty("/test/path"));
    }

    #[test]
    fn test_dirty_flag_migration_from_hashset() {
        // Test that loading a cache with old HashSet dirty_roots format works
        // This handles migration from the old format (HashSet<String>) to new (HashMap<String, DirtyInfo>)
        let temp_file = NamedTempFile::new().unwrap();
        let cache_path = temp_file.path().to_path_buf();

        // Write old format JSON with HashSet dirty_roots (array format)
        let old_format =
            r#"{"roots":{"/test/path":{"file1.rs":"hash1"}},"dirty_roots":["/test/path"]}"#;
        fs::write(&cache_path, old_format).unwrap();

        // Load should successfully migrate the old format
        let loaded = HashCache::load(&cache_path).unwrap();

        // Verify the migration worked
        assert!(loaded.get_root("/test/path").is_some());
        assert!(loaded.is_dirty("/test/path"));
        assert!(loaded.has_dirty_roots());

        // Verify the dirty info has a timestamp
        let info = loaded.get_dirty_info("/test/path").unwrap();
        assert!(info.timestamp > 0);

        // Verify the file was rewritten in the new format. Calling `load`
        // again would not prove this, because `load` also accepts the legacy
        // layout; parsing the on-disk content directly as `HashCache` does,
        // since the legacy array form of `dirty_roots` is not a map.
        let on_disk = fs::read_to_string(&cache_path).unwrap();
        let rewritten: HashCache = serde_json::from_str(&on_disk).unwrap();
        assert!(rewritten.is_dirty("/test/path"));

        // A regular reload keeps working after the migration as well.
        let reloaded = HashCache::load(&cache_path).unwrap();
        assert!(reloaded.is_dirty("/test/path"));
    }

    #[test]
    fn test_dirty_info_default() {
        let info = DirtyInfo::default();
        assert!(info.timestamp > 0);
        assert!(info.expected_files.is_none());
    }

    #[test]
    fn test_dirty_info_with_expected_files() {
        let info = DirtyInfo::with_expected_files(50);
        assert!(info.timestamp > 0);
        assert_eq!(info.expected_files, Some(50));
    }
}