// dupe_core/cache.rs

1//! Hash cache for incremental duplicate detection
2//!
3//! This module implements a persistent hash cache that stores rolling hashes
4//! for all functions in the codebase. This enables:
5//! - Fast git-diff mode: scan only changed files, lookup against cached hashes
6//! - Incremental scanning: only rescan files that changed
7//! - 10-100x speedup for large codebases
8
9use crate::hashing::Token;
10use anyhow::{Context, Result};
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::fs;
14use std::path::Path;
15use std::time::SystemTime;
16
/// Location of a code block in the codebase.
///
/// Stored per cached hash entry so matches can be reported and compared
/// without re-tokenizing the original file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct CodeLocation {
    /// Absolute path to the file
    pub file_path: String,
    /// Starting line number (1-indexed)
    pub start_line: usize,
    /// Ending line number (1-indexed)
    pub end_line: usize,
    /// Token offset of this window within the function.
    /// `serde(default)` keeps caches written before this field existed loadable.
    #[serde(default)]
    pub token_offset: Option<usize>,
    /// Length in tokens
    pub token_length: usize,
    /// The normalized token sequence (for similarity calculation)
    pub tokens: Vec<Token>,
    /// Raw source code (for Type-1 detection)
    pub raw_source: String,
}
36
/// Metadata about a cached file.
///
/// Compared against the live filesystem to decide whether a file's cached
/// entries are stale (see `HashCache::file_needs_rescan`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileCacheMetadata {
    /// Absolute path to the file
    pub path: String,
    /// Last modification time in nanoseconds since Unix epoch
    pub mtime: u64,
    /// File size in bytes
    pub size: u64,
}
47
/// The complete hash cache for a codebase.
///
/// Persisted to disk as pretty-printed JSON via `save`/`load`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HashCache {
    /// Version of the cache format (for future compatibility)
    pub version: String,
    /// Minimum block size (tokens) used to build this cache
    pub min_block_size: usize,
    /// Git commit hash when cache was built (if available)
    pub git_commit: Option<String>,
    /// Timestamp when cache was created (seconds since Unix epoch)
    pub created_at: u64,
    /// Map from rolling hash to all locations with that hash
    pub hash_index: HashMap<u64, Vec<CodeLocation>>,
    /// Metadata for all cached files (for invalidation)
    pub file_metadata: HashMap<String, FileCacheMetadata>,
}
64
65impl HashCache {
66    /// Create a new empty cache
67    pub fn new(min_block_size: usize) -> Self {
68        Self {
69            version: env!("CARGO_PKG_VERSION").to_string(),
70            min_block_size,
71            git_commit: get_current_git_commit(),
72            created_at: SystemTime::now()
73                .duration_since(SystemTime::UNIX_EPOCH)
74                .unwrap()
75                .as_secs(),
76            hash_index: HashMap::new(),
77            file_metadata: HashMap::new(),
78        }
79    }
80
81    /// Add a hash entry to the cache
82    pub fn add_hash(&mut self, hash: u64, location: CodeLocation) {
83        // Also store file metadata for cache invalidation
84        if !self.file_metadata.contains_key(&location.file_path) {
85            if let Ok(metadata) = get_file_metadata(&location.file_path) {
86                self.file_metadata
87                    .insert(location.file_path.clone(), metadata);
88            }
89        }
90
91        self.hash_index.entry(hash).or_default().push(location);
92    }
93
94    /// Look up all locations with a given hash
95    pub fn lookup(&self, hash: u64) -> Option<&Vec<CodeLocation>> {
96        self.hash_index.get(&hash)
97    }
98
99    /// Check if a file needs to be rescanned (has changed since cache was built)
100    pub fn file_needs_rescan(&self, file_path: &str) -> bool {
101        match self.file_metadata.get(file_path) {
102            Some(cached_meta) => {
103                // Check if file still exists and hasn't changed
104                match get_file_metadata(file_path) {
105                    Ok(current_meta) => {
106                        cached_meta.mtime != current_meta.mtime
107                            || cached_meta.size != current_meta.size
108                    }
109                    Err(_) => true, // File deleted or inaccessible
110                }
111            }
112            None => true, // File not in cache
113        }
114    }
115
116    /// Remove all cache entries for a specific file
117    pub fn invalidate_file(&mut self, file_path: &str) {
118        // Remove from metadata
119        self.file_metadata.remove(file_path);
120
121        // Remove all hash entries for this file
122        for locations in self.hash_index.values_mut() {
123            locations.retain(|loc| loc.file_path != file_path);
124        }
125
126        // Clean up empty hash entries
127        self.hash_index.retain(|_, locations| !locations.is_empty());
128    }
129
130    /// Drop cache entries for files whose metadata no longer matches disk.
131    ///
132    /// Returns the set of file paths that were removed so callers can refresh
133    /// the cache entries when needed.
134    pub fn invalidate_stale_files(&mut self) -> HashSet<String> {
135        let mut stale_files: HashSet<String> = self
136            .file_metadata
137            .keys()
138            .filter(|path| self.file_needs_rescan(path))
139            .cloned()
140            .collect();
141
142        // Defensive: if a cache entry exists without metadata, treat it as stale
143        for locations in self.hash_index.values() {
144            for loc in locations {
145                if !self.file_metadata.contains_key(&loc.file_path) {
146                    stale_files.insert(loc.file_path.clone());
147                }
148            }
149        }
150
151        if stale_files.is_empty() {
152            return stale_files;
153        }
154
155        self.file_metadata
156            .retain(|path, _| !stale_files.contains(path));
157
158        self.hash_index.retain(|_, locations| {
159            locations.retain(|loc| !stale_files.contains(&loc.file_path));
160            !locations.is_empty()
161        });
162
163        stale_files
164    }
165
166    /// Get cache statistics
167    pub fn stats(&self) -> CacheStats {
168        let total_hashes = self.hash_index.len();
169        let total_locations: usize = self.hash_index.values().map(|v| v.len()).sum();
170        let files_cached = self.file_metadata.len();
171
172        CacheStats {
173            total_hashes,
174            total_locations,
175            files_cached,
176            created_at: self.created_at,
177            git_commit: self.git_commit.clone(),
178        }
179    }
180
181    /// Save cache to disk
182    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
183        let json =
184            serde_json::to_string_pretty(self).context("Failed to serialize cache to JSON")?;
185        fs::write(path.as_ref(), json)
186            .with_context(|| format!("Failed to write cache to {}", path.as_ref().display()))?;
187        Ok(())
188    }
189
190    /// Load cache from disk
191    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
192        let json = fs::read_to_string(path.as_ref())
193            .with_context(|| format!("Failed to read cache from {}", path.as_ref().display()))?;
194        let cache: HashCache =
195            serde_json::from_str(&json).context("Failed to deserialize cache JSON")?;
196
197        // Version check
198        if cache.version != env!("CARGO_PKG_VERSION") {
199            anyhow::bail!(
200                "Cache version mismatch: cache is v{}, but this is v{}. Please rebuild cache.",
201                cache.version,
202                env!("CARGO_PKG_VERSION")
203            );
204        }
205
206        Ok(cache)
207    }
208
209    /// Check if cache exists and is valid
210    pub fn is_valid<P: AsRef<Path>>(path: P) -> bool {
211        Self::load(path).is_ok()
212    }
213}
214
impl Default for HashCache {
    fn default() -> Self {
        Self::new(50) // 50 tokens: the default minimum duplicate-block size
    }
}
220
/// Cache statistics for reporting
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Number of distinct rolling hashes in the index
    pub total_hashes: usize,
    /// Total code locations across all hash buckets
    pub total_locations: usize,
    /// Number of files with cached metadata
    pub files_cached: usize,
    /// Unix timestamp (seconds) when the cache was created
    pub created_at: u64,
    /// Git commit hash at cache-build time, if available
    pub git_commit: Option<String>,
}
230
231/// Get file metadata for cache invalidation
232fn get_file_metadata(file_path: &str) -> Result<FileCacheMetadata> {
233    let metadata = fs::metadata(file_path)
234        .with_context(|| format!("Failed to get metadata for {}", file_path))?;
235
236    let duration = metadata
237        .modified()
238        .context("Failed to get file modification time")?
239        .duration_since(SystemTime::UNIX_EPOCH)
240        .context("File mtime is before Unix epoch")?;
241    let mtime = duration
242        .as_secs()
243        .checked_mul(1_000_000_000)
244        .and_then(|secs| secs.checked_add(u64::from(duration.subsec_nanos())))
245        .context("File mtime overflowed when converting to nanoseconds")?;
246
247    Ok(FileCacheMetadata {
248        path: file_path.to_string(),
249        mtime,
250        size: metadata.len(),
251    })
252}
253
/// Get the current git HEAD commit hash, or `None` when git is missing,
/// the command fails, or we are not inside a repository.
fn get_current_git_commit() -> Option<String> {
    use std::process::Command;

    let output = Command::new("git")
        .args(["rev-parse", "HEAD"])
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    // Trim the trailing newline git prints after the hash.
    let stdout = String::from_utf8(output.stdout).ok()?;
    Some(stdout.trim().to_string())
}
272
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hashing::Token;
    use tempfile::TempDir;

    /// Build a minimal valid `CodeLocation` pointing at `file_path`;
    /// the remaining fields are arbitrary fixture values.
    fn sample_location(file_path: &str) -> CodeLocation {
        CodeLocation {
            file_path: file_path.to_string(),
            start_line: 1,
            end_line: 10,
            token_offset: Some(0),
            token_length: 50,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function test() {}".to_string(),
        }
    }

    #[test]
    fn test_cache_creation() {
        let cache = HashCache::new(10);
        assert_eq!(cache.version, env!("CARGO_PKG_VERSION"));
        assert!(cache.hash_index.is_empty());
        assert!(cache.file_metadata.is_empty());
    }

    #[test]
    fn test_add_and_lookup() {
        let mut cache = HashCache::new(10);
        cache.add_hash(12345, sample_location("/test/file.js"));

        let results = cache.lookup(12345);
        assert!(results.is_some());
        assert_eq!(results.unwrap().len(), 1);
        assert_eq!(results.unwrap()[0].file_path, "/test/file.js");
    }

    #[test]
    fn test_save_and_load() {
        let temp_dir = TempDir::new().unwrap();
        let cache_path = temp_dir.path().join(".polydup-cache.json");

        let mut cache = HashCache::new(10);
        cache.add_hash(12345, sample_location("/test/file.js"));

        // Save
        cache.save(&cache_path).unwrap();
        assert!(cache_path.exists());

        // Load round-trips the data and passes the version check
        let loaded = HashCache::load(&cache_path).unwrap();
        assert_eq!(loaded.version, env!("CARGO_PKG_VERSION"));
        assert_eq!(loaded.hash_index.len(), 1);
        assert!(loaded.lookup(12345).is_some());
    }

    #[test]
    fn test_cache_stats() {
        let mut cache = HashCache::new(10);

        for i in 0..5 {
            cache.add_hash(i, sample_location(&format!("/test/file{}.js", i)));
        }

        let stats = cache.stats();
        assert_eq!(stats.total_hashes, 5);
        assert_eq!(stats.total_locations, 5);
    }

    #[test]
    fn test_invalidate_file() {
        let mut cache = HashCache::new(10);

        cache.add_hash(12345, sample_location("/test/file1.js"));
        cache.add_hash(67890, sample_location("/test/file2.js"));

        assert_eq!(cache.hash_index.len(), 2);

        // Invalidate file1: its entries disappear, file2's remain
        cache.invalidate_file("/test/file1.js");

        assert_eq!(cache.hash_index.len(), 1);
        assert!(cache.lookup(12345).is_none());
        assert!(cache.lookup(67890).is_some());
    }

    #[test]
    fn test_invalidate_stale_files_removes_changed_entries() {
        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("file.js");

        std::fs::write(&file_path, "function a() { return 1; }\n").unwrap();

        let mut cache = HashCache::new(3);
        cache.add_hash(123, sample_location(&file_path.to_string_lossy()));

        // Rewrite with content of a *different length*: the cached size no
        // longer matches, so the file is detected as stale regardless of
        // filesystem mtime granularity — no one-second sleep required.
        std::fs::write(&file_path, "function a() { return 1 + 1; }\n").unwrap();

        let removed = cache.invalidate_stale_files();

        assert_eq!(removed.len(), 1);
        assert!(removed.contains(&file_path.to_string_lossy().to_string()));
        assert!(cache.hash_index.is_empty());
        assert!(cache.file_metadata.is_empty());
    }
}