1use crate::hashing::Token;
10use anyhow::{Context, Result};
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet};
13use std::fs;
14use std::path::Path;
15use std::time::SystemTime;
16
/// A single tokenized region of source code tracked by the cache.
///
/// Identifies where a hashed block lives (file plus line span), the token
/// stream it was hashed from, and the raw text it was derived from.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct CodeLocation {
    /// Path to the source file containing this block.
    pub file_path: String,
    /// Line where the block starts (tests use 1 for the first line —
    /// presumably 1-based; confirm against the scanner).
    pub start_line: usize,
    /// Line where the block ends.
    pub end_line: usize,
    /// Offset of this block within the file's token stream.
    /// `#[serde(default)]` lets caches written before this field existed
    /// deserialize with `None`.
    #[serde(default)]
    pub token_offset: Option<usize>,
    /// Number of tokens in the block.
    pub token_length: usize,
    /// The tokens that make up the block.
    pub tokens: Vec<Token>,
    /// The original source text of the block.
    pub raw_source: String,
}
36
/// Snapshot of a file's on-disk state, used to detect changes between runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileCacheMetadata {
    /// Path of the file this metadata describes.
    pub path: String,
    /// Modification time in nanoseconds since the Unix epoch
    /// (as computed by `get_file_metadata`).
    pub mtime: u64,
    /// File size in bytes.
    pub size: u64,
}
47
/// Persistent index mapping block hashes to the code locations that
/// produced them, plus per-file metadata for staleness detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HashCache {
    /// Crate version that wrote this cache; `load` rejects mismatches.
    pub version: String,
    /// Minimum block size (in tokens, per `CodeLocation::token_length`
    /// usage — confirm unit against the scanner) used when building this cache.
    pub min_block_size: usize,
    /// Git HEAD commit at creation time, `None` if git was unavailable.
    pub git_commit: Option<String>,
    /// Creation time in seconds since the Unix epoch.
    pub created_at: u64,
    /// Block hash -> every location that hashed to it.
    pub hash_index: HashMap<u64, Vec<CodeLocation>>,
    /// File path -> cached on-disk metadata for staleness checks.
    pub file_metadata: HashMap<String, FileCacheMetadata>,
}
64
65impl HashCache {
66 pub fn new(min_block_size: usize) -> Self {
68 Self {
69 version: env!("CARGO_PKG_VERSION").to_string(),
70 min_block_size,
71 git_commit: get_current_git_commit(),
72 created_at: SystemTime::now()
73 .duration_since(SystemTime::UNIX_EPOCH)
74 .unwrap()
75 .as_secs(),
76 hash_index: HashMap::new(),
77 file_metadata: HashMap::new(),
78 }
79 }
80
81 pub fn add_hash(&mut self, hash: u64, location: CodeLocation) {
83 if !self.file_metadata.contains_key(&location.file_path) {
85 if let Ok(metadata) = get_file_metadata(&location.file_path) {
86 self.file_metadata
87 .insert(location.file_path.clone(), metadata);
88 }
89 }
90
91 self.hash_index.entry(hash).or_default().push(location);
92 }
93
94 pub fn lookup(&self, hash: u64) -> Option<&Vec<CodeLocation>> {
96 self.hash_index.get(&hash)
97 }
98
99 pub fn file_needs_rescan(&self, file_path: &str) -> bool {
101 match self.file_metadata.get(file_path) {
102 Some(cached_meta) => {
103 match get_file_metadata(file_path) {
105 Ok(current_meta) => {
106 cached_meta.mtime != current_meta.mtime
107 || cached_meta.size != current_meta.size
108 }
109 Err(_) => true, }
111 }
112 None => true, }
114 }
115
116 pub fn invalidate_file(&mut self, file_path: &str) {
118 self.file_metadata.remove(file_path);
120
121 for locations in self.hash_index.values_mut() {
123 locations.retain(|loc| loc.file_path != file_path);
124 }
125
126 self.hash_index.retain(|_, locations| !locations.is_empty());
128 }
129
130 pub fn invalidate_stale_files(&mut self) -> HashSet<String> {
135 let mut stale_files: HashSet<String> = self
136 .file_metadata
137 .keys()
138 .filter(|path| self.file_needs_rescan(path))
139 .cloned()
140 .collect();
141
142 for locations in self.hash_index.values() {
144 for loc in locations {
145 if !self.file_metadata.contains_key(&loc.file_path) {
146 stale_files.insert(loc.file_path.clone());
147 }
148 }
149 }
150
151 if stale_files.is_empty() {
152 return stale_files;
153 }
154
155 self.file_metadata
156 .retain(|path, _| !stale_files.contains(path));
157
158 self.hash_index.retain(|_, locations| {
159 locations.retain(|loc| !stale_files.contains(&loc.file_path));
160 !locations.is_empty()
161 });
162
163 stale_files
164 }
165
166 pub fn stats(&self) -> CacheStats {
168 let total_hashes = self.hash_index.len();
169 let total_locations: usize = self.hash_index.values().map(|v| v.len()).sum();
170 let files_cached = self.file_metadata.len();
171
172 CacheStats {
173 total_hashes,
174 total_locations,
175 files_cached,
176 created_at: self.created_at,
177 git_commit: self.git_commit.clone(),
178 }
179 }
180
181 pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
183 let json =
184 serde_json::to_string_pretty(self).context("Failed to serialize cache to JSON")?;
185 fs::write(path.as_ref(), json)
186 .with_context(|| format!("Failed to write cache to {}", path.as_ref().display()))?;
187 Ok(())
188 }
189
190 pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
192 let json = fs::read_to_string(path.as_ref())
193 .with_context(|| format!("Failed to read cache from {}", path.as_ref().display()))?;
194 let cache: HashCache =
195 serde_json::from_str(&json).context("Failed to deserialize cache JSON")?;
196
197 if cache.version != env!("CARGO_PKG_VERSION") {
199 anyhow::bail!(
200 "Cache version mismatch: cache is v{}, but this is v{}. Please rebuild cache.",
201 cache.version,
202 env!("CARGO_PKG_VERSION")
203 );
204 }
205
206 Ok(cache)
207 }
208
209 pub fn is_valid<P: AsRef<Path>>(path: P) -> bool {
211 Self::load(path).is_ok()
212 }
213}
214
215impl Default for HashCache {
216 fn default() -> Self {
217 Self::new(50) }
219}
220
/// Summary counters reported by `HashCache::stats`.
#[derive(Debug, Clone)]
pub struct CacheStats {
    /// Number of distinct hashes in the index.
    pub total_hashes: usize,
    /// Total number of locations across all hashes.
    pub total_locations: usize,
    /// Number of files with cached metadata.
    pub files_cached: usize,
    /// Cache creation time, seconds since the Unix epoch.
    pub created_at: u64,
    /// Git commit recorded at cache creation, if any.
    pub git_commit: Option<String>,
}
230
231fn get_file_metadata(file_path: &str) -> Result<FileCacheMetadata> {
233 let metadata = fs::metadata(file_path)
234 .with_context(|| format!("Failed to get metadata for {}", file_path))?;
235
236 let duration = metadata
237 .modified()
238 .context("Failed to get file modification time")?
239 .duration_since(SystemTime::UNIX_EPOCH)
240 .context("File mtime is before Unix epoch")?;
241 let mtime = duration
242 .as_secs()
243 .checked_mul(1_000_000_000)
244 .and_then(|secs| secs.checked_add(u64::from(duration.subsec_nanos())))
245 .context("File mtime overflowed when converting to nanoseconds")?;
246
247 Ok(FileCacheMetadata {
248 path: file_path.to_string(),
249 mtime,
250 size: metadata.len(),
251 })
252}
253
/// Returns the current git HEAD commit hash, or `None` if git is not
/// available, the command fails, or its output is not valid UTF-8.
fn get_current_git_commit() -> Option<String> {
    use std::process::Command;

    let output = Command::new("git")
        .args(["rev-parse", "HEAD"])
        .output()
        .ok()?;

    // A non-zero exit (e.g. not inside a git repository) yields no commit.
    if !output.status.success() {
        return None;
    }

    let stdout = String::from_utf8(output.stdout).ok()?;
    Some(stdout.trim().to_string())
}
272
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hashing::Token;
    use tempfile::TempDir;

    // A fresh cache carries the current crate version and starts empty.
    #[test]
    fn test_cache_creation() {
        let cache = HashCache::new(10);
        assert_eq!(cache.version, env!("CARGO_PKG_VERSION"));
        assert!(cache.hash_index.is_empty());
        assert!(cache.file_metadata.is_empty());
    }

    // add_hash stores the location and lookup returns it by hash.
    #[test]
    fn test_add_and_lookup() {
        let mut cache = HashCache::new(10);
        let location = CodeLocation {
            file_path: "/test/file.js".to_string(),
            start_line: 1,
            end_line: 10,
            token_offset: Some(0),
            token_length: 50,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function test() {}".to_string(),
        };

        cache.add_hash(12345, location.clone());

        let results = cache.lookup(12345);
        assert!(results.is_some());
        assert_eq!(results.unwrap().len(), 1);
        assert_eq!(results.unwrap()[0].file_path, "/test/file.js");
    }

    // A cache written to disk round-trips through save/load intact.
    #[test]
    fn test_save_and_load() {
        let temp_dir = TempDir::new().unwrap();
        let cache_path = temp_dir.path().join(".polydup-cache.json");

        let mut cache = HashCache::new(10);
        let location = CodeLocation {
            file_path: "/test/file.js".to_string(),
            start_line: 1,
            end_line: 10,
            token_offset: Some(0),
            token_length: 50,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function test() {}".to_string(),
        };
        cache.add_hash(12345, location);

        cache.save(&cache_path).unwrap();
        assert!(cache_path.exists());

        let loaded = HashCache::load(&cache_path).unwrap();
        assert_eq!(loaded.version, env!("CARGO_PKG_VERSION"));
        assert_eq!(loaded.hash_index.len(), 1);
        assert!(loaded.lookup(12345).is_some());
    }

    // Five distinct hashes with one location each are counted correctly.
    #[test]
    fn test_cache_stats() {
        let mut cache = HashCache::new(10);

        for i in 0..5 {
            let location = CodeLocation {
                file_path: format!("/test/file{}.js", i),
                start_line: 1,
                end_line: 10,
                token_offset: Some(0),
                token_length: 50,
                tokens: vec![Token::Keyword("function".to_string())],
                raw_source: "function test() {}".to_string(),
            };
            cache.add_hash(i, location);
        }

        let stats = cache.stats();
        assert_eq!(stats.total_hashes, 5);
        assert_eq!(stats.total_locations, 5);
    }

    // Invalidating one file removes only its hash entries; other files'
    // entries survive.
    #[test]
    fn test_invalidate_file() {
        let mut cache = HashCache::new(10);

        let loc1 = CodeLocation {
            file_path: "/test/file1.js".to_string(),
            start_line: 1,
            end_line: 10,
            token_offset: Some(0),
            token_length: 50,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function test1() {}".to_string(),
        };
        let loc2 = CodeLocation {
            file_path: "/test/file2.js".to_string(),
            start_line: 1,
            end_line: 10,
            token_offset: Some(0),
            token_length: 50,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function test2() {}".to_string(),
        };

        cache.add_hash(12345, loc1);
        cache.add_hash(67890, loc2);

        assert_eq!(cache.hash_index.len(), 2);

        cache.invalidate_file("/test/file1.js");

        assert_eq!(cache.hash_index.len(), 1);
        assert!(cache.lookup(12345).is_none());
        assert!(cache.lookup(67890).is_some());
    }

    // Rewriting a file after it was cached makes invalidate_stale_files
    // report it and purge all of its cached state.
    #[test]
    fn test_invalidate_stale_files_removes_changed_entries() {
        use std::{thread, time::Duration};

        let temp_dir = TempDir::new().unwrap();
        let file_path = temp_dir.path().join("file.js");

        std::fs::write(&file_path, "function a() { return 1; }\n").unwrap();

        let mut cache = HashCache::new(3);
        let location = CodeLocation {
            file_path: file_path.to_string_lossy().to_string(),
            start_line: 1,
            end_line: 1,
            token_offset: Some(0),
            token_length: 3,
            tokens: vec![Token::Keyword("function".to_string())],
            raw_source: "function a() { return 1; }".to_string(),
        };
        cache.add_hash(123, location);

        // Sleep one full second so the rewrite is guaranteed a different
        // mtime even on filesystems with coarse timestamp resolution.
        thread::sleep(Duration::from_secs(1));
        std::fs::write(&file_path, "function a() { return 2; }\n").unwrap();

        let removed = cache.invalidate_stale_files();

        assert_eq!(removed.len(), 1);
        assert!(removed.contains(&file_path.to_string_lossy().to_string()));
        assert!(cache.hash_index.is_empty());
        assert!(cache.file_metadata.is_empty());
    }
}