// codesearch/cache/file_meta.rs

1use anyhow::{anyhow, Result};
2use serde::{Deserialize, Serialize};
3use sha2::{Digest, Sha256};
4use std::collections::HashMap;
5use std::fs;
6use std::path::Path;
7use std::time::SystemTime;
8
9use crate::constants::FILE_META_DB_NAME;
10
/// Normalize a file path for consistent HashMap lookups.
///
/// On Windows, `Path::canonicalize()` and some APIs return paths carrying the
/// verbatim (extended-length) prefix: `\\?\C:\...` for drive paths and
/// `\\?\UNC\server\share\...` for network shares. Notify (FSW) events may use
/// the standard spellings (`C:\...`, `\\server\share\...`).
///
/// This function maps both spellings of the same location to one key:
/// * `\\?\C:\dir\file`            -> `C:/dir/file`
/// * `\\?\UNC\server\share\file`  -> `//server/share/file`
/// * `\\server\share\file`        -> `//server/share/file`
/// * every remaining backslash    -> forward slash
///
/// Non-UTF-8 path components are replaced lossily (`to_string_lossy`).
pub fn normalize_path(path: &Path) -> String {
    // Single implementation: the `&Path` and `&str` entry points must never disagree,
    // otherwise stored keys and lookup keys drift apart.
    normalize_path_str(&path.to_string_lossy())
}

/// Normalize a path string (same logic as `normalize_path` but for `&str` input).
///
/// Only one leading verbatim prefix is removed; the rest of the string is left
/// intact apart from the backslash -> forward slash conversion.
pub fn normalize_path_str(path: &str) -> String {
    const VERBATIM_UNC_PREFIX: &str = r"\\?\UNC\";
    const VERBATIM_PREFIX: &str = r"\\?\";

    // Must be tested before the plain verbatim prefix, which it starts with.
    if let Some(share_path) = path.strip_prefix(VERBATIM_UNC_PREFIX) {
        // `\\?\UNC\server\share` is the extended-length spelling of `\\server\share`,
        // so re-attach the two leading separators that the plain spelling has.
        return format!("//{}", share_path.replace('\\', "/"));
    }

    path.strip_prefix(VERBATIM_PREFIX)
        .unwrap_or(path)
        .replace('\\', "/")
}
26
27/// Metadata for a single indexed file
28#[derive(Debug, Clone, Serialize, Deserialize)]
29pub struct FileMeta {
30    /// SHA256 hash of file content
31    pub hash: String,
32    /// File modification time (for quick change detection)
33    pub mtime: u64,
34    /// File size in bytes
35    pub size: u64,
36    /// Number of chunks extracted from this file
37    pub chunk_count: usize,
38    /// Chunk IDs in the vector store (for deletion on update)
39    pub chunk_ids: Vec<u32>,
40}
41
42/// Persistent store for file metadata - enables incremental indexing
43///
44/// Improvements over osgrep:
45/// 1. Two-level check: mtime first (fast), hash only if mtime changed
46/// 2. Tracks chunk IDs for efficient deletion on file update
47/// 3. Stores chunk count for statistics
48#[derive(Debug, Serialize, Deserialize)]
49pub struct FileMetaStore {
50    /// Map of absolute file path -> metadata
51    files: HashMap<String, FileMeta>,
52    /// Model used for indexing (invalidate if model changes)
53    pub model_name: String,
54    /// Dimensions of embeddings
55    pub dimensions: usize,
56    /// Last full index timestamp
57    pub last_full_index: Option<u64>,
58    /// Version for format compatibility
59    version: u32,
60}
61
62impl FileMetaStore {
63    const CURRENT_VERSION: u32 = 1;
64    const FILENAME: &'static str = FILE_META_DB_NAME;
65
66    /// Create a new empty store
67    pub fn new(model_name: String, dimensions: usize) -> Self {
68        Self {
69            files: HashMap::new(),
70            model_name,
71            dimensions,
72            last_full_index: None,
73            version: Self::CURRENT_VERSION,
74        }
75    }
76
77    /// Load from database directory, or create new if doesn't exist
78    pub fn load_or_create(db_path: &Path, model_name: &str, dimensions: usize) -> Result<Self> {
79        let meta_path = db_path.join(Self::FILENAME);
80
81        if meta_path.exists() {
82            let content = fs::read_to_string(&meta_path)?;
83            let mut store: FileMetaStore = serde_json::from_str(&content)
84                .map_err(|e| anyhow!("Failed to parse file metadata: {}", e))?;
85
86            // Check if model changed - if so, invalidate everything
87            if store.model_name != model_name || store.dimensions != dimensions {
88                println!(
89                    "⚠️  Model changed ({} -> {}), full re-index required",
90                    store.model_name, model_name
91                );
92                store = Self::new(model_name.to_string(), dimensions);
93            }
94
95            // Migrate stored paths to normalized format (strip UNC prefix, forward slashes).
96            // Existing stores may have Windows backslash paths or \\?\ prefixed paths.
97            store.migrate_paths();
98
99            Ok(store)
100        } else {
101            Ok(Self::new(model_name.to_string(), dimensions))
102        }
103    }
104
105    /// Save to database directory
106    pub fn save(&self, db_path: &Path) -> Result<()> {
107        let meta_path = db_path.join(Self::FILENAME);
108        let content = serde_json::to_string_pretty(self)?;
109        fs::write(meta_path, content)?;
110        Ok(())
111    }
112
113    /// Migrate stored paths to normalized format.
114    ///
115    /// Existing stores may have Windows backslash paths (`C:\foo\bar.rs`) or
116    /// UNC prefixed paths (`\\?\C:\foo\bar.rs`). This re-keys the HashMap
117    /// to use the canonical normalized form (forward slashes, no UNC prefix).
118    fn migrate_paths(&mut self) {
119        let old_files = std::mem::take(&mut self.files);
120        let capacity = old_files.len();
121        let mut new_files = HashMap::with_capacity(capacity);
122        let mut migrated = 0;
123
124        for (old_key, meta) in old_files {
125            let new_key = normalize_path_str(&old_key);
126            if new_key != old_key {
127                migrated += 1;
128            }
129            new_files.insert(new_key, meta);
130        }
131
132        self.files = new_files;
133
134        if migrated > 0 {
135            tracing::info!("🔄 Migrated {} file paths to normalized format", migrated);
136        }
137    }
138
139    /// Compute SHA256 hash of file content
140    pub fn compute_hash(path: &Path) -> Result<String> {
141        let content = fs::read(path)?;
142        let mut hasher = Sha256::new();
143        hasher.update(&content);
144        Ok(format!("{:x}", hasher.finalize()))
145    }
146
147    /// Get file modification time as unix timestamp
148    fn get_mtime(path: &Path) -> Result<u64> {
149        let metadata = fs::metadata(path)?;
150        let mtime = metadata.modified()?;
151        Ok(mtime.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
152    }
153
154    /// Check if a file needs re-indexing
155    /// Returns: (needs_reindex, existing_chunk_ids_to_delete)
156    pub fn check_file(&self, path: &Path) -> Result<(bool, Vec<u32>)> {
157        let path_str = normalize_path(path);
158
159        // Get current file stats
160        let current_mtime = Self::get_mtime(path)?;
161        let current_size = fs::metadata(path)?.len();
162
163        if let Some(meta) = self.files.get(&path_str) {
164            // Quick check: if mtime and size unchanged, file is unchanged
165            if meta.mtime == current_mtime && meta.size == current_size {
166                return Ok((false, vec![]));
167            }
168
169            // Mtime changed - compute hash to be sure
170            let current_hash = Self::compute_hash(path)?;
171            if meta.hash == current_hash {
172                // Content same, just update mtime
173                return Ok((false, vec![]));
174            }
175
176            // File changed - return old chunk IDs for deletion
177            Ok((true, meta.chunk_ids.clone()))
178        } else {
179            // New file
180            Ok((true, vec![]))
181        }
182    }
183
184    /// Update metadata for a file after indexing
185    pub fn update_file(&mut self, path: &Path, chunk_ids: Vec<u32>) -> Result<()> {
186        let path_str = normalize_path(path);
187        let hash = Self::compute_hash(path)?;
188        let mtime = Self::get_mtime(path)?;
189        let size = fs::metadata(path)?.len();
190
191        self.files.insert(
192            path_str,
193            FileMeta {
194                hash,
195                mtime,
196                size,
197                chunk_count: chunk_ids.len(),
198                chunk_ids,
199            },
200        );
201
202        Ok(())
203    }
204
205    /// Mark a file as deleted
206    pub fn remove_file(&mut self, path: &Path) -> Option<FileMeta> {
207        let path_str = normalize_path(path);
208        self.files.remove(&path_str)
209    }
210
211    /// Get all tracked files
212    #[allow(dead_code)] // Reserved for file listing feature
213    pub fn tracked_files(&self) -> impl Iterator<Item = &String> {
214        self.files.keys()
215    }
216
217    /// Find files that were deleted (exist in store but not on disk)
218    pub fn find_deleted_files(&self) -> Vec<(String, Vec<u32>)> {
219        self.files
220            .iter()
221            .filter(|(path, _)| !Path::new(path).exists())
222            .map(|(path, meta)| (path.clone(), meta.chunk_ids.clone()))
223            .collect()
224    }
225
226    /// Get statistics
227    #[allow(dead_code)] // Reserved for stats display
228    pub fn stats(&self) -> FileMetaStats {
229        let total_chunks: usize = self.files.values().map(|m| m.chunk_count).sum();
230        let total_size: u64 = self.files.values().map(|m| m.size).sum();
231
232        FileMetaStats {
233            total_files: self.files.len(),
234            total_chunks,
235            total_size_bytes: total_size,
236        }
237    }
238
239    /// Clear all entries (for full re-index)
240    #[allow(dead_code)] // Reserved for index reset
241    pub fn clear(&mut self) {
242        self.files.clear();
243        self.last_full_index = None;
244    }
245
246    /// Set last full index time
247    pub fn mark_full_index(&mut self) {
248        self.last_full_index = Some(
249            SystemTime::now()
250                .duration_since(SystemTime::UNIX_EPOCH)
251                .unwrap()
252                .as_secs(),
253        );
254    }
255}
256
/// Aggregate figures produced by the store's `stats()` method.
#[derive(Debug)]
#[allow(dead_code)] // Used with stats() method
pub struct FileMetaStats {
    /// Number of tracked files
    pub total_files: usize,
    /// Sum of the per-file chunk counts
    pub total_chunks: usize,
    /// Sum of the per-file sizes, in bytes
    pub total_size_bytes: u64,
}

impl FileMetaStats {
    /// Total size expressed in mebibytes (bytes / 1024 / 1024).
    #[allow(dead_code)] // Reserved for stats display
    pub fn total_size_mb(&self) -> f64 {
        const BYTES_PER_MB: f64 = 1024.0 * 1024.0;
        let bytes = self.total_size_bytes as f64;
        bytes / BYTES_PER_MB
    }
}
271
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Run `normalize_path` on a path given as a string.
    fn norm(raw: &str) -> String {
        normalize_path(Path::new(raw))
    }

    /// Build a `FileMeta` whose `chunk_count` matches the number of IDs given.
    fn sample_meta(hash: &str, mtime: u64, size: u64, chunk_ids: Vec<u32>) -> FileMeta {
        FileMeta {
            hash: hash.to_string(),
            mtime,
            size,
            chunk_count: chunk_ids.len(),
            chunk_ids,
        }
    }

    #[test]
    fn test_normalize_path_strips_unc_prefix() {
        let normalized = norm(r"\\?\C:\WorkArea\AI\codesearch\src\main.rs");
        assert_eq!(normalized, "C:/WorkArea/AI/codesearch/src/main.rs");
    }

    #[test]
    fn test_normalize_path_converts_backslashes() {
        let normalized = norm(r"C:\WorkArea\AI\codesearch\src\main.rs");
        assert_eq!(normalized, "C:/WorkArea/AI/codesearch/src/main.rs");
    }

    #[test]
    fn test_normalize_path_forward_slashes_unchanged() {
        // Whatever `Path` does with forward slashes on this platform, the
        // output must contain no backslash and no verbatim prefix.
        let normalized = norm("C:/WorkArea/AI/codesearch/src/main.rs");
        assert!(!normalized.contains('\\'));
        assert!(!normalized.starts_with(r"\\?\"));
    }

    #[test]
    fn test_normalize_path_str_strips_unc() {
        let normalized = normalize_path_str(r"\\?\C:\foo\bar.rs");
        assert_eq!(normalized, "C:/foo/bar.rs");
    }

    #[test]
    fn test_normalize_path_unix_style() {
        // A Unix/Linux/macOS path comes back exactly as given.
        let unix_path = "/home/user/project/src/main.rs";
        assert_eq!(norm(unix_path), unix_path);
    }

    #[test]
    fn test_normalize_path_mixed_separators() {
        // Both separator kinds in one path end up as forward slashes.
        let normalized = norm(r"C:\Users\project/src/lib.rs");
        assert_eq!(normalized, "C:/Users/project/src/lib.rs");
    }

    #[test]
    fn test_normalize_path_str_mixed_separators() {
        let normalized = normalize_path_str(r"C:\Users\project/src/lib.rs");
        assert_eq!(normalized, "C:/Users/project/src/lib.rs");
    }

    #[test]
    fn test_normalize_path_already_normalized() {
        // Input that is already in normalized form is returned as-is.
        let already = "C:/WorkArea/AI/codesearch/src/main.rs";
        assert_eq!(norm(already), already);
    }

    #[test]
    fn test_normalize_path_deeply_nested() {
        // Many path components behind a verbatim prefix.
        let normalized = norm(r"\\?\C:\Very\Deep\Nested\Path\To\Some\File.rs");
        assert_eq!(normalized, "C:/Very/Deep/Nested/Path/To/Some/File.rs");
    }

    #[test]
    fn test_normalize_path_consecutive_backslashes() {
        // Doubled backslashes are converted one-for-one, not collapsed.
        let normalized = norm(r"C:\\Double\\Backslashes\\file.rs");
        assert_eq!(normalized, "C://Double//Backslashes//file.rs");
    }

    #[test]
    fn test_migrate_paths_normalizes_keys() {
        let backslash_key = r"C:\WorkArea\src\main.rs";
        let verbatim_key = r"\\?\C:\WorkArea\src\lib.rs";

        // Seed the map directly with old-format keys.
        let mut store = FileMetaStore::new("test-model".to_string(), 384);
        store.files.insert(
            backslash_key.to_string(),
            sample_meta("abc123", 1000, 100, vec![1, 2]),
        );
        store.files.insert(
            verbatim_key.to_string(),
            sample_meta("def456", 2000, 200, vec![3, 4, 5]),
        );

        store.migrate_paths();

        // Normalized keys are present ...
        for expected in ["C:/WorkArea/src/main.rs", "C:/WorkArea/src/lib.rs"] {
            assert!(store.files.contains_key(expected));
        }
        // ... and the old spellings are gone.
        for stale in [backslash_key, verbatim_key] {
            assert!(!store.files.contains_key(stale));
        }
    }

    #[test]
    fn test_file_meta_store() {
        let workspace = tempdir().unwrap();
        let db_path = workspace.path();
        let sample = db_path.join("test.txt");
        fs::write(&sample, "hello world").unwrap();

        let mut store = FileMetaStore::new("test-model".to_string(), 384);

        // An untracked file must be indexed and has nothing to delete.
        let (needs_reindex, stale_chunks) = store.check_file(&sample).unwrap();
        assert!(needs_reindex);
        assert!(stale_chunks.is_empty());

        // Once recorded, the unchanged file is skipped.
        store.update_file(&sample, vec![1, 2, 3]).unwrap();
        let (needs_reindex, _) = store.check_file(&sample).unwrap();
        assert!(!needs_reindex);

        // Changing the content triggers a re-index and hands back the old chunk IDs.
        fs::write(&sample, "hello world modified").unwrap();
        let (needs_reindex, stale_chunks) = store.check_file(&sample).unwrap();
        assert!(needs_reindex);
        assert_eq!(stale_chunks, vec![1, 2, 3]);

        // The store survives a save/load round trip.
        store.save(db_path).unwrap();
        let reloaded = FileMetaStore::load_or_create(db_path, "test-model", 384).unwrap();
        assert_eq!(reloaded.files.len(), 1);
    }
}
430}