codeprysm_core/
merkle.rs

1//! Merkle Tree Manager for Code Graph Incremental Updates
2//!
3//! This module provides efficient file change detection using content hashing
4//! for the code graph generation system.
5
6use ignore::WalkBuilder;
7use rayon::prelude::*;
8use sha2::{Digest, Sha256};
9use std::collections::{HashMap, HashSet};
10use std::fs::File;
11use std::io::{BufReader, Read};
12use std::path::{Path, PathBuf};
13use thiserror::Error;
14use tracing::{debug, info, warn};
15
16/// Errors that can occur during Merkle tree operations.
17#[derive(Error, Debug)]
18pub enum MerkleError {
19    #[error("IO error: {0}")]
20    Io(#[from] std::io::Error),
21
22    #[error("JSON error: {0}")]
23    Json(#[from] serde_json::Error),
24
25    #[error("Repository path does not exist: {0}")]
26    RepoNotFound(PathBuf),
27
28    #[error("Failed to hash file {path}: {reason}")]
29    HashError { path: PathBuf, reason: String },
30}
31
32/// Result type for Merkle tree operations.
33pub type Result<T> = std::result::Result<T, MerkleError>;
34
35/// Represents detected changes between two Merkle trees.
36#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
37pub struct ChangeSet {
38    /// Files that were modified (content changed).
39    pub modified: Vec<String>,
40    /// Files that were added.
41    pub added: Vec<String>,
42    /// Files that were deleted.
43    pub deleted: Vec<String>,
44}
45
46impl ChangeSet {
47    /// Create a new empty ChangeSet.
48    pub fn new() -> Self {
49        Self::default()
50    }
51
52    /// Check if any changes were detected.
53    pub fn has_changes(&self) -> bool {
54        !self.modified.is_empty() || !self.added.is_empty() || !self.deleted.is_empty()
55    }
56
57    /// Total number of changed files.
58    pub fn total_changes(&self) -> usize {
59        self.modified.len() + self.added.len() + self.deleted.len()
60    }
61
62    /// Get all files that need to be re-processed (modified + added).
63    pub fn files_to_process(&self) -> Vec<&str> {
64        self.modified
65            .iter()
66            .chain(self.added.iter())
67            .map(|s| s.as_str())
68            .collect()
69    }
70}
71
72/// Default file extensions to always exclude.
73const DEFAULT_EXCLUDE_EXTENSIONS: &[&str] = &[
74    "*.pyc",
75    "*.pyo",
76    "*.jpg",
77    "*.jpeg",
78    "*.png",
79    "*.gif",
80    "*.bmp",
81    "*.ico",
82    "*.svg",
83    "*.pdf",
84    "*.zip",
85    "*.tar",
86    "*.gz",
87    "*.rar",
88    "*.7z",
89    "*.exe",
90    "*.dll",
91    "*.so",
92    "*.dylib",
93    "*.o",
94    "*.a",
95    "*.lib",
96    "*.class",
97    "*.jar",
98    "*.war",
99    "*.whl",
100    "*.egg",
101    "*.db",
102    "*.sqlite",
103    "*.sqlite3",
104];
105
106/// Default directories to always exclude.
107const DEFAULT_EXCLUDE_DIRS: &[&str] = &[
108    ".git",
109    ".DS_Store",
110    "__pycache__",
111    "node_modules",
112    ".venv",
113    "venv",
114    ".env",
115    "target",
116    "build",
117    "dist",
118    ".idea",
119    ".vscode",
120];
121
122/// Filter for determining which files should be excluded from processing.
123#[derive(Debug, Clone)]
124pub struct ExclusionFilter {
125    /// Glob patterns for files to exclude.
126    exclude_patterns: Vec<glob::Pattern>,
127    /// Directory names to exclude.
128    exclude_dirs: HashSet<String>,
129    /// Whether to exclude hidden files/directories.
130    exclude_hidden: bool,
131}
132
133impl Default for ExclusionFilter {
134    fn default() -> Self {
135        Self::new(None, true)
136    }
137}
138
139impl ExclusionFilter {
140    /// Create a new exclusion filter with optional custom patterns.
141    ///
142    /// # Arguments
143    /// * `custom_patterns` - Additional glob patterns to exclude
144    /// * `exclude_hidden` - Whether to exclude hidden files/directories (starting with '.')
145    pub fn new(custom_patterns: Option<&[&str]>, exclude_hidden: bool) -> Self {
146        let mut patterns = Vec::new();
147
148        // Add default extension patterns
149        for ext in DEFAULT_EXCLUDE_EXTENSIONS {
150            if let Ok(p) = glob::Pattern::new(ext) {
151                patterns.push(p);
152            }
153        }
154
155        // Add custom patterns
156        if let Some(custom) = custom_patterns {
157            for pattern in custom {
158                if let Ok(p) = glob::Pattern::new(pattern) {
159                    patterns.push(p);
160                }
161            }
162        }
163
164        // Build directory exclusion set
165        let exclude_dirs: HashSet<String> =
166            DEFAULT_EXCLUDE_DIRS.iter().map(|s| s.to_string()).collect();
167
168        Self {
169            exclude_patterns: patterns,
170            exclude_dirs,
171            exclude_hidden,
172        }
173    }
174
175    /// Check if a path should be excluded.
176    pub fn should_exclude(&self, path: &Path) -> bool {
177        let path_str = path.to_string_lossy();
178
179        // Check for hidden files/directories
180        if self.exclude_hidden {
181            for component in path.components() {
182                if let std::path::Component::Normal(name) = component {
183                    if let Some(name_str) = name.to_str() {
184                        if name_str.starts_with('.') && name_str != "." && name_str != ".." {
185                            return true;
186                        }
187                    }
188                }
189            }
190        }
191
192        // Check directory exclusions
193        for component in path.components() {
194            if let std::path::Component::Normal(name) = component {
195                if let Some(name_str) = name.to_str() {
196                    if self.exclude_dirs.contains(name_str) {
197                        return true;
198                    }
199                }
200            }
201        }
202
203        // Check glob patterns against filename
204        if let Some(filename) = path.file_name() {
205            let filename_str = filename.to_string_lossy();
206            for pattern in &self.exclude_patterns {
207                if pattern.matches(&filename_str) {
208                    return true;
209                }
210            }
211        }
212
213        // Check glob patterns against full path
214        for pattern in &self.exclude_patterns {
215            if pattern.matches(&path_str) {
216                return true;
217            }
218        }
219
220        false
221    }
222
223    /// Check if a directory should be skipped entirely.
224    pub fn should_skip_dir(&self, dir_name: &str) -> bool {
225        if self.exclude_hidden && dir_name.starts_with('.') && dir_name != "." && dir_name != ".." {
226            return true;
227        }
228        self.exclude_dirs.contains(dir_name)
229    }
230
231    /// Check if hidden files/directories are excluded.
232    pub fn excludes_hidden(&self) -> bool {
233        self.exclude_hidden
234    }
235}
236
237/// Merkle tree represented as a map of file paths to their content hashes.
238pub type MerkleTree = HashMap<String, String>;
239
240/// Statistics about a Merkle tree.
241#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
242pub struct TreeStats {
243    /// Total number of files in the tree.
244    pub total_files: usize,
245    /// Number of unique directories.
246    pub total_dirs: usize,
247    /// Average path depth.
248    pub avg_path_depth: f64,
249}
250
251/// Manages Merkle trees for efficient change detection in code repositories.
252///
253/// Uses file content hashes to build a tree representation of the repository
254/// state, enabling fast detection of file changes without parsing.
255#[derive(Debug, Clone)]
256pub struct MerkleTreeManager {
257    exclusion_filter: ExclusionFilter,
258}
259
260impl Default for MerkleTreeManager {
261    fn default() -> Self {
262        Self::new(ExclusionFilter::default())
263    }
264}
265
266impl MerkleTreeManager {
267    /// Create a new MerkleTreeManager with the given exclusion filter.
268    pub fn new(exclusion_filter: ExclusionFilter) -> Self {
269        Self { exclusion_filter }
270    }
271
272    /// Build a Merkle tree for the given repository.
273    ///
274    /// Uses rayon for parallel file hashing to maximize performance.
275    ///
276    /// # Arguments
277    /// * `repo_path` - Path to the repository root
278    ///
279    /// # Returns
280    /// HashMap mapping relative file paths to their SHA-256 hashes
281    pub fn build_merkle_tree(&self, repo_path: &Path) -> Result<MerkleTree> {
282        let repo_path = repo_path
283            .canonicalize()
284            .map_err(|_| MerkleError::RepoNotFound(repo_path.to_path_buf()))?;
285
286        info!("Building Merkle tree for {:?}", repo_path);
287        let start = std::time::Instant::now();
288
289        // Find all files to process
290        let files = self.find_files(&repo_path)?;
291        info!("Found {} files to process", files.len());
292
293        // Hash files in parallel using rayon
294        let file_hashes: Vec<(String, Option<String>)> = files
295            .par_iter()
296            .map(|(abs_path, rel_path)| {
297                let hash = hash_file(abs_path);
298                (rel_path.clone(), hash)
299            })
300            .collect();
301
302        // Collect successful hashes
303        let mut tree = HashMap::new();
304        let mut failed_count = 0;
305
306        for (rel_path, hash_result) in file_hashes {
307            match hash_result {
308                Some(hash) => {
309                    tree.insert(rel_path, hash);
310                }
311                None => {
312                    failed_count += 1;
313                    debug!("Failed to hash: {}", rel_path);
314                }
315            }
316        }
317
318        let elapsed = start.elapsed();
319        let rate = tree.len() as f64 / elapsed.as_secs_f64();
320        info!(
321            "Built Merkle tree: {} files in {:.2}s ({:.0} files/sec)",
322            tree.len(),
323            elapsed.as_secs_f64(),
324            rate
325        );
326
327        if failed_count > 0 {
328            warn!("Failed to hash {} files", failed_count);
329        }
330
331        Ok(tree)
332    }
333
334    /// Find all files in the repository that should be included.
335    ///
336    /// Uses the `ignore` crate to respect `.gitignore` patterns automatically.
337    fn find_files(&self, repo_path: &Path) -> Result<Vec<(PathBuf, String)>> {
338        let mut files = Vec::new();
339
340        // Use ignore::WalkBuilder which respects .gitignore by default
341        let walker = WalkBuilder::new(repo_path)
342            .follow_links(false)
343            .hidden(self.exclusion_filter.excludes_hidden()) // Respect hidden file setting
344            .git_ignore(true) // Respect .gitignore
345            .git_global(true) // Respect global gitignore
346            .git_exclude(true) // Respect .git/info/exclude
347            .add_custom_ignore_filename(".codeprysmignore") // Respect .codeprysmignore
348            .build();
349
350        for entry in walker {
351            let entry = match entry {
352                Ok(e) => e,
353                Err(e) => {
354                    warn!("Error walking directory: {}", e);
355                    continue;
356                }
357            };
358
359            // Skip directories - we only want files
360            let file_type = match entry.file_type() {
361                Some(ft) => ft,
362                None => continue,
363            };
364            if !file_type.is_file() {
365                continue;
366            }
367
368            // Skip excluded directories (additional exclusions beyond gitignore)
369            let abs_path = entry.path();
370            let rel_path = abs_path
371                .strip_prefix(repo_path)
372                .unwrap_or(abs_path)
373                .to_string_lossy()
374                .replace('\\', "/"); // Normalize path separators
375
376            // Check our additional exclusion filter (for patterns not in .gitignore)
377            if self.exclusion_filter.should_exclude(Path::new(&rel_path)) {
378                continue;
379            }
380
381            files.push((abs_path.to_path_buf(), rel_path));
382        }
383
384        Ok(files)
385    }
386
387    /// Detect changes between two Merkle trees.
388    ///
389    /// # Arguments
390    /// * `old_tree` - Previous tree state (path -> hash)
391    /// * `new_tree` - Current tree state (path -> hash)
392    ///
393    /// # Returns
394    /// ChangeSet with detected changes
395    pub fn detect_changes(&self, old_tree: &MerkleTree, new_tree: &MerkleTree) -> ChangeSet {
396        info!("Detecting changes between Merkle trees");
397        let start = std::time::Instant::now();
398
399        let old_files: HashSet<&String> = old_tree.keys().collect();
400        let new_files: HashSet<&String> = new_tree.keys().collect();
401
402        // Find modified files (in both trees but with different hashes)
403        let modified: Vec<String> = old_files
404            .intersection(&new_files)
405            .filter(|path| old_tree.get(**path) != new_tree.get(**path))
406            .map(|s| (*s).clone())
407            .collect();
408
409        // Find added files (in new but not old)
410        let added: Vec<String> = new_files
411            .difference(&old_files)
412            .map(|s| (*s).clone())
413            .collect();
414
415        // Find deleted files (in old but not new)
416        let deleted: Vec<String> = old_files
417            .difference(&new_files)
418            .map(|s| (*s).clone())
419            .collect();
420
421        let changeset = ChangeSet {
422            modified,
423            added,
424            deleted,
425        };
426
427        let elapsed = start.elapsed();
428        info!(
429            "Change detection completed in {:.3}s: {} modified, {} added, {} deleted",
430            elapsed.as_secs_f64(),
431            changeset.modified.len(),
432            changeset.added.len(),
433            changeset.deleted.len()
434        );
435
436        changeset
437    }
438
439    /// Get statistics about a Merkle tree.
440    pub fn get_tree_stats(&self, tree: &MerkleTree) -> TreeStats {
441        if tree.is_empty() {
442            return TreeStats {
443                total_files: 0,
444                total_dirs: 0,
445                avg_path_depth: 0.0,
446            };
447        }
448
449        let total_files = tree.len();
450        let mut dirs: HashSet<String> = HashSet::new();
451        let mut total_depth = 0usize;
452
453        for file_path in tree.keys() {
454            let path = Path::new(file_path);
455            let depth = path.components().count();
456            total_depth += depth;
457
458            // Collect all parent directories
459            let mut current = PathBuf::new();
460            for component in path.components() {
461                if let std::path::Component::Normal(_) = component {
462                    current.push(component);
463                    if current != path {
464                        dirs.insert(current.to_string_lossy().to_string());
465                    }
466                }
467            }
468        }
469
470        TreeStats {
471            total_files,
472            total_dirs: dirs.len(),
473            avg_path_depth: total_depth as f64 / total_files as f64,
474        }
475    }
476
477    /// Save a Merkle tree to a JSON file.
478    pub fn save_tree_to_file(&self, tree: &MerkleTree, file_path: &Path) -> Result<()> {
479        let file = File::create(file_path)?;
480        serde_json::to_writer_pretty(file, tree)?;
481        info!(
482            "Saved Merkle tree with {} files to {:?}",
483            tree.len(),
484            file_path
485        );
486        Ok(())
487    }
488
489    /// Load a Merkle tree from a JSON file.
490    pub fn load_tree_from_file(&self, file_path: &Path) -> Result<MerkleTree> {
491        match File::open(file_path) {
492            Ok(file) => {
493                let reader = BufReader::new(file);
494                let tree: MerkleTree = serde_json::from_reader(reader)?;
495                info!(
496                    "Loaded Merkle tree with {} files from {:?}",
497                    tree.len(),
498                    file_path
499                );
500                Ok(tree)
501            }
502            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
503                warn!("Merkle tree file not found: {:?}", file_path);
504                Ok(HashMap::new())
505            }
506            Err(e) => Err(MerkleError::Io(e)),
507        }
508    }
509}
510
511/// Compute SHA-256 hash of a single file.
512///
513/// Reads the file in chunks to handle large files efficiently.
514/// Compute the SHA-256 hash of a file's contents.
515///
516/// This is the public API for file hashing used by the graph builder.
517///
518/// # Arguments
519///
520/// * `file_path` - Path to the file to hash
521///
522/// # Returns
523///
524/// The hex-encoded SHA-256 hash string, or an IO error.
525pub fn compute_file_hash(file_path: &Path) -> std::io::Result<String> {
526    hash_file(file_path).ok_or_else(|| {
527        std::io::Error::other(format!("Failed to hash file: {}", file_path.display()))
528    })
529}
530
531fn hash_file(file_path: &Path) -> Option<String> {
532    let file = match File::open(file_path) {
533        Ok(f) => f,
534        Err(e) => {
535            debug!("Cannot open {:?}: {}", file_path, e);
536            return None;
537        }
538    };
539
540    let mut reader = BufReader::with_capacity(8192, file);
541    let mut hasher = Sha256::new();
542    let mut buffer = [0u8; 8192];
543
544    loop {
545        match reader.read(&mut buffer) {
546            Ok(0) => break,
547            Ok(n) => hasher.update(&buffer[..n]),
548            Err(e) => {
549                debug!("Error reading {:?}: {}", file_path, e);
550                return None;
551            }
552        }
553    }
554
555    Some(format!("{:x}", hasher.finalize()))
556}
557
558#[cfg(test)]
559mod tests {
560    use super::*;
561    use std::fs;
562    use tempfile::TempDir;
563
564    #[test]
565    fn test_changeset_empty() {
566        let cs = ChangeSet::new();
567        assert!(!cs.has_changes());
568        assert_eq!(cs.total_changes(), 0);
569    }
570
571    #[test]
572    fn test_changeset_with_changes() {
573        let cs = ChangeSet {
574            modified: vec!["a.rs".to_string()],
575            added: vec!["b.rs".to_string(), "c.rs".to_string()],
576            deleted: vec![],
577        };
578        assert!(cs.has_changes());
579        assert_eq!(cs.total_changes(), 3);
580        assert_eq!(cs.files_to_process().len(), 3);
581    }
582
583    #[test]
584    fn test_exclusion_filter_default() {
585        let filter = ExclusionFilter::default();
586
587        // Should exclude hidden files
588        assert!(filter.should_exclude(Path::new(".hidden")));
589        assert!(filter.should_exclude(Path::new("dir/.hidden")));
590
591        // Should exclude default directories
592        assert!(filter.should_exclude(Path::new(".git/config")));
593        assert!(filter.should_exclude(Path::new("node_modules/package.json")));
594
595        // Should exclude binary extensions
596        assert!(filter.should_exclude(Path::new("image.png")));
597        assert!(filter.should_exclude(Path::new("lib.so")));
598
599        // Should not exclude source files
600        assert!(!filter.should_exclude(Path::new("main.rs")));
601        assert!(!filter.should_exclude(Path::new("src/lib.rs")));
602    }
603
604    #[test]
605    fn test_hash_file() {
606        let temp_dir = TempDir::new().unwrap();
607        let file_path = temp_dir.path().join("test.txt");
608        fs::write(&file_path, "hello world").unwrap();
609
610        let hash = hash_file(&file_path);
611        assert!(hash.is_some());
612        // SHA-256 hash of "hello world"
613        assert_eq!(
614            hash.unwrap(),
615            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
616        );
617    }
618
619    #[test]
620    fn test_merkle_tree_build() {
621        let temp_dir = TempDir::new().unwrap();
622
623        // Create some test files
624        fs::write(temp_dir.path().join("main.rs"), "fn main() {}").unwrap();
625        fs::create_dir(temp_dir.path().join("src")).unwrap();
626        fs::write(temp_dir.path().join("src/lib.rs"), "pub mod foo;").unwrap();
627
628        // Use filter that doesn't exclude hidden paths (tempdir paths may contain hidden components)
629        let filter = ExclusionFilter::new(None, false);
630        let manager = MerkleTreeManager::new(filter);
631        let tree = manager.build_merkle_tree(temp_dir.path()).unwrap();
632
633        assert_eq!(tree.len(), 2);
634        assert!(tree.contains_key("main.rs"));
635        assert!(tree.contains_key("src/lib.rs"));
636    }
637
638    #[test]
639    fn test_detect_changes() {
640        let manager = MerkleTreeManager::default();
641
642        let mut old_tree = HashMap::new();
643        old_tree.insert("a.rs".to_string(), "hash1".to_string());
644        old_tree.insert("b.rs".to_string(), "hash2".to_string());
645        old_tree.insert("c.rs".to_string(), "hash3".to_string());
646
647        let mut new_tree = HashMap::new();
648        new_tree.insert("a.rs".to_string(), "hash1".to_string()); // unchanged
649        new_tree.insert("b.rs".to_string(), "hash2_modified".to_string()); // modified
650        new_tree.insert("d.rs".to_string(), "hash4".to_string()); // added
651
652        let changes = manager.detect_changes(&old_tree, &new_tree);
653
654        assert_eq!(changes.modified, vec!["b.rs"]);
655        assert_eq!(changes.added, vec!["d.rs"]);
656        assert_eq!(changes.deleted, vec!["c.rs"]);
657    }
658
659    #[test]
660    fn test_tree_stats() {
661        let manager = MerkleTreeManager::default();
662
663        let mut tree = HashMap::new();
664        tree.insert("main.rs".to_string(), "hash1".to_string());
665        tree.insert("src/lib.rs".to_string(), "hash2".to_string());
666        tree.insert("src/core/mod.rs".to_string(), "hash3".to_string());
667
668        let stats = manager.get_tree_stats(&tree);
669
670        assert_eq!(stats.total_files, 3);
671        assert!(stats.total_dirs >= 2); // src and src/core
672    }
673
674    #[test]
675    fn test_save_and_load_tree() {
676        let temp_dir = TempDir::new().unwrap();
677        let tree_path = temp_dir.path().join("tree.json");
678
679        let manager = MerkleTreeManager::default();
680
681        let mut tree = HashMap::new();
682        tree.insert("main.rs".to_string(), "hash1".to_string());
683        tree.insert("lib.rs".to_string(), "hash2".to_string());
684
685        manager.save_tree_to_file(&tree, &tree_path).unwrap();
686        let loaded_tree = manager.load_tree_from_file(&tree_path).unwrap();
687
688        assert_eq!(tree, loaded_tree);
689    }
690}