1use ignore::WalkBuilder;
7use rayon::prelude::*;
8use sha2::{Digest, Sha256};
9use std::collections::{HashMap, HashSet};
10use std::fs::File;
11use std::io::{BufReader, Read};
12use std::path::{Path, PathBuf};
13use thiserror::Error;
14use tracing::{debug, info, warn};
15
16#[derive(Error, Debug)]
18pub enum MerkleError {
19 #[error("IO error: {0}")]
20 Io(#[from] std::io::Error),
21
22 #[error("JSON error: {0}")]
23 Json(#[from] serde_json::Error),
24
25 #[error("Repository path does not exist: {0}")]
26 RepoNotFound(PathBuf),
27
28 #[error("Failed to hash file {path}: {reason}")]
29 HashError { path: PathBuf, reason: String },
30}
31
32pub type Result<T> = std::result::Result<T, MerkleError>;
34
35#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
37pub struct ChangeSet {
38 pub modified: Vec<String>,
40 pub added: Vec<String>,
42 pub deleted: Vec<String>,
44}
45
46impl ChangeSet {
47 pub fn new() -> Self {
49 Self::default()
50 }
51
52 pub fn has_changes(&self) -> bool {
54 !self.modified.is_empty() || !self.added.is_empty() || !self.deleted.is_empty()
55 }
56
57 pub fn total_changes(&self) -> usize {
59 self.modified.len() + self.added.len() + self.deleted.len()
60 }
61
62 pub fn files_to_process(&self) -> Vec<&str> {
64 self.modified
65 .iter()
66 .chain(self.added.iter())
67 .map(|s| s.as_str())
68 .collect()
69 }
70}
71
72const DEFAULT_EXCLUDE_EXTENSIONS: &[&str] = &[
74 "*.pyc",
75 "*.pyo",
76 "*.jpg",
77 "*.jpeg",
78 "*.png",
79 "*.gif",
80 "*.bmp",
81 "*.ico",
82 "*.svg",
83 "*.pdf",
84 "*.zip",
85 "*.tar",
86 "*.gz",
87 "*.rar",
88 "*.7z",
89 "*.exe",
90 "*.dll",
91 "*.so",
92 "*.dylib",
93 "*.o",
94 "*.a",
95 "*.lib",
96 "*.class",
97 "*.jar",
98 "*.war",
99 "*.whl",
100 "*.egg",
101 "*.db",
102 "*.sqlite",
103 "*.sqlite3",
104];
105
106const DEFAULT_EXCLUDE_DIRS: &[&str] = &[
108 ".git",
109 ".DS_Store",
110 "__pycache__",
111 "node_modules",
112 ".venv",
113 "venv",
114 ".env",
115 "target",
116 "build",
117 "dist",
118 ".idea",
119 ".vscode",
120];
121
122#[derive(Debug, Clone)]
124pub struct ExclusionFilter {
125 exclude_patterns: Vec<glob::Pattern>,
127 exclude_dirs: HashSet<String>,
129 exclude_hidden: bool,
131}
132
133impl Default for ExclusionFilter {
134 fn default() -> Self {
135 Self::new(None, true)
136 }
137}
138
139impl ExclusionFilter {
140 pub fn new(custom_patterns: Option<&[&str]>, exclude_hidden: bool) -> Self {
146 let mut patterns = Vec::new();
147
148 for ext in DEFAULT_EXCLUDE_EXTENSIONS {
150 if let Ok(p) = glob::Pattern::new(ext) {
151 patterns.push(p);
152 }
153 }
154
155 if let Some(custom) = custom_patterns {
157 for pattern in custom {
158 if let Ok(p) = glob::Pattern::new(pattern) {
159 patterns.push(p);
160 }
161 }
162 }
163
164 let exclude_dirs: HashSet<String> =
166 DEFAULT_EXCLUDE_DIRS.iter().map(|s| s.to_string()).collect();
167
168 Self {
169 exclude_patterns: patterns,
170 exclude_dirs,
171 exclude_hidden,
172 }
173 }
174
175 pub fn should_exclude(&self, path: &Path) -> bool {
177 let path_str = path.to_string_lossy();
178
179 if self.exclude_hidden {
181 for component in path.components() {
182 if let std::path::Component::Normal(name) = component {
183 if let Some(name_str) = name.to_str() {
184 if name_str.starts_with('.') && name_str != "." && name_str != ".." {
185 return true;
186 }
187 }
188 }
189 }
190 }
191
192 for component in path.components() {
194 if let std::path::Component::Normal(name) = component {
195 if let Some(name_str) = name.to_str() {
196 if self.exclude_dirs.contains(name_str) {
197 return true;
198 }
199 }
200 }
201 }
202
203 if let Some(filename) = path.file_name() {
205 let filename_str = filename.to_string_lossy();
206 for pattern in &self.exclude_patterns {
207 if pattern.matches(&filename_str) {
208 return true;
209 }
210 }
211 }
212
213 for pattern in &self.exclude_patterns {
215 if pattern.matches(&path_str) {
216 return true;
217 }
218 }
219
220 false
221 }
222
223 pub fn should_skip_dir(&self, dir_name: &str) -> bool {
225 if self.exclude_hidden && dir_name.starts_with('.') && dir_name != "." && dir_name != ".." {
226 return true;
227 }
228 self.exclude_dirs.contains(dir_name)
229 }
230
231 pub fn excludes_hidden(&self) -> bool {
233 self.exclude_hidden
234 }
235}
236
237pub type MerkleTree = HashMap<String, String>;
239
240#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
242pub struct TreeStats {
243 pub total_files: usize,
245 pub total_dirs: usize,
247 pub avg_path_depth: f64,
249}
250
251#[derive(Debug, Clone)]
256pub struct MerkleTreeManager {
257 exclusion_filter: ExclusionFilter,
258}
259
260impl Default for MerkleTreeManager {
261 fn default() -> Self {
262 Self::new(ExclusionFilter::default())
263 }
264}
265
266impl MerkleTreeManager {
267 pub fn new(exclusion_filter: ExclusionFilter) -> Self {
269 Self { exclusion_filter }
270 }
271
272 pub fn build_merkle_tree(&self, repo_path: &Path) -> Result<MerkleTree> {
282 let repo_path = repo_path
283 .canonicalize()
284 .map_err(|_| MerkleError::RepoNotFound(repo_path.to_path_buf()))?;
285
286 info!("Building Merkle tree for {:?}", repo_path);
287 let start = std::time::Instant::now();
288
289 let files = self.find_files(&repo_path)?;
291 info!("Found {} files to process", files.len());
292
293 let file_hashes: Vec<(String, Option<String>)> = files
295 .par_iter()
296 .map(|(abs_path, rel_path)| {
297 let hash = hash_file(abs_path);
298 (rel_path.clone(), hash)
299 })
300 .collect();
301
302 let mut tree = HashMap::new();
304 let mut failed_count = 0;
305
306 for (rel_path, hash_result) in file_hashes {
307 match hash_result {
308 Some(hash) => {
309 tree.insert(rel_path, hash);
310 }
311 None => {
312 failed_count += 1;
313 debug!("Failed to hash: {}", rel_path);
314 }
315 }
316 }
317
318 let elapsed = start.elapsed();
319 let rate = tree.len() as f64 / elapsed.as_secs_f64();
320 info!(
321 "Built Merkle tree: {} files in {:.2}s ({:.0} files/sec)",
322 tree.len(),
323 elapsed.as_secs_f64(),
324 rate
325 );
326
327 if failed_count > 0 {
328 warn!("Failed to hash {} files", failed_count);
329 }
330
331 Ok(tree)
332 }
333
334 fn find_files(&self, repo_path: &Path) -> Result<Vec<(PathBuf, String)>> {
338 let mut files = Vec::new();
339
340 let walker = WalkBuilder::new(repo_path)
342 .follow_links(false)
343 .hidden(self.exclusion_filter.excludes_hidden()) .git_ignore(true) .git_global(true) .git_exclude(true) .add_custom_ignore_filename(".codeprysmignore") .build();
349
350 for entry in walker {
351 let entry = match entry {
352 Ok(e) => e,
353 Err(e) => {
354 warn!("Error walking directory: {}", e);
355 continue;
356 }
357 };
358
359 let file_type = match entry.file_type() {
361 Some(ft) => ft,
362 None => continue,
363 };
364 if !file_type.is_file() {
365 continue;
366 }
367
368 let abs_path = entry.path();
370 let rel_path = abs_path
371 .strip_prefix(repo_path)
372 .unwrap_or(abs_path)
373 .to_string_lossy()
374 .replace('\\', "/"); if self.exclusion_filter.should_exclude(Path::new(&rel_path)) {
378 continue;
379 }
380
381 files.push((abs_path.to_path_buf(), rel_path));
382 }
383
384 Ok(files)
385 }
386
387 pub fn detect_changes(&self, old_tree: &MerkleTree, new_tree: &MerkleTree) -> ChangeSet {
396 info!("Detecting changes between Merkle trees");
397 let start = std::time::Instant::now();
398
399 let old_files: HashSet<&String> = old_tree.keys().collect();
400 let new_files: HashSet<&String> = new_tree.keys().collect();
401
402 let modified: Vec<String> = old_files
404 .intersection(&new_files)
405 .filter(|path| old_tree.get(**path) != new_tree.get(**path))
406 .map(|s| (*s).clone())
407 .collect();
408
409 let added: Vec<String> = new_files
411 .difference(&old_files)
412 .map(|s| (*s).clone())
413 .collect();
414
415 let deleted: Vec<String> = old_files
417 .difference(&new_files)
418 .map(|s| (*s).clone())
419 .collect();
420
421 let changeset = ChangeSet {
422 modified,
423 added,
424 deleted,
425 };
426
427 let elapsed = start.elapsed();
428 info!(
429 "Change detection completed in {:.3}s: {} modified, {} added, {} deleted",
430 elapsed.as_secs_f64(),
431 changeset.modified.len(),
432 changeset.added.len(),
433 changeset.deleted.len()
434 );
435
436 changeset
437 }
438
439 pub fn get_tree_stats(&self, tree: &MerkleTree) -> TreeStats {
441 if tree.is_empty() {
442 return TreeStats {
443 total_files: 0,
444 total_dirs: 0,
445 avg_path_depth: 0.0,
446 };
447 }
448
449 let total_files = tree.len();
450 let mut dirs: HashSet<String> = HashSet::new();
451 let mut total_depth = 0usize;
452
453 for file_path in tree.keys() {
454 let path = Path::new(file_path);
455 let depth = path.components().count();
456 total_depth += depth;
457
458 let mut current = PathBuf::new();
460 for component in path.components() {
461 if let std::path::Component::Normal(_) = component {
462 current.push(component);
463 if current != path {
464 dirs.insert(current.to_string_lossy().to_string());
465 }
466 }
467 }
468 }
469
470 TreeStats {
471 total_files,
472 total_dirs: dirs.len(),
473 avg_path_depth: total_depth as f64 / total_files as f64,
474 }
475 }
476
477 pub fn save_tree_to_file(&self, tree: &MerkleTree, file_path: &Path) -> Result<()> {
479 let file = File::create(file_path)?;
480 serde_json::to_writer_pretty(file, tree)?;
481 info!(
482 "Saved Merkle tree with {} files to {:?}",
483 tree.len(),
484 file_path
485 );
486 Ok(())
487 }
488
489 pub fn load_tree_from_file(&self, file_path: &Path) -> Result<MerkleTree> {
491 match File::open(file_path) {
492 Ok(file) => {
493 let reader = BufReader::new(file);
494 let tree: MerkleTree = serde_json::from_reader(reader)?;
495 info!(
496 "Loaded Merkle tree with {} files from {:?}",
497 tree.len(),
498 file_path
499 );
500 Ok(tree)
501 }
502 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
503 warn!("Merkle tree file not found: {:?}", file_path);
504 Ok(HashMap::new())
505 }
506 Err(e) => Err(MerkleError::Io(e)),
507 }
508 }
509}
510
511pub fn compute_file_hash(file_path: &Path) -> std::io::Result<String> {
526 hash_file(file_path).ok_or_else(|| {
527 std::io::Error::other(format!("Failed to hash file: {}", file_path.display()))
528 })
529}
530
531fn hash_file(file_path: &Path) -> Option<String> {
532 let file = match File::open(file_path) {
533 Ok(f) => f,
534 Err(e) => {
535 debug!("Cannot open {:?}: {}", file_path, e);
536 return None;
537 }
538 };
539
540 let mut reader = BufReader::with_capacity(8192, file);
541 let mut hasher = Sha256::new();
542 let mut buffer = [0u8; 8192];
543
544 loop {
545 match reader.read(&mut buffer) {
546 Ok(0) => break,
547 Ok(n) => hasher.update(&buffer[..n]),
548 Err(e) => {
549 debug!("Error reading {:?}: {}", file_path, e);
550 return None;
551 }
552 }
553 }
554
555 Some(format!("{:x}", hasher.finalize()))
556}
557
558#[cfg(test)]
559mod tests {
560 use super::*;
561 use std::fs;
562 use tempfile::TempDir;
563
564 #[test]
565 fn test_changeset_empty() {
566 let cs = ChangeSet::new();
567 assert!(!cs.has_changes());
568 assert_eq!(cs.total_changes(), 0);
569 }
570
571 #[test]
572 fn test_changeset_with_changes() {
573 let cs = ChangeSet {
574 modified: vec!["a.rs".to_string()],
575 added: vec!["b.rs".to_string(), "c.rs".to_string()],
576 deleted: vec![],
577 };
578 assert!(cs.has_changes());
579 assert_eq!(cs.total_changes(), 3);
580 assert_eq!(cs.files_to_process().len(), 3);
581 }
582
583 #[test]
584 fn test_exclusion_filter_default() {
585 let filter = ExclusionFilter::default();
586
587 assert!(filter.should_exclude(Path::new(".hidden")));
589 assert!(filter.should_exclude(Path::new("dir/.hidden")));
590
591 assert!(filter.should_exclude(Path::new(".git/config")));
593 assert!(filter.should_exclude(Path::new("node_modules/package.json")));
594
595 assert!(filter.should_exclude(Path::new("image.png")));
597 assert!(filter.should_exclude(Path::new("lib.so")));
598
599 assert!(!filter.should_exclude(Path::new("main.rs")));
601 assert!(!filter.should_exclude(Path::new("src/lib.rs")));
602 }
603
604 #[test]
605 fn test_hash_file() {
606 let temp_dir = TempDir::new().unwrap();
607 let file_path = temp_dir.path().join("test.txt");
608 fs::write(&file_path, "hello world").unwrap();
609
610 let hash = hash_file(&file_path);
611 assert!(hash.is_some());
612 assert_eq!(
614 hash.unwrap(),
615 "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
616 );
617 }
618
619 #[test]
620 fn test_merkle_tree_build() {
621 let temp_dir = TempDir::new().unwrap();
622
623 fs::write(temp_dir.path().join("main.rs"), "fn main() {}").unwrap();
625 fs::create_dir(temp_dir.path().join("src")).unwrap();
626 fs::write(temp_dir.path().join("src/lib.rs"), "pub mod foo;").unwrap();
627
628 let filter = ExclusionFilter::new(None, false);
630 let manager = MerkleTreeManager::new(filter);
631 let tree = manager.build_merkle_tree(temp_dir.path()).unwrap();
632
633 assert_eq!(tree.len(), 2);
634 assert!(tree.contains_key("main.rs"));
635 assert!(tree.contains_key("src/lib.rs"));
636 }
637
638 #[test]
639 fn test_detect_changes() {
640 let manager = MerkleTreeManager::default();
641
642 let mut old_tree = HashMap::new();
643 old_tree.insert("a.rs".to_string(), "hash1".to_string());
644 old_tree.insert("b.rs".to_string(), "hash2".to_string());
645 old_tree.insert("c.rs".to_string(), "hash3".to_string());
646
647 let mut new_tree = HashMap::new();
648 new_tree.insert("a.rs".to_string(), "hash1".to_string()); new_tree.insert("b.rs".to_string(), "hash2_modified".to_string()); new_tree.insert("d.rs".to_string(), "hash4".to_string()); let changes = manager.detect_changes(&old_tree, &new_tree);
653
654 assert_eq!(changes.modified, vec!["b.rs"]);
655 assert_eq!(changes.added, vec!["d.rs"]);
656 assert_eq!(changes.deleted, vec!["c.rs"]);
657 }
658
659 #[test]
660 fn test_tree_stats() {
661 let manager = MerkleTreeManager::default();
662
663 let mut tree = HashMap::new();
664 tree.insert("main.rs".to_string(), "hash1".to_string());
665 tree.insert("src/lib.rs".to_string(), "hash2".to_string());
666 tree.insert("src/core/mod.rs".to_string(), "hash3".to_string());
667
668 let stats = manager.get_tree_stats(&tree);
669
670 assert_eq!(stats.total_files, 3);
671 assert!(stats.total_dirs >= 2); }
673
674 #[test]
675 fn test_save_and_load_tree() {
676 let temp_dir = TempDir::new().unwrap();
677 let tree_path = temp_dir.path().join("tree.json");
678
679 let manager = MerkleTreeManager::default();
680
681 let mut tree = HashMap::new();
682 tree.insert("main.rs".to_string(), "hash1".to_string());
683 tree.insert("lib.rs".to_string(), "hash2".to_string());
684
685 manager.save_tree_to_file(&tree, &tree_path).unwrap();
686 let loaded_tree = manager.load_tree_from_file(&tree_path).unwrap();
687
688 assert_eq!(tree, loaded_tree);
689 }
690}