Skip to main content

compare_dir/
lib.rs

1use indicatif::{ProgressBar, ProgressStyle};
2use log::info;
3use rayon::prelude::*;
4use std::cmp::Ordering;
5use std::collections::HashMap;
6use std::fs;
7use std::io::{self, Read};
8use std::path::{Path, PathBuf};
9use std::sync::{Arc, Mutex, mpsc};
10use walkdir::WalkDir;
11
/// How a file is classified during comparison.
///
/// The classification only records in which of the two directory trees a
/// relative path was found; whether two copies actually match is tracked
/// separately by the comparison fields of `FileComparisonResult`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Classification {
    /// File exists only in the first directory.
    OnlyInDir1,
    /// File exists only in the second directory.
    OnlyInDir2,
    /// File exists in both directories.
    InBoth,
}
22
23/// Detailed result of comparing a single file.
24#[derive(Debug, Clone)]
25pub struct FileComparisonResult {
26    /// The path relative to the root of the directories.
27    pub relative_path: PathBuf,
28    /// Whether the file exists in one or both directories.
29    pub classification: Classification,
30    /// Comparison of the last modified time, if applicable.
31    pub modified_time_comparison: Option<Ordering>,
32    /// Comparison of the file size, if applicable.
33    pub size_comparison: Option<Ordering>,
34    /// Whether the content is byte-for-byte identical, if applicable.
35    pub is_content_same: Option<bool>,
36}
37
38impl FileComparisonResult {
39    pub fn new(relative_path: PathBuf, classification: Classification) -> Self {
40        Self {
41            relative_path,
42            classification,
43            modified_time_comparison: None,
44            size_comparison: None,
45            is_content_same: None,
46        }
47    }
48
49    fn update(&mut self, path1: &Path, path2: &Path, buffer_size: usize) -> anyhow::Result<()> {
50        let m1 = fs::metadata(path1)?;
51        let m2 = fs::metadata(path2)?;
52        let t1 = m1.modified()?;
53        let t2 = m2.modified()?;
54        self.modified_time_comparison = Some(t1.cmp(&t2));
55
56        let s1 = m1.len();
57        let s2 = m2.len();
58        self.size_comparison = Some(s1.cmp(&s2));
59
60        if s1 == s2 {
61            log::info!("Comparing content: {:?}", self.relative_path);
62            self.is_content_same = Some(Self::compare_contents(path1, path2, buffer_size)?);
63        }
64        Ok(())
65    }
66
67    fn compare_contents(path1: &Path, path2: &Path, buffer_size: usize) -> io::Result<bool> {
68        let mut f1 = fs::File::open(path1)?;
69        let mut f2 = fs::File::open(path2)?;
70
71        let mut buf1 = vec![0u8; buffer_size];
72        let mut buf2 = vec![0u8; buffer_size];
73
74        loop {
75            // Safety from Deadlocks: rayon::join is specifically designed for nested parallelism.
76            // It uses work-stealing, meaning if all threads in the pool are busy, the thread
77            // calling join will just execute both tasks itself.
78            let (n1, n2) = rayon::join(|| f1.read(&mut buf1), || f2.read(&mut buf2));
79            let n1 = n1?;
80            let n2 = n2?;
81
82            if n1 != n2 || buf1[..n1] != buf2[..n2] {
83                return Ok(false);
84            }
85
86            if n1 == 0 {
87                return Ok(true);
88            }
89        }
90    }
91
92    pub fn is_identical(&self) -> bool {
93        self.classification == Classification::InBoth
94            && self.modified_time_comparison == Some(Ordering::Equal)
95            && self.size_comparison == Some(Ordering::Equal)
96            && self.is_content_same == Some(true)
97    }
98
99    pub fn to_string(&self, dir1_name: &str, dir2_name: &str) -> String {
100        let mut parts = Vec::new();
101        match self.classification {
102            Classification::OnlyInDir1 => parts.push(format!("Only in {}", dir1_name)),
103            Classification::OnlyInDir2 => parts.push(format!("Only in {}", dir2_name)),
104            Classification::InBoth => {}
105        }
106
107        if let Some(comp) = &self.modified_time_comparison {
108            match comp {
109                Ordering::Greater => parts.push(format!("{} is newer", dir1_name)),
110                Ordering::Less => parts.push(format!("{} is newer", dir2_name)),
111                Ordering::Equal => {}
112            }
113        }
114
115        if let Some(comp) = &self.size_comparison {
116            match comp {
117                Ordering::Greater => parts.push(format!("Size of {} is larger", dir1_name)),
118                Ordering::Less => parts.push(format!("Size of {} is larger", dir2_name)),
119                Ordering::Equal => {}
120            }
121        }
122
123        if let Some(same) = self.is_content_same
124            && !same
125        {
126            parts.push("Content differ".to_string());
127        }
128
129        format!("{}: {}", self.relative_path.display(), parts.join(", "))
130    }
131}
132
/// Running totals over a set of per-file comparison results.
///
/// All counters start at zero (`Default`). The type is `Debug`, `Clone` and
/// comparable by value so summaries can be logged and asserted on.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct ComparisonSummary {
    /// Files present in both directories.
    pub in_both: usize,
    /// Files present only in the first directory.
    pub only_in_dir1: usize,
    /// Files present only in the second directory.
    pub only_in_dir2: usize,
    /// Files in both directories whose copy in the first directory is newer.
    pub dir1_newer: usize,
    /// Files in both directories whose copy in the second directory is newer.
    pub dir2_newer: usize,
    /// Files in both directories with the same time but different sizes.
    pub same_time_diff_size: usize,
    /// Files in both directories with the same time and size but different
    /// content.
    pub same_time_size_diff_content: usize,
}
143
144impl ComparisonSummary {
145    pub fn update(&mut self, result: &FileComparisonResult) {
146        match result.classification {
147            Classification::OnlyInDir1 => self.only_in_dir1 += 1,
148            Classification::OnlyInDir2 => self.only_in_dir2 += 1,
149            Classification::InBoth => {
150                self.in_both += 1;
151                match result.modified_time_comparison {
152                    Some(Ordering::Greater) => self.dir1_newer += 1,
153                    Some(Ordering::Less) => self.dir2_newer += 1,
154                    _ => {
155                        if result.size_comparison != Some(Ordering::Equal) {
156                            self.same_time_diff_size += 1;
157                        } else if result.is_content_same == Some(false) {
158                            self.same_time_size_diff_content += 1;
159                        }
160                    }
161                }
162            }
163        }
164    }
165
166    pub fn print(&self, dir1_name: &str, dir2_name: &str) {
167        println!("Files in both: {}", self.in_both);
168        println!("Files only in {}: {}", dir1_name, self.only_in_dir1);
169        println!("Files only in {}: {}", dir2_name, self.only_in_dir2);
170        println!(
171            "Files in both ({} is newer): {}",
172            dir1_name, self.dir1_newer
173        );
174        println!(
175            "Files in both ({} is newer): {}",
176            dir2_name, self.dir2_newer
177        );
178        println!(
179            "Files in both (same time, different size): {}",
180            self.same_time_diff_size
181        );
182        println!(
183            "Files in both (same time and size, different content): {}",
184            self.same_time_size_diff_content
185        );
186    }
187}
188
/// A tool for comparing the contents of two directories.
///
/// Cloning is cheap and clones share the same `total_files` counter (it is
/// held behind an `Arc`), so a value written through one clone is visible
/// through the others.
#[derive(Debug, Clone)]
pub struct DirectoryComparer {
    /// Root of the first directory tree.
    dir1: PathBuf,
    /// Root of the second directory tree.
    dir2: PathBuf,
    /// Number of distinct relative paths found in the two trees; shared
    /// between clones.
    total_files: Arc<Mutex<usize>>,
    /// Size in bytes of each read buffer used when comparing file contents.
    buffer_size: usize,
}
197
198impl DirectoryComparer {
199    /// Creates a new `DirectoryComparer` for the two given directories.
200    pub fn new(dir1: PathBuf, dir2: PathBuf) -> Self {
201        Self {
202            dir1,
203            dir2,
204            total_files: Arc::new(Mutex::new(0)),
205            buffer_size: 64 * 1024,
206        }
207    }
208
209    /// Sets the buffer size for file comparison in bytes.
210    pub fn set_buffer_size(&mut self, size: usize) {
211        self.buffer_size = size;
212    }
213
214    /// Sets the maximum number of threads for parallel processing.
215    /// This initializes the global Rayon thread pool.
216    pub fn set_max_threads(parallel: usize) -> anyhow::Result<()> {
217        rayon::ThreadPoolBuilder::new()
218            .num_threads(parallel)
219            .build_global()
220            .map_err(|e| anyhow::anyhow!("Failed to initialize thread pool: {}", e))?;
221        Ok(())
222    }
223
224    /// Executes the directory comparison and prints results to stdout.
225    /// This is a convenience method for CLI usage.
226    pub fn run(&self) -> anyhow::Result<()> {
227        let pb = ProgressBar::new_spinner();
228        pb.enable_steady_tick(std::time::Duration::from_millis(120));
229        pb.set_style(
230            ProgressStyle::with_template("{spinner:.green} [{elapsed_precise}] {msg}").unwrap(),
231        );
232        pb.set_message("Scanning directories...");
233
234        let start_time = std::time::Instant::now();
235        let mut summary = ComparisonSummary::default();
236        let dir1_str = self.dir1.to_str().unwrap_or("dir1");
237        let dir2_str = self.dir2.to_str().unwrap_or("dir2");
238
239        let (tx, rx) = mpsc::channel();
240        let comparer = self.clone();
241
242        std::thread::scope(|s| {
243            s.spawn(move || {
244                if let Err(e) = comparer.compare_streaming(tx) {
245                    log::error!("Error during comparison: {}", e);
246                }
247            });
248
249            // Receive results and update summary/UI
250            let mut length_set = false;
251            while let Ok(result) = rx.recv() {
252                if !length_set {
253                    let total_files = *self.total_files.lock().unwrap();
254                    if total_files > 0 {
255                        pb.set_length(total_files as u64);
256                        pb.set_style(
257                            ProgressStyle::with_template(
258                                "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} ({percent}%) {msg}",
259                            )
260                            .unwrap(),
261                        );
262                        pb.set_message("");
263                        length_set = true;
264                    }
265                }
266                summary.update(&result);
267                if !result.is_identical() {
268                    pb.suspend(|| {
269                        println!("{}", result.to_string(dir1_str, dir2_str));
270                    });
271                }
272                pb.inc(1);
273            }
274        });
275
276        pb.finish_and_clear();
277
278        eprintln!("\n--- Comparison Summary ---");
279        summary.print(dir1_str, dir2_str);
280        eprintln!("Comparison finished in {:?}.", start_time.elapsed());
281        Ok(())
282    }
283
284    fn get_files(dir: &Path) -> anyhow::Result<HashMap<PathBuf, PathBuf>> {
285        let mut files = HashMap::new();
286        for entry in WalkDir::new(dir).into_iter().filter_map(|e| e.ok()) {
287            if entry.file_type().is_file() {
288                let rel_path = entry.path().strip_prefix(dir)?.to_path_buf();
289                files.insert(rel_path, entry.path().to_path_buf());
290            }
291        }
292        Ok(files)
293    }
294
295    /// Performs the directory comparison and streams results via a channel.
296    ///
297    /// # Arguments
298    /// * `tx` - A sender to transmit `FileComparisonResult` as they are computed.
299    fn compare_streaming(&self, tx: mpsc::Sender<FileComparisonResult>) -> anyhow::Result<()> {
300        let (tx_unordered, rx_unordered) = mpsc::channel();
301        let comparer = self.clone();
302
303        std::thread::scope(|s| {
304            s.spawn(move || {
305                if let Err(e) = comparer.compare_unordered_streaming(tx_unordered) {
306                    log::error!("Error during unordered comparison: {}", e);
307                }
308            });
309
310            let mut buffer = HashMap::new();
311            let mut next_index = 0;
312            let mut total_len: Option<usize> = None;
313
314            while total_len.is_none() || next_index < total_len.unwrap() {
315                match rx_unordered.recv() {
316                    Ok((i, result)) => {
317                        if total_len.is_none() {
318                            total_len = Some(*self.total_files.lock().unwrap());
319                        }
320
321                        if i == next_index {
322                            if tx.send(result).is_err() {
323                                break; // Main receiver disconnected
324                            }
325                            next_index += 1;
326                            while let Some(result) = buffer.remove(&next_index) {
327                                if tx.send(result).is_err() {
328                                    break;
329                                }
330                                next_index += 1;
331                            }
332                        } else {
333                            buffer.insert(i, result);
334                        }
335                    }
336                    Err(_) => {
337                        // Channel closed, producer is done.
338                        break;
339                    }
340                }
341            }
342        });
343
344        Ok(())
345    }
346
347    fn compare_unordered_streaming(
348        &self,
349        tx: mpsc::Sender<(usize, FileComparisonResult)>,
350    ) -> anyhow::Result<()> {
351        let (dir1_files, dir2_files) = rayon::join(
352            || {
353                info!("Scanning directory: {:?}", self.dir1);
354                Self::get_files(&self.dir1)
355            },
356            || {
357                info!("Scanning directory: {:?}", self.dir2);
358                Self::get_files(&self.dir2)
359            },
360        );
361        let dir1_files = dir1_files?;
362        let dir2_files = dir2_files?;
363
364        let mut all_rel_paths: Vec<_> = dir1_files
365            .keys()
366            .cloned()
367            .chain(dir2_files.keys().cloned())
368            .collect();
369        all_rel_paths.sort();
370        all_rel_paths.dedup();
371
372        *self.total_files.lock().unwrap() = all_rel_paths.len();
373
374        all_rel_paths
375            .into_par_iter()
376            .enumerate()
377            .for_each(|(i, rel_path)| {
378                let in_dir1 = dir1_files.get(&rel_path);
379                let in_dir2 = dir2_files.get(&rel_path);
380
381                let result = match (in_dir1, in_dir2) {
382                    (Some(_), None) => {
383                        FileComparisonResult::new(rel_path.clone(), Classification::OnlyInDir1)
384                    }
385                    (None, Some(_)) => {
386                        FileComparisonResult::new(rel_path.clone(), Classification::OnlyInDir2)
387                    }
388                    (Some(path1), Some(path2)) => {
389                        let mut result =
390                            FileComparisonResult::new(rel_path.clone(), Classification::InBoth);
391                        if let Err(error) = result.update(path1, path2, self.buffer_size) {
392                            log::error!("Error during comparison of {:?}: {}", rel_path, error);
393                        }
394                        result
395                    }
396                    (None, None) => unreachable!(),
397                };
398                if tx.send((i, result)).is_err() {
399                    log::error!("Receiver dropped, stopping comparison of {:?}", rel_path);
400                }
401            });
402        Ok(())
403    }
404}
405
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Creates a temporary file holding exactly `contents`.
    fn temp_file_with(contents: &[u8]) -> io::Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        file.write_all(contents)?;
        Ok(file)
    }

    /// Writes both byte strings to temporary files and compares them with
    /// the 8 KiB buffer shared by all content tests.
    fn contents_match(left: &[u8], right: &[u8]) -> io::Result<bool> {
        let left = temp_file_with(left)?;
        let right = temp_file_with(right)?;
        FileComparisonResult::compare_contents(left.path(), right.path(), 8192)
    }

    #[test]
    fn test_compare_contents_identical() -> io::Result<()> {
        assert!(contents_match(b"hello world", b"hello world")?);
        Ok(())
    }

    #[test]
    fn test_compare_contents_different() -> io::Result<()> {
        assert!(!contents_match(b"hello world", b"hello rust")?);
        Ok(())
    }

    #[test]
    fn test_compare_contents_different_size() -> io::Result<()> {
        // Callers only compare contents of equally sized files, but a length
        // mismatch must still be reported as a difference.
        assert!(!contents_match(b"hello world", b"hello")?);
        Ok(())
    }

    #[test]
    fn test_comparison_summary() {
        let only_first = FileComparisonResult::new(PathBuf::from("a"), Classification::OnlyInDir1);
        let only_second = FileComparisonResult::new(PathBuf::from("b"), Classification::OnlyInDir2);
        let mut newer_in_first =
            FileComparisonResult::new(PathBuf::from("c"), Classification::InBoth);
        newer_in_first.modified_time_comparison = Some(Ordering::Greater);

        let mut summary = ComparisonSummary::default();
        for result in [&only_first, &only_second, &newer_in_first] {
            summary.update(result);
        }

        assert_eq!(summary.only_in_dir1, 1);
        assert_eq!(summary.only_in_dir2, 1);
        assert_eq!(summary.in_both, 1);
        assert_eq!(summary.dir1_newer, 1);
    }

    #[test]
    fn test_directory_comparer_integration() -> anyhow::Result<()> {
        let dir1 = tempfile::tempdir()?;
        let dir2 = tempfile::tempdir()?;

        // One identical pair, one file unique to each side, and one pair
        // that differs in both length and content.
        let fixtures = [
            (dir1.path(), "same.txt", &b"same content"[..]),
            (dir1.path(), "only1.txt", &b"only in dir1"[..]),
            (dir2.path(), "same.txt", &b"same content"[..]),
            (dir2.path(), "only2.txt", &b"only in dir2"[..]),
            (dir1.path(), "diff.txt", &b"content 1"[..]),
            (dir2.path(), "diff.txt", &b"content 222"[..]),
        ];
        for (root, name, contents) in fixtures {
            fs::write(root.join(name), contents)?;
        }

        let comparer = DirectoryComparer::new(dir1.path().to_path_buf(), dir2.path().to_path_buf());
        let (tx, rx) = mpsc::channel();

        comparer.compare_streaming(tx)?;

        // The sender was consumed above, so the iterator ends once drained.
        let mut results: Vec<FileComparisonResult> = rx.iter().collect();
        results.sort_by(|left, right| left.relative_path.cmp(&right.relative_path));

        assert_eq!(results.len(), 4);

        let name_of = |index: usize| results[index].relative_path.to_str().unwrap();

        // diff.txt
        let diff = &results[0];
        assert_eq!(name_of(0), "diff.txt");
        assert_eq!(diff.classification, Classification::InBoth);
        assert!(
            diff.is_content_same == Some(false) || diff.size_comparison != Some(Ordering::Equal)
        );

        // only1.txt
        assert_eq!(name_of(1), "only1.txt");
        assert_eq!(results[1].classification, Classification::OnlyInDir1);

        // only2.txt
        assert_eq!(name_of(2), "only2.txt");
        assert_eq!(results[2].classification, Classification::OnlyInDir2);

        // same.txt
        let same = &results[3];
        assert_eq!(name_of(3), "same.txt");
        assert_eq!(same.classification, Classification::InBoth);
        assert_eq!(same.size_comparison, Some(Ordering::Equal));

        Ok(())
    }
}