Skip to main content

compare_dir/
file_comparer.rs

1use crate::{FileHasher, FileItem, OutputFormat, SystemTimeExt};
2use indicatif::FormattedDuration;
3use std::cmp::Ordering;
4use std::fs;
5use std::io::Read;
6use std::path::PathBuf;
7use std::time::SystemTime;
8
/// Where a file was found when the two directories are compared.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Classification {
    /// Present in the first directory and absent from the second.
    OnlyInDir1,
    /// Present in the second directory and absent from the first.
    OnlyInDir2,
    /// Present in both directories.
    InBoth,
}
19
20/// Compares the content of two files.
21pub struct FileComparer<'a> {
22    file1: &'a FileItem,
23    file2: &'a FileItem,
24    pub buffer_size: usize,
25    pub hashers: Option<(&'a FileHasher, &'a FileHasher)>,
26}
27
28impl<'a> FileComparer<'a> {
29    pub const DEFAULT_BUFFER_SIZE_KB: usize = 2 * 1024;
30    pub const DEFAULT_BUFFER_SIZE: usize = Self::DEFAULT_BUFFER_SIZE_KB * 1024;
31
32    pub fn new(file1: &'a FileItem, file2: &'a FileItem) -> Self {
33        Self {
34            file1,
35            file2,
36            buffer_size: Self::DEFAULT_BUFFER_SIZE,
37            hashers: None,
38        }
39    }
40
41    pub fn sizes(&self) -> (u64, u64) {
42        (self.file1.size(), self.file2.size())
43    }
44
45    pub fn modified(&self) -> (std::time::SystemTime, std::time::SystemTime) {
46        (self.file1.modified(), self.file2.modified())
47    }
48
49    pub(crate) fn compare_contents(&self) -> anyhow::Result<bool> {
50        let len1 = self.file1.size();
51        let len2 = self.file2.size();
52        if len1 != len2 {
53            return Ok(false);
54        }
55        if let Some((hasher1, hasher2)) = self.hashers {
56            let (hash1, hash2) = rayon::join(
57                || hasher1.get_hash(self.file1),
58                || hasher2.get_hash(self.file2),
59            );
60            return Ok(hash1? == hash2?);
61        }
62        if len1 == 0 {
63            // Early return after checking the hash, to ensure the hash cache is
64            // updated even if the size is 0.
65            return Ok(true);
66        }
67
68        let start_time = std::time::Instant::now();
69        let mut f1 = fs::File::open(self.file1.path())?;
70        let mut f2 = fs::File::open(self.file2.path())?;
71        if self.buffer_size == 0 {
72            let mmap1 = unsafe { memmap2::MmapOptions::new().map(&f1)? };
73            let mmap2 = unsafe { memmap2::MmapOptions::new().map(&f2)? };
74            let result = mmap1[..] == mmap2[..];
75            log::debug!(
76                "Compared in {}: '{}'",
77                FormattedDuration(start_time.elapsed()),
78                self.file1
79            );
80            return Ok(result);
81        }
82
83        let mut buf1 = vec![0u8; self.buffer_size];
84        let mut buf2 = vec![0u8; self.buffer_size];
85        loop {
86            // Safety from Deadlocks: rayon::join is specifically designed for nested parallelism.
87            // It uses work-stealing, meaning if all threads in the pool are busy, the thread
88            // calling join will just execute both tasks itself.
89            let (n1, n2) = rayon::join(|| f1.read(&mut buf1), || f2.read(&mut buf2));
90            let n1 = n1?;
91            let n2 = n2?;
92            if n1 != n2 || buf1[..n1] != buf2[..n2] {
93                log::debug!(
94                    "Compared in {}: '{}'",
95                    FormattedDuration(start_time.elapsed()),
96                    self.file1
97                );
98                return Ok(false);
99            }
100            if n1 == 0 {
101                log::debug!(
102                    "Compared in {}: '{}'",
103                    FormattedDuration(start_time.elapsed()),
104                    self.file1
105                );
106                return Ok(true);
107            }
108        }
109    }
110}
111
112/// Detailed result of comparing a single file.
113#[derive(Debug, Clone)]
114pub struct FileComparisonResult {
115    /// The path relative to the root of the directories.
116    pub relative_path: PathBuf,
117    /// Whether the file exists in one or both directories.
118    pub classification: Classification,
119    /// Comparison of the last modified time, if applicable.
120    pub modified_time_comparison: Option<Ordering>,
121    /// Comparison of the file size, if applicable.
122    pub size_comparison: Option<Ordering>,
123    /// Whether the content is byte-for-byte identical, if applicable.
124    pub is_content_same: Option<bool>,
125}
126
127impl FileComparisonResult {
128    pub fn new(relative_path: PathBuf, classification: Classification) -> Self {
129        Self {
130            relative_path,
131            classification,
132            modified_time_comparison: None,
133            size_comparison: None,
134            is_content_same: None,
135        }
136    }
137
138    pub fn update(
139        &mut self,
140        comparer: &FileComparer,
141        should_compare_content: bool,
142    ) -> anyhow::Result<()> {
143        let (t1, t2) = comparer.modified();
144        self.modified_time_comparison = Some(t1.cmp(&t2));
145
146        let (s1, s2) = comparer.sizes();
147        self.size_comparison = Some(s1.cmp(&s2));
148
149        if should_compare_content && s1 == s2 {
150            self.is_content_same = Some(comparer.compare_contents()?);
151        }
152        Ok(())
153    }
154
155    pub(crate) fn update_moodified(&mut self, t1: SystemTime, t2: SystemTime) {
156        self.modified_time_comparison = Some(if t1.eq_nearly(t2) {
157            Ordering::Equal
158        } else {
159            t1.cmp(&t2)
160        })
161    }
162
163    pub(crate) fn update_size(&mut self, s1: u64, s2: u64) {
164        self.size_comparison = Some(s1.cmp(&s2));
165    }
166
167    /// True if the two files are identical; i.e., modified times and sizes are
168    /// the same. Contents are the same too, or content comparison was skipped.
169    pub fn is_identical(&self) -> bool {
170        self.classification == Classification::InBoth
171            && self.modified_time_comparison == Some(Ordering::Equal)
172            && self.size_comparison == Some(Ordering::Equal)
173            && self.is_content_same != Some(false)
174    }
175
176    pub(crate) fn is_identical_content(&self) -> Option<bool> {
177        match self.size_comparison {
178            None | Some(Ordering::Equal) => self.is_content_same,
179            _ => Some(false),
180        }
181    }
182
183    pub(crate) fn print(&self, output_format: OutputFormat, dir1_name: &str, dir2_name: &str) {
184        match output_format {
185            OutputFormat::Default => println!(
186                "{}: {}",
187                self.relative_path.display(),
188                self.to_string(dir1_name, dir2_name)
189            ),
190            OutputFormat::Symbol => println!(
191                "{} {}",
192                self.to_symbol_string(),
193                self.relative_path.display()
194            ),
195            _ => unreachable!(),
196        }
197    }
198
199    pub fn to_symbol_string(&self) -> String {
200        String::from_iter([
201            match self.classification {
202                Classification::OnlyInDir1 => '>',
203                Classification::OnlyInDir2 => '<',
204                Classification::InBoth => '=',
205            },
206            match self.modified_time_comparison {
207                None => ' ',
208                Some(Ordering::Greater) => '>',
209                Some(Ordering::Less) => '<',
210                Some(Ordering::Equal) => '=',
211            },
212            match self.size_comparison {
213                None => ' ',
214                Some(Ordering::Greater) => '>',
215                Some(Ordering::Less) => '<',
216                Some(Ordering::Equal) => {
217                    if self.is_content_same == Some(false) {
218                        '!'
219                    } else {
220                        '='
221                    }
222                }
223            },
224        ])
225    }
226
227    pub fn to_string(&self, dir1_name: &str, dir2_name: &str) -> String {
228        let mut parts = Vec::new();
229        match self.classification {
230            Classification::OnlyInDir1 => parts.push(format!("Only in {}", dir1_name)),
231            Classification::OnlyInDir2 => parts.push(format!("Only in {}", dir2_name)),
232            Classification::InBoth => {}
233        }
234        let mut has_equals = false;
235        match self.modified_time_comparison {
236            Some(Ordering::Greater) => parts.push(format!("{} is newer", dir1_name)),
237            Some(Ordering::Less) => parts.push(format!("{} is newer", dir2_name)),
238            Some(Ordering::Equal) => has_equals = true,
239            None => {}
240        }
241        match self.size_comparison {
242            Some(Ordering::Greater) => parts.push(format!("Size of {} is larger", dir1_name)),
243            Some(Ordering::Less) => parts.push(format!("Size of {} is larger", dir2_name)),
244            Some(Ordering::Equal) => has_equals = true,
245            None => {}
246        }
247        match self.is_content_same {
248            Some(false) => parts.push("Contents differ".to_string()),
249            Some(true) => has_equals = true,
250            None => {}
251        }
252
253        if parts.is_empty() {
254            if !has_equals {
255                return "Unknown".to_string();
256            }
257            return "Identical".to_string();
258        }
259        parts.join(", ")
260    }
261}
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes the two contents to a file named "file" in two separate
    /// temporary directories, then checks that every comparison strategy
    /// (buffered, memory-mapped, hashed) reports `expected`.
    fn check_compare(content1: &[u8], content2: &[u8], expected: bool) -> anyhow::Result<()> {
        let dir1 = tempfile::tempdir()?;
        let dir2 = tempfile::tempdir()?;
        let path1 = dir1.path().join("file");
        let path2 = dir2.path().join("file");
        fs::write(&path1, content1)?;
        fs::write(&path2, content2)?;
        let item1 = FileItem::try_from(path1.as_path())?;
        let item2 = FileItem::try_from(path2.as_path())?;

        let mut comparer = FileComparer::new(&item1, &item2);

        // Without hashers: first the buffered reader, then mmap (size 0).
        for buffer_size in [8192, 0] {
            comparer.buffer_size = buffer_size;
            assert_eq!(comparer.compare_contents()?, expected);
        }

        // With hashers.
        let hasher1 = FileHasher::new_with_cache(&[dir1.path()])?;
        let hasher2 = FileHasher::new_with_cache(&[dir2.path()])?;
        comparer.hashers = Some((&hasher1, &hasher2));
        assert_eq!(comparer.compare_contents()?, expected);

        Ok(())
    }

    #[test]
    fn compare_contents_identical() -> anyhow::Result<()> {
        check_compare(b"hello world", b"hello world", true)
    }

    #[test]
    fn compare_contents_different() -> anyhow::Result<()> {
        check_compare(b"hello world", b"hello rust", false)
    }

    #[test]
    fn compare_contents_different_size() -> anyhow::Result<()> {
        check_compare(b"hello world", b"hello", false)
    }

    #[test]
    fn compare_contents_empty_files() -> anyhow::Result<()> {
        check_compare(b"", b"", true)
    }

    /// A freshly created result has nothing recorded yet.
    #[test]
    fn comparison_result_empty() {
        let fresh = FileComparisonResult::new(PathBuf::from("test.txt"), Classification::InBoth);
        assert!(!fresh.is_identical());
        assert_eq!(fresh.to_string("dir1", "dir2"), "Unknown");
        assert_eq!(fresh.to_symbol_string(), "=  ");
    }

    /// Equal time and size with no content comparison counts as identical.
    #[test]
    fn comparison_result_contents_skipped() {
        let skipped = FileComparisonResult {
            modified_time_comparison: Some(Ordering::Equal),
            size_comparison: Some(Ordering::Equal),
            ..FileComparisonResult::new(PathBuf::from("test.txt"), Classification::InBoth)
        };
        assert!(skipped.is_identical());
        assert_eq!(skipped.to_string("dir1", "dir2"), "Identical");
        assert_eq!(skipped.to_symbol_string(), "===");
    }
}