add_determinism/linkdupes/
linkfiles.rs

1/* SPDX-License-Identifier: GPL-3.0-or-later */
2
3use anyhow::{bail, Error, Result};
4use log::{trace, debug, info, warn};
5
6use std::cmp::{min, Ordering};
7use std::hash::{DefaultHasher, Hasher};
8use std::io::Read;
9use std::os::unix::fs::{MetadataExt, PermissionsExt};
10use std::path::{Path, PathBuf};
11use std::cell::RefCell;
12use std::fs;
13
14use super::config::Config;
15#[cfg(feature = "selinux")]
16use super::fcontexts;
17
18#[derive(Debug, Default, PartialEq)]
19pub struct Stats {
20    /// Count of directories that were scanned. This includes both
21    /// command-line arguments and subdirectories found in recursive
22    /// processing.
23    pub directories: u64,
24
25    /// Count of file paths that were scanned. This includes both
26    /// command-line arguments and paths found in recursive
27    /// processing.
28    pub files: u64,
29
30    pub candidate_files: u64,
31
32    /// Count of files that we read or attempted to read
33    pub files_read: u64,
34
35    /// Count of files that we linked
36    pub files_linked: u64,
37
38    /// Count of files that couldn't be processed
39    pub errors: u64,
40
41    /// Summary of sizes of files that were linked
42    pub bytes_linked: u64,
43}
44
45impl Stats {
46    pub fn new() -> Self { Default::default() }
47
48    pub fn summarize(&self) {
49        println!(
50            "Scanned {} directories and {} files,\n    \
51            considered {} files, read {} files, linked {} files, {} errors\n    \
52            sum of sizes of linked files: {} bytes\
53            ",
54            self.directories, self.files,
55            self.candidate_files, self.files_read, self.files_linked, self.errors,
56            self.bytes_linked);
57    }
58}
59
60#[derive(Debug)]
61enum FileState {
62    None,
63    Open(fs::File),
64    Error,
65    Closed,
66}
67
68#[derive(Debug)]
69struct FileInfo {
70    path: PathBuf,
71    metadata: fs::Metadata,
72
73    #[cfg(feature = "selinux")]
74    selinux_context: RefCell<Option<String>>,
75
76    hashes: RefCell<Vec<u64>>,
77    file_state: RefCell<FileState>,
78}
79
80impl FileInfo {
81    fn new(path: PathBuf, metadata: fs::Metadata) -> FileInfo {
82        FileInfo {
83            path,
84            metadata,
85            #[cfg(feature = "selinux")]
86            selinux_context: RefCell::new(None),
87            hashes: RefCell::new(vec![]),
88            file_state: RefCell::new(FileState::None),
89        }
90    }
91
92    #[cfg(feature = "selinux")]
93    fn set_selinux_context(
94        &self,
95        labels: &selinux::label::Labeler<selinux::label::back_end::File>,
96        root: Option<&Path>,
97    ) -> Result<()> {
98
99        let mut context = self.selinux_context.borrow_mut();
100
101        if context.is_none() {
102            let fc = fcontexts::lookup_context(labels, root, &self.path)?;
103            context.replace(fc);
104        }
105
106        Ok(())
107    }
108
109    fn compare(
110        &self,
111        other: &FileInfo,
112        config: &Config,
113    ) -> Ordering {
114        // Return LT, EQ, or GT for the comparison of the two files.
115        // We may fail to read one or both of the files.
116        // In that case, say that they are *not equal*, and the one with
117        // the higher inode number is greater.
118
119        let ms = &self.metadata;
120        let mo = &other.metadata;
121
122        // If files have different size, the contents are different by definition.
123        let mut partial = ms.len().cmp(&mo.len());
124        if partial != Ordering::Equal {
125            trace!("Comparing {} and {} → size={:?}", self.path.display(), other.path.display(), partial);
126            return partial;
127        }
128
129        partial = ms.dev().cmp(&mo.dev());
130        if partial != Ordering::Equal {
131            trace!("Comparing {} and {} → filesystem={:?}", self.path.display(), other.path.display(), partial);
132            return partial;
133        }
134
135        // If files point at the same inode, the contents are equal by definition.
136        let ino_res = ms.ino().cmp(&mo.ino());
137        if ino_res == Ordering::Equal {
138            trace!("Comparing {} and {} → inode={:?}", self.path.display(), other.path.display(), partial);
139            return ino_res;
140        }
141
142        if !config.ignore_mode {
143            partial = ms.permissions().mode().cmp(&mo.permissions().mode());
144            if partial != Ordering::Equal {
145                trace!("Comparing {} and {} → mode={:?}", self.path.display(), other.path.display(), partial);
146                return partial;
147            }
148        }
149
150        if !config.ignore_owner {
151            partial = ms.uid().cmp(&mo.uid());
152            if partial != Ordering::Equal {
153                trace!("Comparing {} and {} → uid={:?}", self.path.display(), other.path.display(), partial);
154                return partial;
155            }
156
157            partial = ms.gid().cmp(&mo.gid());
158            if partial != Ordering::Equal {
159                trace!("Comparing {} and {} → gid={:?}", self.path.display(), other.path.display(), partial);
160                return partial;
161            }
162        }
163
164        if !config.ignore_mtime {
165            // mtime is clamped to $SOURCE_DATE_EPOCH, if set.
166            let mut t1 = ms.modified().expect("query mtime");
167            if let Some(s) = config.source_date_epoch.filter(|s| s < &t1) {
168                t1 = s;
169            }
170
171            let mut t2 = mo.modified().expect("query mtime");
172            if let Some(s) = config.source_date_epoch.filter(|s| s < &t2) {
173                t2 = s;
174            }
175
176            partial = t1.cmp(&t2);
177            if partial != Ordering::Equal {
178                trace!("Comparing {} and {} → mtime={:?}", self.path.display(), other.path.display(), partial);
179                return partial;
180            }
181        }
182
183        // Do SELinux context comparison.
184        // The labels are available iff the check wasn't turned off and files are found.
185        #[cfg(feature = "selinux")]
186        if let Some(labels) = config.selinux_labels.as_ref() {
187            if let Err(e) = self.set_selinux_context(labels, config.root.as_deref()) {
188                return FileInfo::file_error(ino_res, e, config);
189            }
190            if let Err(e) = other.set_selinux_context(labels, config.root.as_deref()) {
191                return FileInfo::file_error(ino_res, e, config);
192            }
193
194            let c1 = self.selinux_context.borrow();
195            let c2 = other.selinux_context.borrow();
196
197            partial = c1.cmp(&c2);
198            if partial != Ordering::Equal {
199                debug!("Comparing {} and {} → {} and {}, fcontext={:?}",
200                       self.path.display(), other.path.display(),
201                       c1.as_deref().unwrap_or("<<none>>"),
202                       c2.as_deref().unwrap_or("<<none>>"),
203                       partial);
204                return partial;
205            }
206        }
207
208        // If the file is empty, we don't need to open it to compare.
209        if ms.len() == 0 {
210            trace!("Comparing {} and {} → size=0, {:?}",
211                   self.path.display(), other.path.display(), Ordering::Equal);
212            return Ordering::Equal;
213        }
214
215        for i in 0.. {
216            let hash1 = match self.get_hash(i) {
217                Err(e) => { return FileInfo::file_error(ino_res, e, config); }
218                Ok(hash) => hash,
219            };
220
221            let hash2 = match other.get_hash(i) {
222                Err(e) => { return FileInfo::file_error(ino_res, e, config); }
223                Ok(hash) => hash,
224            };
225
226            let res = hash1.cmp(&hash2);
227            if res != Ordering::Equal {
228                trace!("Comparing {} and {} → hash{}={:?}",
229                       self.path.display(), other.path.display(), i, partial);
230                return res;
231            }
232
233            if hash1.is_none() && hash2.is_none() {
234                // Both files have been read
235                trace!("Comparing {} and {} → contents={:?}",
236                       self.path.display(), other.path.display(), Ordering::Equal);
237                return Ordering::Equal;
238            }
239        }
240
241        unreachable!();
242    }
243
244    fn file_error(partial: Ordering, _err: Error, config: &Config) -> Ordering {
245        // Either exit the program or return a partial result,
246        // depending on what Config says.
247        if config.fatal_errors {
248            std::process::exit(1);
249        } else {
250            partial
251        }
252    }
253
254    fn hash_chunk_size(previous_chunk_count: usize) -> u64 {
255        4096u64 * 2u64.pow(min(previous_chunk_count, 8) as u32)
256    }
257
258    fn get_hash(&self, index: usize) -> Result<Option<u64>> {
259        if let Some(val) = self.hashes.borrow().get(index) {
260            return Ok(Some(*val));
261        }
262
263        // We always read the partial hashes one by one, so get_hash()
264        // should never jump over an index.
265        assert!(index <= self.hashes.borrow().len());
266
267        self.get_next_hash()
268    }
269
270    fn get_next_hash(&self) -> Result<Option<u64>> {
271        // Calculate the hash for the next range of bytes.
272        // If already at the end of the file, return None.
273
274        // try to calculate the next hash
275        let mut file_state = self.file_state.borrow_mut();
276
277        match *file_state {
278            FileState::None => {
279                // Open file, store the error if encountered.
280                match fs::File::open(&self.path) {
281                    Ok(f) => {
282                        *file_state = FileState::Open(f);
283                    }
284                    Err(e) => {
285                        warn!("{}: open failed: {}", self.path.display(), e);
286                        *file_state = FileState::Error;
287                        return Err(e.into());
288                    }
289                }
290            }
291            FileState::Error => { bail!("{} is unreadable", self.path.display()); }
292            FileState::Closed => { return Ok(None); }
293            _ => {}
294        };
295
296        let file = match *file_state {
297            FileState::Open(ref f) => { f }
298            _ => { panic!() }
299        };
300
301        let mut hashes = self.hashes.borrow_mut();
302        let chunk_size = Self::hash_chunk_size(hashes.len());
303
304        let mut buffer = Vec::new();
305
306        let count = match file.take(chunk_size).read_to_end(&mut buffer) {
307            Ok(count) => count,
308            Err(e) => {
309                warn!("{}: read failed: {}", self.path.display(), e);
310                *file_state = FileState::Error;
311                return Err(e.into());
312            }
313        };
314
315        if (count as u64) < chunk_size {
316            *file_state = FileState::Closed;
317        }
318
319        if count == 0 {
320            return Ok(None);
321        }
322
323        let mut hasher = DefaultHasher::new();
324        hasher.write(&buffer[..count]);
325        let hash = hasher.finish();
326        hashes.push(hash);
327        Ok(Some(hash))
328    }
329}
330
331fn process_file_or_dir(
332    files_seen: &mut Vec<FileInfo>,
333    input_path: &Path,
334    config: &Config,
335    stats: &mut Stats,
336) -> Result<()> {
337
338    for entry in walkdir::WalkDir::new(input_path)
339        .follow_links(false)
340        .into_iter() {
341            let entry = match entry {
342                Err(e) => {
343                    stats.errors += 1;
344                    if config.fatal_errors {
345                        return Err(e.into());
346                    } else {
347                        warn!("Failed to process {}: {}", input_path.display(), e);
348                        continue;
349                    }
350                }
351                Ok(entry) => entry
352            };
353
354            let metadata = match entry.metadata() {
355                Err(e) => {
356                    stats.errors += 1;
357                    if config.fatal_errors {
358                        return Err(e.into());
359                    } else {
360                        warn!("{}: failed to stat: {}", entry.path().display(), e);
361                        continue;
362                    }
363                }
364                Ok(metadata) => metadata
365            };
366
367            if metadata.is_dir() {
368                stats.directories += 1;
369                continue;
370            }
371
372            stats.files += 1;
373
374            if !metadata.is_file() {
375                debug!("{}: not a file", entry.path().display());
376                continue;
377            }
378
379            stats.candidate_files += 1;
380            files_seen.push(FileInfo::new(entry.path().to_path_buf(), metadata));
381        }
382
383    Ok(())
384}
385
386fn link_file(a: &FileInfo, b: &FileInfo, config: &Config) -> Result<bool> {
387    // TODO: what happens if we have files a↔b, c↔d,
388    // and then we link a←c. We should also link a←d.
389
390    if a.metadata.ino() == b.metadata.ino() {
391        debug!("Already linked: {} and {}", a.path.display(), b.path.display());
392        return Ok(false);
393    }
394
395    // Check that b hasn't been modified in the meantime, e.g. by
396    // us under a different name.
397    let md = b.path.symlink_metadata()?;
398    if md.ino() != b.metadata.ino() {
399        debug!("Ignoring changed {}", b.path.display());
400        return Ok(false);
401    }
402
403    if config.dry_run {
404        info!("Would link {} ← {}", a.path.display(), b.path.display());
405    } else {
406        let tmp = b.path.with_file_name(format!(".#.{}.tmp", b.path.file_name().unwrap().to_str().unwrap()));
407        fs::hard_link(&a.path, &tmp)?;
408        if let Err(e) = fs::rename(&tmp, &b.path) {
409            // clean up temporary file
410            if let Err(g) = fs::remove_file(&tmp) {
411                warn!("Removal of temporary file {} failed: {}", tmp.display(), g);
412            };
413            return Err(e.into());
414        }
415
416        info!("Linked {} ← {}", a.path.display(), b.path.display());
417    }
418
419    Ok(true)
420}
421
422fn link_files(
423    files: Vec<FileInfo>,
424    config: &Config,
425    stats: &mut Stats,
426) -> Result<()> {
427    let mut linkto: Option<usize> = None;
428
429    // index is used as a workaround here. I expected .into_iter() to give me
430    // an object that I can put in linkto. But then the compiler says that the
431    // reference outlives the scope. No idea how to go from a reference to the
432    // actual object.
433
434    // We update the statistics on files here. We're iterating over the files
435    // anyway, so we can do that with very little overhead.
436
437    for (n, finfo) in files.iter().enumerate() {
438        if matches!(*finfo.file_state.borrow(), FileState::Error) {
439            stats.errors += 1;
440        }
441
442        if !matches!(*finfo.file_state.borrow(), FileState::None) {
443            stats.files_read += 1;
444        }
445
446        #[allow(clippy::unnecessary_unwrap)]
447        if linkto.is_some() &&
448           FileInfo::compare(&files[linkto.unwrap()], finfo, config) == Ordering::Equal {
449
450            match link_file(&files[linkto.unwrap()], finfo, config) {
451                Ok(res) => {
452                    if res {
453                        stats.files_linked += 1;
454                        // TODO: how to correctly count the case when the linked file was already linked
455                        stats.bytes_linked += finfo.metadata.len();
456                    }
457                }
458                Err(e) => {
459                    if config.fatal_errors {
460                        return Err(e);
461                    } else {
462                        stats.errors += 1;
463                        warn!("{}: failed to link to {}: {}",
464                              files[linkto.unwrap()].path.display(), finfo.path.display(), e);
465                    }
466                }
467            }
468        } else if let FileState::Error = *finfo.file_state.borrow() {
469            trace!("Skipping over {} with error…", finfo.path.display());
470
471        } else {
472            linkto = Some(n);
473        }
474    }
475
476    Ok(())
477}
478
479pub fn process_inputs(config: &Config) -> Result<Stats> {
480    let mut files_seen = vec![];
481    let mut stats = Stats::new();
482
483    for input_path in &config.inputs {
484        process_file_or_dir(&mut files_seen, input_path, config, &mut stats)?;
485    }
486
487    files_seen.sort_by(|a, b| FileInfo::compare(a, b, config));
488
489    link_files(files_seen, config, &mut stats)?;
490
491    Ok(stats)
492}
493
494#[cfg(test)]
495mod tests {
496    use super::*;
497    use std::io::Write;
498
499    #[test]
500    fn compare_metadata() {
501        let mut config = Config::empty();
502
503        let mut file1 = tempfile::NamedTempFile::new().unwrap();
504        let mut file2 = tempfile::NamedTempFile::new().unwrap();
505
506        file1.write(b"0").unwrap();
507        file2.write(b"0").unwrap();
508
509        let ts = file2.as_file().metadata().unwrap().modified().unwrap();
510        file1.as_file().set_modified(ts).unwrap();
511
512        let a = FileInfo {
513            path: file1.path().to_path_buf(),
514            metadata: fs::metadata(file1.path()).unwrap(),
515            #[cfg(feature = "selinux")]
516            selinux_context: RefCell::new(None),
517            hashes: vec![1, 2, 3, 4].into(),
518            file_state: FileState::Closed.into(),
519        };
520
521        let b = FileInfo {
522            path: file2.path().to_path_buf(),
523            metadata: fs::metadata(file2.path()).unwrap(),
524            #[cfg(feature = "selinux")]
525            selinux_context: RefCell::new(None),
526            hashes: vec![1, 2, 3, 4].into(),
527            file_state: FileState::Closed.into(),
528        };
529
530        assert_eq!(a.compare(&b, &config), Ordering::Equal);
531
532        b.hashes.borrow_mut().push(5);
533
534        assert_eq!(a.compare(&b, &config), Ordering::Less);
535
536        a.hashes.borrow_mut().push(6);
537
538        assert_eq!(a.compare(&b, &config), Ordering::Greater);
539
540        let a_again = FileInfo {
541            path: "/a/b/c".into(),
542            metadata: a.metadata.clone(),
543            #[cfg(feature = "selinux")]
544            selinux_context: RefCell::new(None),
545            hashes: vec![].into(),
546            file_state: FileState::None.into(),
547        };
548
549        assert_eq!(a.compare(&a_again, &config), Ordering::Equal);
550
551        // Make mtimes smaller
552        file2.as_file().set_modified(
553            ts + time::Duration::new(-30i64, 123)
554        ).unwrap();
555
556        let mut b_again = FileInfo {
557            path: file2.path().to_path_buf(),
558            metadata: fs::metadata(file2.path()).unwrap(),
559            #[cfg(feature = "selinux")]
560            selinux_context: RefCell::new(None),
561            hashes: a.hashes.borrow().clone().into(),
562            file_state: FileState::Closed.into(),
563        };
564
565        assert_eq!(a.compare(&b_again, &config), Ordering::Greater);
566
567        // Make mtimes larger
568        file2.as_file().set_modified(
569            ts + time::Duration::new(30i64, 123)
570        ).unwrap();
571
572        b_again.metadata = fs::metadata(file2.path()).unwrap();
573
574        assert_eq!(a.compare(&b_again, &config), Ordering::Less);
575
576        // Ignore mtimes
577        config.ignore_mtime = true;
578
579        assert_eq!(a.compare(&b_again, &config), Ordering::Equal);
580
581        // Set $SOURCE_DATE_EPOCH
582        config.ignore_mtime = false;
583        config.source_date_epoch = Some(ts);
584
585        assert_eq!(a.compare(&b_again, &config), Ordering::Equal);
586    }
587
588    #[test]
589    fn compare_different_fs() {
590        let config = Config::empty();
591
592        let a = FileInfo {
593            path: "/dev".into(),
594            metadata: fs::metadata("/dev").unwrap(),
595            #[cfg(feature = "selinux")]
596            selinux_context: RefCell::new(None),
597            hashes: vec![].into(),
598            file_state: FileState::Closed.into(),
599        };
600
601        let b = FileInfo {
602            path: "/proc".into(),
603            metadata: fs::metadata("/proc").unwrap(),
604            #[cfg(feature = "selinux")]
605            selinux_context: RefCell::new(None),
606            hashes: vec![].into(),
607            file_state: FileState::Closed.into(),
608        };
609
610        assert_ne!(a.compare(&b, &config), Ordering::Equal);
611    }
612
613    #[test]
614    #[cfg(feature = "selinux")]
615    fn compare_selinux_contexts() {
616        let labels = match selinux::label::Labeler::new(&[], false) {
617            Err(e) => {
618                info!("Failed to initalize SELinux db: {}", e);
619                return;
620            }
621            Ok(v) => v,
622        };
623
624        let mut config = Config::empty();
625        config.selinux_labels.replace(labels);
626
627        let mut file1 = tempfile::NamedTempFile::new().unwrap();
628        let mut file2 = tempfile::NamedTempFile::new().unwrap();
629
630        file1.write(b"0").unwrap();
631        file2.write(b"0").unwrap();
632
633        let ts = file2.as_file().metadata().unwrap().modified().unwrap();
634        file1.as_file().set_modified(ts).unwrap();
635
636        let a = FileInfo {
637            path: file1.path().to_path_buf(),
638            metadata: fs::metadata(file1.path()).unwrap(),
639            selinux_context: RefCell::new(None),
640            hashes: vec![5, 6, 7].into(),
641            file_state: FileState::Closed.into(),
642        };
643
644        let b = FileInfo {
645            path: file2.path().to_path_buf(),
646            metadata: fs::metadata(file2.path()).unwrap(),
647            selinux_context: RefCell::new(None),
648            hashes: vec![5, 6, 7].into(),
649            file_state: FileState::Closed.into(),
650        };
651
652        assert_eq!(a.compare(&b, &config), Ordering::Equal);
653        a.selinux_context.borrow_mut().replace("aaa".to_owned());
654        assert_eq!(a.compare(&b, &config), Ordering::Greater);
655        b.selinux_context.borrow_mut().replace("bbb".to_owned());
656        assert_eq!(a.compare(&b, &config), Ordering::Less);
657        b.selinux_context.borrow_mut().replace("aaa".to_owned());
658        assert_eq!(a.compare(&b, &config), Ordering::Equal);
659    }
660
661    #[test]
662    fn compare_unreadable() {
663        let mut config = Config::empty();
664        config.ignore_mtime = true;
665
666        let mut file1 = tempfile::NamedTempFile::new().unwrap();
667        let mut file2 = tempfile::NamedTempFile::new().unwrap();
668
669        file1.write(b"0").unwrap();
670        file2.write(b"0").unwrap();
671
672        fs::set_permissions(file1.path(), fs::Permissions::from_mode(0u32)).unwrap();
673        fs::set_permissions(file2.path(), fs::Permissions::from_mode(0u32)).unwrap();
674
675        let a = FileInfo {
676            path: file1.path().to_path_buf(),
677            metadata: fs::metadata(file1.path()).unwrap(),
678            #[cfg(feature = "selinux")]
679            selinux_context: RefCell::new(None),
680            hashes: vec![].into(),
681            file_state: FileState::None.into(),
682        };
683
684        let b = FileInfo {
685            path: file2.path().to_path_buf(),
686            metadata: fs::metadata(file2.path()).unwrap(),
687            #[cfg(feature = "selinux")]
688            selinux_context: RefCell::new(None),
689            hashes: vec![].into(),
690            file_state: FileState::None.into(),
691        };
692
693        let amiroot = fs::metadata("/proc/self/cmdline").unwrap().uid() == 0;
694        let expected = if amiroot {
695            Ordering::Equal
696        } else {
697            a.metadata.ino().cmp(&b.metadata.ino())
698        };
699        assert_eq!(a.compare(&b, &config), expected);
700    }
701
702    #[test]
703    fn compare_contents() {
704        let mut config = Config::empty();
705        config.ignore_mtime = true;
706
707        let mut file1 = tempfile::NamedTempFile::new().unwrap();
708        let mut file2 = tempfile::NamedTempFile::new().unwrap();
709
710        for (size, chunk_count) in vec![(0, 0), (4, 1), (4092, 1), (4096, 2), (4096*9, 4)] {
711            if size > 0 {
712                let data = Vec::from_iter(std::iter::repeat_n(66u8, size));
713                file1.write(&data).unwrap();
714                file1.flush().unwrap();
715                file2.write(&data).unwrap();
716                file2.flush().unwrap();
717            }
718
719            let a = FileInfo {
720                path: file1.path().to_path_buf(),
721                metadata: fs::metadata(file1.path()).unwrap(),
722                #[cfg(feature = "selinux")]
723                selinux_context: RefCell::new(None),
724                hashes: vec![].into(),
725                file_state: FileState::None.into(),
726            };
727
728            let b = FileInfo {
729                path: file2.path().to_path_buf(),
730                metadata: fs::metadata(file2.path()).unwrap(),
731                #[cfg(feature = "selinux")]
732                selinux_context: RefCell::new(None),
733                hashes: vec![].into(),
734                file_state: FileState::None.into(),
735            };
736
737            assert_eq!(a.compare(&b, &config), Ordering::Equal);
738            assert_eq!(a.hashes.borrow().len(), chunk_count);
739            assert_eq!(b.hashes.borrow().len(), chunk_count);
740            assert_eq!(*a.hashes.borrow(), *b.hashes.borrow());
741            let _exp_state = if size > 0 { FileState::Closed } else { FileState::None };
742            assert!(matches!(a.file_state.borrow(), _exp_state));
743            assert!(matches!(b.file_state.borrow(), _exp_state));
744        }
745    }
746}