fclones/
dedupe.rs

//! Removing redundant files.

use std::cmp::{max, min, Reverse};
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::hash::{Hash, Hasher};
use std::io::{ErrorKind, Write};
use std::ops::{Add, AddAssign};
use std::sync::mpsc::channel;
use std::sync::Arc;
use std::time::SystemTime;
use std::{fmt, fs, io};

use chrono::{DateTime, FixedOffset, Local};
use priority_queue::PriorityQueue;
use rand::distributions::Alphanumeric;
use rand::Rng;
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};

use crate::config::{DedupeConfig, Priority};
use crate::device::DiskDevices;
use crate::file::{FileId, FileLen, FileMetadata};
use crate::group::{FileGroup, FileSubGroup};
use crate::lock::FileLock;
use crate::log::{Log, LogExt};
use crate::path::Path;
use crate::util::{max_result, min_result, try_sort_by_key};
use crate::{Error, TIMESTAMP_FMT};

/// Defines what to do with redundant files
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum DedupeOp {
    /// Removes redundant files.
    Remove,
    /// Moves redundant files to a different directory.
    Move(Arc<Path>),
    /// Replaces redundant files with soft-links (ln -s on Unix).
    SymbolicLink,
    /// Replaces redundant files with hard-links (ln on Unix).
    HardLink,
    /// Replaces redundant files with reflinks (cp --reflink=always, supported only by some filesystems).
    RefLink,
}

/// Convenience struct for holding a path to a file and its metadata together
#[derive(Clone, Debug)]
pub struct PathAndMetadata {
    pub path: Path,
    pub metadata: FileMetadata,
}

impl PathAndMetadata {
    pub fn new(path: Path) -> io::Result<PathAndMetadata> {
        let metadata = FileMetadata::new(&path).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!("Failed to read metadata of {}: {}", path.display(), e),
            )
        })?;
        Ok(PathAndMetadata { metadata, path })
    }
}

impl AsRef<PathAndMetadata> for PathAndMetadata {
    fn as_ref(&self) -> &PathAndMetadata {
        self
    }
}

impl AsRef<Path> for PathAndMetadata {
    fn as_ref(&self) -> &Path {
        &self.path
    }
}

impl AsRef<FileId> for PathAndMetadata {
    fn as_ref(&self) -> &FileId {
        self.metadata.as_ref()
    }
}

impl From<PathAndMetadata> for Path {
    fn from(value: PathAndMetadata) -> Self {
        value.path
    }
}

impl Display for PathAndMetadata {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.pad(self.path.display().as_str())
    }
}

/// Portable abstraction for commands used to remove duplicates
#[derive(Debug)]
pub enum FsCommand {
    Remove {
        file: PathAndMetadata,
    },
    Move {
        source: PathAndMetadata,
        target: Path,
        use_rename: bool, // try to move the file directly by issuing an fs rename command
    },
    SoftLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
    HardLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
    RefLink {
        target: Arc<PathAndMetadata>,
        link: PathAndMetadata,
    },
}

impl FsCommand {
    /// Obtains a lock on the file if `lock == true`.
    fn maybe_lock(path: &Path, lock: bool) -> io::Result<Option<FileLock>> {
        if lock {
            match FileLock::new(path) {
                Ok(lock) => Ok(Some(lock)),
                Err(e) if e.kind() == ErrorKind::Unsupported => Ok(None),
                Err(e) => Err(e),
            }
        } else {
            Ok(None)
        }
    }

    pub fn remove(path: &Path) -> io::Result<()> {
        fs::remove_file(path.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!("Failed to remove file {}: {}", path.display(), e),
            )
        })
    }

    #[cfg(unix)]
    fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> {
        std::os::unix::fs::symlink(target, link)
    }

    #[cfg(windows)]
    fn symlink_internal(target: &std::path::Path, link: &std::path::Path) -> io::Result<()> {
        std::os::windows::fs::symlink_file(target, link)
    }

    fn symlink(target: &Path, link: &Path) -> io::Result<()> {
        Self::symlink_internal(&target.to_path_buf(), &link.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!(
                    "Failed to create symbolic link {} -> {}: {}",
                    link.display(),
                    target.display(),
                    e
                ),
            )
        })
    }

    fn hardlink(target: &Path, link: &Path) -> io::Result<()> {
        fs::hard_link(target.to_path_buf(), link.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!(
                    "Failed to create hard link {} -> {}: {}",
                    link.display(),
                    target.display(),
                    e
                ),
            )
        })
    }

    fn check_can_rename(source: &Path, target: &Path) -> io::Result<()> {
        if target.to_path_buf().exists() {
            return Err(io::Error::new(
                ErrorKind::AlreadyExists,
                format!(
                    "Cannot move {} to {}: Target already exists",
                    source.display(),
                    target.display()
                ),
            ));
        }
        Ok(())
    }

    fn mkdirs(path: &Path) -> io::Result<()> {
        fs::create_dir_all(path.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!("Failed to create directory {}: {}", path.display(), e),
            )
        })
    }

    /// Renames/moves a file from one location to another.
    /// If the target exists, it will be overwritten.
    pub fn unsafe_rename(source: &Path, target: &Path) -> io::Result<()> {
        fs::rename(source.to_path_buf(), target.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!(
                    "Failed to rename file from {} to {}: {}",
                    source.display(),
                    target.display(),
                    e
                ),
            )
        })
    }

    /// Copies a file from one location to another.
    /// If the target exists, it will be overwritten.
    fn unsafe_copy(source: &Path, target: &Path) -> io::Result<()> {
        fs::copy(source.to_path_buf(), target.to_path_buf()).map_err(|e| {
            io::Error::new(
                e.kind(),
                format!(
                    "Failed to copy file from {} to {}: {}",
                    source.display(),
                    target.display(),
                    e
                ),
            )
        })?;
        Ok(())
    }

    /// Moves the file from one location to another with a single `fs::rename` call.
    /// Fails if the target exists.
    fn move_rename(source: &Path, target: &Path) -> io::Result<()> {
        Self::check_can_rename(source, target)?;
        Self::mkdirs(target.parent().unwrap())?;
        Self::unsafe_rename(source, target)?;
        Ok(())
    }

    /// Moves the file by first copying it to another location and then removing the original.
    /// Fails if the target exists.
    fn move_copy(source: &Path, target: &Path) -> io::Result<()> {
        Self::check_can_rename(source, target)?;
        Self::mkdirs(target.parent().unwrap())?;
        Self::unsafe_copy(source, target)?;
        Self::remove(source)?;
        Ok(())
    }

    /// Returns a random temporary file name in the same directory, guaranteed not to collide
    /// with any other file in the same directory.
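    ///
    /// # Example
    ///
    /// A small usage sketch (added here for illustration, mirroring the unit test below):
    ///
    /// ```ignore
    /// let path = Path::from("/foo/bar");
    /// let tmp = FsCommand::temp_file(&path);
    /// assert_eq!(path.parent(), tmp.parent());
    /// assert_ne!(path, tmp);
    /// ```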
    pub fn temp_file(path: &Path) -> Path {
        let mut name = path
            .file_name()
            .expect("must be a regular file with a name");
        name.push(".");
        name.push(
            rand::thread_rng()
                .sample_iter(&Alphanumeric)
                .take(24)
                .map(char::from)
                .collect::<String>(),
        );
        match path.parent() {
            Some(parent) => parent.join(Path::from(name)),
            None => Path::from(name),
        }
    }

    /// Safely moves the file to a different location and invokes the function.
    /// If the function fails, moves the file back to the original location.
    /// If the function succeeds, removes the file permanently.
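    ///
    /// A usage sketch (added for illustration; the path and the closure body are placeholders):
    ///
    /// ```ignore
    /// FsCommand::safe_remove(&path, |p| {
    ///     // Recreate `p` in any way; if this fails, the original file is moved back.
    ///     std::fs::write(p.to_path_buf(), b"replacement contents")
    /// }, &log)?;
    /// ```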
    pub fn safe_remove<R>(
        path: &Path,
        f: impl FnOnce(&Path) -> io::Result<R>,
        log: &dyn Log,
    ) -> io::Result<R> {
        let tmp = Self::temp_file(path);
        Self::unsafe_rename(path, &tmp)?;
        let result = match f(path) {
            Ok(result) => result,
            Err(e) => {
                // Try to undo the move if possible
                if let Err(remove_err) = Self::unsafe_rename(&tmp, path) {
                    log.warn(format!(
                        "Failed to undo move from {} to {}: {}",
                        &path.display(),
                        &tmp.display(),
                        remove_err
                    ))
                }
                return Err(e);
            }
        };
        // Cleanup the temp file.
        if let Err(e) = Self::remove(&tmp) {
            log.warn(format!(
                "Failed to remove temporary {}: {}",
                &tmp.display(),
                e
            ))
        }
        Ok(result)
    }

    /// Executes the command and returns the number of bytes reclaimed
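    ///
    /// A minimal sketch (added for illustration; the path is a placeholder and `log`
    /// is assumed to implement the crate's `Log` trait):
    ///
    /// ```ignore
    /// let file = PathAndMetadata::new(Path::from("/tmp/redundant.bin"))?;
    /// let reclaimed = FsCommand::Remove { file }.execute(true, &log)?;
    /// ```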
    pub fn execute(&self, should_lock: bool, log: &dyn Log) -> io::Result<FileLen> {
        match self {
            FsCommand::Remove { file } => {
                let _ = Self::maybe_lock(&file.path, should_lock)?;
                Self::remove(&file.path)?;
                Ok(file.metadata.len())
            }
            FsCommand::SoftLink { target, link } => {
                let _ = Self::maybe_lock(&link.path, should_lock)?;
                Self::safe_remove(&link.path, |link| Self::symlink(&target.path, link), log)?;
                Ok(link.metadata.len())
            }
            FsCommand::HardLink { target, link } => {
                let _ = Self::maybe_lock(&link.path, should_lock)?;
                Self::safe_remove(&link.path, |link| Self::hardlink(&target.path, link), log)?;
                Ok(link.metadata.len())
            }
            FsCommand::RefLink { target, link } => {
                let _ = Self::maybe_lock(&link.path, should_lock)?;
                crate::reflink::reflink(target, link, log)?;
                Ok(link.metadata.len())
            }
            FsCommand::Move {
                source,
                target,
                use_rename,
            } => {
                let _ = Self::maybe_lock(&source.path, should_lock)?;
                let len = source.metadata.len();
                if *use_rename && Self::move_rename(&source.path, target).is_ok() {
                    return Ok(len);
                }
                Self::move_copy(&source.path, target)?;
                Ok(len)
            }
        }
    }

    /// Returns the path to be affected by running the command.
    /// For commands that move or remove the file, it returns the path to the file before removal.
    /// For commands that create a link, returns the path to the file that will be replaced by the link.
    pub fn file_to_remove(&self) -> &Path {
        match self {
            FsCommand::Remove { file, .. }
            | FsCommand::SoftLink { link: file, .. }
            | FsCommand::HardLink { link: file, .. }
            | FsCommand::RefLink { link: file, .. }
            | FsCommand::Move { source: file, .. } => &file.path,
        }
    }

    /// Returns how much disk space running this command would reclaim
    pub fn space_to_reclaim(&self) -> FileLen {
        match self {
            FsCommand::Remove { file, .. }
            | FsCommand::SoftLink { link: file, .. }
            | FsCommand::HardLink { link: file, .. }
            | FsCommand::RefLink { link: file, .. }
            | FsCommand::Move { source: file, .. } => file.metadata.len(),
        }
    }

    /// Formats the command as a string that can be pasted to a Unix shell (e.g. bash)
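    ///
    /// Illustrative output for a `SoftLink` command (added here for clarity; the temporary
    /// file suffix is random):
    ///
    /// ```text
    /// mv /dups/b.txt /dups/b.txt.<random>
    /// ln -s /data/a.txt /dups/b.txt
    /// rm /dups/b.txt.<random>
    /// ```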
    #[cfg(unix)]
    pub fn to_shell_str(&self) -> Vec<String> {
        let mut result = Vec::new();
        match self {
            FsCommand::Remove { file, .. } => {
                let path = file.path.quote();
                result.push(format!("rm {path}"));
            }
            FsCommand::SoftLink { target, link, .. } => {
                let tmp = Self::temp_file(&link.path);
                let target = target.path.quote();
                let link = link.path.quote();
                result.push(format!("mv {} {}", link, tmp.quote()));
                result.push(format!("ln -s {target} {link}"));
                result.push(format!("rm {}", tmp.quote()));
            }
            FsCommand::HardLink { target, link, .. } => {
                let tmp = Self::temp_file(&link.path);
                let target = target.path.quote();
                let link = link.path.quote();
                result.push(format!("mv {} {}", link, tmp.quote()));
                result.push(format!("ln {target} {link}"));
                result.push(format!("rm {}", tmp.quote()));
            }
            FsCommand::RefLink { target, link, .. } => {
                let tmp = Self::temp_file(&link.path);
                let target = target.path.quote();
                let link = link.path.quote();
                // Not exactly what happens on Linux; there the `mv` is also a reflink.
                result.push(format!("mv {} {}", link, tmp.quote()));
                if cfg!(target_os = "macos") {
                    result.push(format!("cp -c {target} {link}"));
                } else {
                    result.push(format!("cp --reflink=always {target} {link}"));
                };
                result.push(format!("rm {}", tmp.quote()));
            }
            FsCommand::Move {
                source,
                target,
                use_rename,
            } => {
                let source = source.path.quote();
                let target = target.quote();
                if *use_rename {
                    result.push(format!("mv {} {}", &source, &target));
                } else {
                    result.push(format!("cp {} {}", &source, &target));
                    result.push(format!("rm {}", &source));
                }
            }
        }
        result
    }

    #[cfg(windows)]
    pub fn to_shell_str(&self) -> Vec<String> {
        let mut result = Vec::new();
        match self {
            FsCommand::Remove { file, .. } => {
                let path = file.path.quote();
                result.push(format!("del {}", path));
            }
            FsCommand::SoftLink { target, link, .. } => {
                let tmp = Self::temp_file(&link.path);
                let target = target.path.quote();
                let link = link.path.quote();
                result.push(format!("move {} {}", link, tmp.quote()));
                result.push(format!("mklink {} {}", target, link));
                result.push(format!("del {}", tmp.quote()));
            }
            FsCommand::HardLink { target, link, .. } => {
                let tmp = Self::temp_file(&link.path);
                let target = target.path.quote();
                let link = link.path.quote();
                result.push(format!("move {} {}", link, tmp.quote()));
                result.push(format!("mklink /H {} {}", target, link));
                result.push(format!("del {}", tmp.quote()));
            }
            FsCommand::RefLink { target, link, .. } => {
                result.push(format!(":: deduplicate {} {}", link, target));
            }
            FsCommand::Move {
                source,
                target,
                use_rename,
            } => {
                let source = source.path.quote();
                let target = target.quote();
                if *use_rename {
                    result.push(format!("move {} {}", &source, &target));
                } else {
                    result.push(format!("copy {} {}", &source, &target));
                    result.push(format!("del {}", &source));
                }
            }
        }
        result
    }
}

/// Provides information about the number of deduplicated files and reclaimed disk space
#[derive(Default)]
pub struct DedupeResult {
    pub processed_count: u64,
    pub reclaimed_space: FileLen,
}

impl Add<DedupeResult> for DedupeResult {
    type Output = DedupeResult;

    fn add(self, rhs: Self) -> Self::Output {
        DedupeResult {
            processed_count: self.processed_count + rhs.processed_count,
            reclaimed_space: self.reclaimed_space + rhs.reclaimed_space,
        }
    }
}

impl AddAssign for DedupeResult {
    fn add_assign(&mut self, rhs: Self) {
        self.processed_count += rhs.processed_count;
        self.reclaimed_space += rhs.reclaimed_space;
    }
}

/// Returns true if any of the files have been modified after the given timestamp.
/// Also returns true if a file timestamp could not be read.
fn was_modified(files: &[PathAndMetadata], after: DateTime<FixedOffset>, log: &dyn Log) -> bool {
    let mut result = false;
    let after: DateTime<Local> = after.into();
    for PathAndMetadata {
        path: p,
        metadata: m,
        ..
    } in files.iter()
    {
        match m.modified() {
            Ok(file_timestamp) => {
                let file_timestamp: DateTime<Local> = file_timestamp.into();
                if file_timestamp > after {
                    log.warn(format!(
                        "File {} was updated after {} (at {})",
                        p.display(),
                        after.format(TIMESTAMP_FMT),
                        file_timestamp.format(TIMESTAMP_FMT)
                    ));
                    result = true;
                }
            }
            Err(e) => {
                log.warn(format!(
                    "Failed to read modification time of file {}: {}",
                    p.display(),
                    e
                ));
                result = true;
            }
        }
    }
    result
}

/// Returns true if the given path matches any of the `keep` patterns
fn should_keep(path: &Path, config: &DedupeConfig) -> bool {
    let matches_any_name = config
        .keep_name_patterns
        .iter()
        .any(|p| match path.file_name_cstr() {
            Some(name) => p.matches(name.to_string_lossy().as_ref()),
            None => false,
        });
    let matches_any_path = || {
        config
            .keep_path_patterns
            .iter()
            .any(|p| p.matches_path(&path.to_path_buf()))
    };

    matches_any_name || matches_any_path()
}

/// Returns true if the given path matches any of the `drop` patterns.
/// If there are no `drop` patterns, returns true.
fn may_drop(path: &Path, config: &DedupeConfig) -> bool {
    let matches_any_name = || {
        config
            .name_patterns
            .iter()
            .any(|p| match path.file_name_cstr() {
                Some(name) => p.matches(name.to_string_lossy().as_ref()),
                None => false,
            })
    };
    let matches_any_path = || {
        config
            .path_patterns
            .iter()
            .any(|p| p.matches_path(&path.to_path_buf()))
    };

    (config.name_patterns.is_empty() && config.path_patterns.is_empty())
        || matches_any_name()
        || matches_any_path()
}

impl<P: AsRef<PathAndMetadata>> FileSubGroup<P> {
    /// Returns the time of the earliest creation of a file in the subgroup
    pub fn created(&self) -> Result<SystemTime, Error> {
        Ok(min_result(self.files.iter().map(|f| {
            let f = f.as_ref();
            f.metadata.created().map_err(|e| {
                format!(
                    "Failed to read creation time of file {}: {}",
                    f.path.display(),
                    e
                )
            })
        }))?
        .unwrap())
    }

    /// Returns the time of the latest modification of a file in the subgroup
    pub fn modified(&self) -> Result<SystemTime, Error> {
        Ok(max_result(self.files.iter().map(|f| {
            let f = f.as_ref();
            f.metadata.modified().map_err(|e| {
                format!(
                    "Failed to read modification time of file {}: {}",
                    f.path.display(),
                    e
                )
            })
        }))?
        .unwrap())
    }

    /// Returns the time of the latest access of a file in the subgroup
    pub fn accessed(&self) -> Result<SystemTime, Error> {
        Ok(max_result(self.files.iter().map(|f| {
            let f = f.as_ref();
            f.metadata.accessed().map_err(|e| {
                format!(
                    "Failed to read access time of file {}: {}",
                    f.path.display(),
                    e
                )
            })
        }))?
        .unwrap())
    }

    /// Returns the time of the latest status change of a file in the subgroup
    #[cfg(unix)]
    pub fn status_changed(&self) -> Result<(i64, i64), Error> {
        use std::os::unix::fs::MetadataExt;
        Ok(self
            .files
            .iter()
            .map(|f| {
                let f = f.as_ref();
                (f.metadata.ctime(), f.metadata.ctime_nsec())
            })
            .max()
            .unwrap())
    }

    /// Returns true if any of the files in the subgroup must be kept
    pub fn should_keep(&self, config: &DedupeConfig) -> bool {
        self.files
            .iter()
            .any(|f| should_keep(&f.as_ref().path, config))
    }

    /// Returns true if all files in the subgroup can be dropped
    pub fn may_drop(&self, config: &DedupeConfig) -> bool {
        self.files
            .iter()
            .all(|f| may_drop(&f.as_ref().path, config))
    }

    /// Returns the number of components of the least nested path
    pub fn min_nesting(&self) -> usize {
        self.files
            .iter()
            .map(|f| f.as_ref().path.component_count())
            .min()
            .unwrap()
    }

    /// Returns the number of components of the most nested path
    pub fn max_nesting(&self) -> usize {
        self.files
            .iter()
            .map(|f| f.as_ref().path.component_count())
            .max()
            .unwrap()
    }
}

/// Sorts files so that the files with the highest priority (newest, most recently updated,
/// recently accessed, etc.) are placed last.
/// When the metadata of a file cannot be accessed, an error message is pushed
/// to the result vector and the file is placed at the beginning of the list.
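///
/// A usage sketch (added for illustration; `subgroups` is assumed to be a
/// `Vec<FileSubGroup<PathAndMetadata>>` built earlier, e.g. with `FileSubGroup::group`):
///
/// ```ignore
/// let errors = sort_by_priority(&mut subgroups, &Priority::Newest);
/// assert!(errors.is_empty());
/// // The newest subgroups are now at the end of the slice,
/// // so they become the candidates for dropping.
/// ```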
pub fn sort_by_priority<P>(files: &mut [FileSubGroup<P>], priority: &Priority) -> Vec<Error>
where
    P: AsRef<PathAndMetadata>,
{
    match priority {
        Priority::Top => {
            files.reverse();
            vec![]
        }
        Priority::Bottom => vec![],
        Priority::Newest => try_sort_by_key(files, |m| m.created()),
        Priority::Oldest => try_sort_by_key(files, |m| m.created().map(Reverse)),
        Priority::MostRecentlyModified => try_sort_by_key(files, |m| m.modified()),
        Priority::LeastRecentlyModified => try_sort_by_key(files, |m| m.modified().map(Reverse)),
        Priority::MostRecentlyAccessed => try_sort_by_key(files, |m| m.accessed()),
        Priority::LeastRecentlyAccessed => try_sort_by_key(files, |m| m.accessed().map(Reverse)),
        #[cfg(unix)]
        Priority::MostRecentStatusChange => try_sort_by_key(files, |m| m.status_changed()),
        #[cfg(unix)]
        Priority::LeastRecentStatusChange => {
            try_sort_by_key(files, |m| m.status_changed().map(Reverse))
        }
        Priority::MostNested => {
            files.sort_by_key(|m| m.max_nesting());
            vec![]
        }
        Priority::LeastNested => {
            files.sort_by_key(|m| Reverse(m.min_nesting()));
            vec![]
        }
    }
}

#[derive(Debug)]
pub struct PartitionedFileGroup {
    pub to_keep: Vec<PathAndMetadata>,
    pub to_drop: Vec<PathAndMetadata>,
}

impl PartitionedFileGroup {
    /// Returns the destination path where the file should be moved when the
    /// selected dedupe mode is [`DedupeOp::Move`].
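    ///
    /// For example (an illustration added here, not part of the original docs), on Windows a
    /// source file `C:\data\a.txt` moved into the target directory `D:\dups` ends up at
    /// `D:\dups\C\data\a.txt`: the root is flattened into a plain directory name and the
    /// remaining path components are appended below it.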
    fn move_target(target_dir: &Arc<Path>, source_path: &Path) -> Path {
        let root = source_path
            .root()
            .map(|p| p.to_string_lossy().replace(['/', '\\', ':'], ""));
        let suffix = source_path.strip_root();
        match root {
            None => target_dir.join(suffix),
            Some(root) => Arc::new(target_dir.join(Path::from(root))).join(suffix),
        }
    }

    fn are_on_same_mount(devices: &DiskDevices, file1: &Path, file2: &Path) -> bool {
        let mount1 = devices.get_mount_point(file1);
        let mount2 = devices.get_mount_point(file2);
        mount1 == mount2
    }

    /// Returns a list of commands that would remove redundant files in this group when executed.
    pub fn dedupe_script(mut self, strategy: &DedupeOp, devices: &DiskDevices) -> Vec<FsCommand> {
        if self.to_drop.is_empty() {
            return vec![];
        }
        assert!(
            !self.to_keep.is_empty(),
            "No files would be left after deduplicating"
        );
        let mut commands = Vec::new();
        let retained_file = Arc::new(self.to_keep.swap_remove(0));
        for dropped_file in self.to_drop {
            match strategy {
                DedupeOp::SymbolicLink => commands.push(FsCommand::SoftLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                }),
                DedupeOp::HardLink => commands.push(FsCommand::HardLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                }),
                DedupeOp::RefLink => commands.push(FsCommand::RefLink {
                    target: retained_file.clone(),
                    link: dropped_file,
                }),
                DedupeOp::Remove => commands.push(FsCommand::Remove { file: dropped_file }),
                DedupeOp::Move(target_dir) => {
                    let source = dropped_file;
                    let source_path = &source.path;
                    let use_rename = Self::are_on_same_mount(devices, source_path, target_dir);
                    let target = Self::move_target(target_dir, source_path);
                    commands.push(FsCommand::Move {
                        source,
                        target,
                        use_rename,
                    })
                }
            }
        }
        commands
    }
}

/// Attempts to retrieve the metadata of all the files in the file group.
/// If metadata is inaccessible for a file, a warning is emitted to the log and `None` is returned.
fn fetch_files_metadata<P>(group: FileGroup<P>, log: &dyn Log) -> Option<FileGroup<PathAndMetadata>>
where
    P: Into<Path>,
{
    group
        .try_map_all(|p| {
            PathAndMetadata::new(p.into()).map_err(|e| {
                log.warn(&e);
            })
        })
        .ok()
}

/// Partitions a group of files into files to keep and files that can be safely dropped
/// (or linked).
fn partition(
    group: FileGroup<PathAndMetadata>,
    config: &DedupeConfig,
    log: &dyn Log,
) -> Result<PartitionedFileGroup, Error> {
    let file_len = group.file_len;
    let file_hash = group.file_hash.clone();
    let mut files = group.files;

    let error = |msg: &str| {
        Err(Error::from(format!(
            "Could not determine files to drop in group with hash {} and len {}: {}",
            file_hash, file_len.0, msg
        )))
    };

    // We don't want to remove dirs or symlinks
    files.retain(|m| {
        let is_file = m.metadata.is_file();
        if !is_file {
            log.warn(format!(
                "Skipping file {}: Not a regular file",
                m.path.display()
            ));
        }
        is_file
    });

    // If a file has a different length, then we know it has been modified.
    // Therefore, it does not belong to the group and we can safely skip it.
    if !config.no_check_size {
        files.retain(|m| {
            let len_ok = m.metadata.len() == file_len;
            if !len_ok {
                log.warn(format!(
                    "Skipping file {} with length {} different than the group length {}",
                    m.path.display(),
                    m.metadata.len(),
                    file_len.0,
                ));
            }
            len_ok
        });
    }

    // Bail out as well if any file has been modified after `config.modified_before`.
    // We need to skip the whole group, because we don't know if these files are really different.
    if let Some(max_timestamp) = config.modified_before {
        if was_modified(&files, max_timestamp, log) {
            return error("Some files could have been updated since the previous run of fclones");
        }
    }

    let mut file_sub_groups =
        FileSubGroup::group(files, &config.isolated_roots, !config.match_links);

    // Sort the files to remove in the user-selected order.
    // The priorities at the beginning of the argument list take precedence over
    // the priorities given at the end of the argument list, therefore we apply
    // them in reverse order.
    let mut sort_errors = Vec::new();
    for priority in config.priority.iter().rev() {
        sort_errors.extend(sort_by_priority(&mut file_sub_groups, priority));
    }

    if !sort_errors.is_empty() {
        for e in sort_errors {
            log.warn(e);
        }
        return error("Metadata of some files could not be read.");
    }

    // Split the set of file subgroups into two sets - a set that we want to keep intact and a set
    // that we can remove or replace with links:
    let (mut to_retain, mut to_drop): (Vec<_>, Vec<_>) = file_sub_groups
        .into_iter()
        .partition(|m| m.should_keep(config) || !m.may_drop(config));

    // If the set to retain is smaller than the number of files we must keep (rf), then
    // move some higher priority files from `to_drop` and append them to `to_retain`.
    let n = max(1, config.rf_over.unwrap_or(1));
    let missing_count = min(to_drop.len(), n.saturating_sub(to_retain.len()));
    to_retain.extend(to_drop.drain(0..missing_count));

    assert!(to_retain.len() >= n || to_drop.is_empty());
    Ok(PartitionedFileGroup {
        to_keep: to_retain.into_iter().flat_map(|g| g.files).collect(),
        to_drop: to_drop.into_iter().flat_map(|g| g.files).collect(),
    })
}

/// Generates a list of commands that will remove the redundant files in the groups provided
/// by the `groups` iterator.
///
/// Calling this is perfectly safe - the function does not perform any disk changes.
///
/// This function performs extensive checks to determine whether files can be removed.
/// It rejects a group of files if:
/// - metadata of any file in the group cannot be read,
/// - any file in the group was modified after the `modified_before` configuration property
///
/// Additionally, it will never emit commands to remove a file which:
/// - has a length that does not match the file length recorded in the group metadata
/// - was matched by any of the `retain_path` or `retain_name` patterns
/// - was not matched by any of the `drop_path` and `drop_name` patterns
///
/// The commands in the list are grouped into vectors, where each
/// vector has its sequential id. This id allows converting the parallel iterator into a
/// sequential iterator with the same order as the groups in the input file.
/// Unfortunately Rayon does not make it easy to convert a parallel iterator
/// to a sequential iterator, so we need this hack of prepending the id of each group.
///
/// # Parameters
/// - `groups`: iterator over groups of identical files
/// - `op`: what to do with duplicates
/// - `config`: controls which files from each group to remove / link
/// - `log`: logging target
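///
/// # Example
///
/// A high-level sketch (added for illustration; it assumes the module paths shown below and
/// a default configuration, so treat the exact imports as placeholders):
///
/// ```ignore
/// use fclones::config::{DedupeConfig, GroupConfig};
/// use fclones::log::StdLog;
///
/// let log = StdLog::new();
/// let groups = fclones::group_files(&GroupConfig::default(), &log).unwrap();
/// let config = DedupeConfig::default();
/// let script = fclones::dedupe::dedupe(groups, fclones::dedupe::DedupeOp::Remove, &config, &log);
/// let result = fclones::dedupe::run_script(script, !config.no_lock, &log);
/// println!("Removed {} files", result.processed_count);
/// ```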
pub fn dedupe<'a, I, P>(
    groups: I,
    op: DedupeOp,
    config: &'a DedupeConfig,
    log: &'a dyn Log,
) -> impl ParallelIterator<Item = (usize, Vec<FsCommand>)> + 'a
where
    I: IntoIterator<Item = FileGroup<P>> + 'a,
    I::IntoIter: Send,
    P: Into<Path> + AsRef<Path> + fmt::Debug + Send + 'a,
{
    let devices = DiskDevices::new(&HashMap::new());
    let disallow_cross_device = op == DedupeOp::HardLink || op == DedupeOp::RefLink;
    groups
        .into_iter()
        .enumerate()
        .par_bridge()
        .map(move |(i, group)| {
            let mut commands = Vec::new();
            if let Some(group) = fetch_files_metadata(group, log) {
                let groups = if disallow_cross_device {
                    group.partition_by_key(|p| p.metadata.device_id())
                } else {
                    vec![group]
                };
                for group in groups {
                    match partition(group, config, log) {
                        Ok(group) => commands.extend(group.dedupe_script(&op, &devices)),
                        Err(e) => log.warn(e),
                    }
                }
            }
            (i, commands)
        })
}

/// Runs a deduplication script generated by [`dedupe`].
///
/// Calling this function will change the contents of the file system.
/// No safety checks are performed.
/// Commands are executed in parallel, on the default Rayon thread-pool.
/// On command execution failure, a warning is logged and the execution of the remaining commands
/// continues.
/// Returns the number of files processed and the amount of disk space reclaimed.
pub fn run_script<I>(script: I, should_lock: bool, log: &dyn Log) -> DedupeResult
where
    I: IntoParallelIterator<Item = (usize, Vec<FsCommand>)>,
{
    script
        .into_par_iter()
        .flat_map(|(_, cmd_vec)| cmd_vec)
        .map(|cmd| cmd.execute(should_lock, log))
        .inspect(|res| {
            if let Err(e) = res {
                log.warn(e);
            }
        })
        .filter_map(|res| res.ok())
        .map(|len| DedupeResult {
            processed_count: 1,
            reclaimed_space: len,
        })
        .reduce(DedupeResult::default, |a, b| a + b)
}

/// We need this so we can put command vectors in a priority queue.
struct FsCommandGroup {
    index: usize,
    commands: Vec<FsCommand>,
}

impl FsCommandGroup {
    pub fn new(index: usize, commands: Vec<FsCommand>) -> FsCommandGroup {
        FsCommandGroup { index, commands }
    }
}

impl PartialEq<Self> for FsCommandGroup {
    fn eq(&self, other: &Self) -> bool {
        self.index == other.index
    }
}

impl Eq for FsCommandGroup {}

impl Hash for FsCommandGroup {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.index.hash(state)
    }
}

/// Prints a script generated by [`dedupe`] to the given output (typically stdout).
///
/// Does not perform any filesystem changes.
/// Returns the number of files processed and the amount of disk space that would be
/// reclaimed if all commands of the script were executed with no error.
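///
/// A usage sketch (added for illustration; module paths and configuration are assumed):
///
/// ```ignore
/// let script = dedupe(groups, DedupeOp::SymbolicLink, &config, &log);
/// let stats = log_script(script, std::io::stdout())?;
/// eprintln!("Would reclaim {} from {} files", stats.reclaimed_space, stats.processed_count);
/// ```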
1011pub fn log_script(
1012    script: impl IntoParallelIterator<Item = (usize, Vec<FsCommand>)> + Send,
1013    mut out: impl Write + Send,
1014) -> io::Result<DedupeResult> {
1015    // Unfortunately the items may come in any order from the ParallelIterator,
1016    // and that order may change with each run, because multiple threads race to produce
1017    // the next item. However, we want to print the commands
1018    // in the same deterministic order as the groups in the input file.
1019    // That's why we first send the commands to a PriorityQueue through a channel in order to
1020    // get them all on a single thread. The queue sorts them by their sequential identifiers.
1021    // Because the identifiers are consecutive integers, we know which item should be printed next.
1022
1023    let (tx, rx) = channel();
1024
1025    // Scope needed so the script iterator doesn't need to be 'static.
1026    // This way we tell the compiler the background thread we start to process the iterator
1027    // terminates before exiting this function.
1028    crossbeam_utils::thread::scope(move |s| {
1029        // Process items in parallel in a background thread, so we can read as soon as they
1030        // are produced:
1031        s.spawn(move |_| {
1032            script
1033                .into_par_iter()
1034                .for_each_with(tx, |tx, item| tx.send(item).unwrap())
1035        });
1036
1037        let mut queue = PriorityQueue::new();
1038        let mut next_group_index = 0;
1039        let mut processed_count = 0;
1040        let mut reclaimed_space = FileLen(0);
1041
1042        while let Ok((group_index, commands)) = rx.recv() {
1043            // Push the command group we received from the iterator.
1044            // We may receive them in an incorrect order, so we push them to a PriorityQueue.
1045            queue.push(
1046                FsCommandGroup::new(group_index, commands),
1047                Reverse(group_index), // we want to get items with lowest-index first
1048            );
1049
1050            // Process items in the queue as soon as possible to save memory.
1051            while let Some((group, _)) = queue.peek() {
1052                // Only process the item if it is the next one we expect.
1053                // If we see an out-of-order item, we simply need to wait for more items
1054                // to be pushed.
1055                if group.index != next_group_index {
1056                    break;
1057                }
1058                // We got the right item, let's pop it from the queue and log it:
1059                next_group_index += 1;
1060                let cmd_vec = queue.pop().unwrap().0.commands;
1061                for cmd in cmd_vec {
1062                    processed_count += 1;
1063                    reclaimed_space += cmd.space_to_reclaim();
1064                    for line in cmd.to_shell_str() {
1065                        writeln!(out, "{line}")?;
1066                    }
1067                }
1068            }
1069        }
1070
1071        Ok(DedupeResult {
1072            processed_count,
1073            reclaimed_space,
1074        })
1075    })
1076    .unwrap()
1077}
1078
1079#[cfg(test)]
1080mod test {
1081    use std::collections::HashSet;
1082    use std::default::Default;
1083    use std::fs::{create_dir, create_dir_all};
1084    use std::path::PathBuf;
1085    use std::str::FromStr;
1086    use std::{thread, time};
1087
1088    use chrono::Duration;
1089    use itertools::Itertools;
1090
1091    use crate::config::GroupConfig;
1092    use crate::file::FileHash;
1093    use crate::group_files;
1094    use crate::log::StdLog;
1095    use crate::pattern::Pattern;
1096    use crate::util::test::{create_file, create_file_newer_than, read_file, with_dir, write_file};
1097
1098    use super::*;
1099
1100    #[test]
1101    fn test_temp_file_name_generation() {
1102        let path = Path::from("/foo/bar");
1103        let temp = FsCommand::temp_file(&path);
1104        assert_ne!(path, temp);
1105        assert_ne!(
1106            path.file_name().unwrap().len(),
1107            temp.file_name().unwrap().len()
1108        );
1109        assert_eq!(path.parent(), temp.parent());
1110    }
1111
1112    #[test]
1113    fn test_remove_command_removes_file() {
1114        with_dir("dedupe/remove_cmd", |root| {
1115            let log = StdLog::new();
1116            let file_path = root.join("file");
1117            create_file(&file_path);
1118            let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
1119            let cmd = FsCommand::Remove { file };
1120            cmd.execute(true, &log).unwrap();
1121            assert!(!file_path.exists())
1122        })
1123    }
1124
1125    #[test]
1126    fn test_move_command_moves_file_by_rename() {
1127        with_dir("dedupe/move_rename_cmd", |root| {
1128            let log = StdLog::new();
1129            let file_path = root.join("file");
1130            let target = Path::from(root.join("target"));
1131            create_file(&file_path);
1132            let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
1133            let cmd = FsCommand::Move {
1134                source: file,
1135                target: target.clone(),
1136                use_rename: true,
1137            };
1138            cmd.execute(true, &log).unwrap();
1139            assert!(!file_path.exists());
1140            assert!(target.to_path_buf().exists());
1141        })
1142    }
1143
1144    #[test]
1145    fn test_move_command_moves_file_by_copy() {
1146        with_dir("dedupe/move_copy_cmd", |root| {
1147            let log = StdLog::new();
1148            let file_path = root.join("file");
1149            let target = Path::from(root.join("target"));
1150            create_file(&file_path);
1151            let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
1152            let cmd = FsCommand::Move {
1153                source: file,
1154                target: target.clone(),
1155                use_rename: false,
1156            };
1157            cmd.execute(true, &log).unwrap();
1158            assert!(!file_path.exists());
1159            assert!(target.to_path_buf().exists());
1160        })
1161    }
1162
1163    #[test]
1164    fn test_move_fails_if_target_exists() {
1165        with_dir("dedupe/move_target_exists", |root| {
1166            let log = StdLog::new();
1167            let file_path = root.join("file");
1168            let target = root.join("target");
1169            create_file(&file_path);
1170            create_file(&target);
1171            let file = PathAndMetadata::new(Path::from(&file_path)).unwrap();
1172            let cmd = FsCommand::Move {
1173                source: file,
1174                target: Path::from(&target),
1175                use_rename: false,
1176            };
1177            assert!(cmd.execute(true, &log).is_err());
1178        })
1179    }
1180
1181    #[test]
1182    fn test_soft_link_command_replaces_file_with_a_link() {
1183        with_dir("dedupe/soft_link_cmd", |root| {
1184            let log = StdLog::new();
1185            let file_path_1 = root.join("file_1");
1186            let file_path_2 = root.join("file_2");
1187            write_file(&file_path_1, "foo");
1188            write_file(&file_path_2, "");
1189
1190            let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap();
1191            let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap();
1192            let cmd = FsCommand::SoftLink {
1193                target: Arc::new(file_1),
1194                link: file_2,
1195            };
1196            cmd.execute(true, &log).unwrap();
1197
1198            assert!(file_path_1.exists());
1199            assert!(file_path_2.exists());
1200            assert!(fs::symlink_metadata(&file_path_2)
1201                .unwrap()
1202                .file_type()
1203                .is_symlink());
1204            assert_eq!(read_file(&file_path_2), "foo");
1205        })
1206    }
1207
1208    #[test]
1209    fn test_hard_link_command_replaces_file_with_a_link() {
1210        with_dir("dedupe/hard_link_cmd", |root| {
1211            let log = StdLog::new();
1212            let file_path_1 = root.join("file_1");
1213            let file_path_2 = root.join("file_2");
1214
1215            write_file(&file_path_1, "foo");
1216            write_file(&file_path_2, "");
1217
1218            let file_1 = PathAndMetadata::new(Path::from(&file_path_1)).unwrap();
1219            let file_2 = PathAndMetadata::new(Path::from(&file_path_2)).unwrap();
1220            let cmd = FsCommand::HardLink {
1221                target: Arc::new(file_1),
1222                link: file_2,
1223            };
1224            cmd.execute(true, &log).unwrap();
1225
1226            assert!(file_path_1.exists());
1227            assert!(file_path_2.exists());
1228            assert_eq!(read_file(&file_path_2), "foo");
1229        })
1230    }
1231
1232    /// Creates 3 empty files with different creation time and returns a FileGroup describing them
1233    fn make_group(root: &PathBuf, file_hash: FileHash) -> FileGroup<Path> {
1234        create_dir_all(root).unwrap();
1235        let file_1 = root.join("file_1");
1236        let file_2 = root.join("file_2");
1237        let file_3 = root.join("file_3");
1238        create_file(&file_1);
1239        let ctime_1 = fs::metadata(&file_1).unwrap().modified().unwrap();
1240        let ctime_2 = create_file_newer_than(&file_2, ctime_1);
1241        create_file_newer_than(&file_3, ctime_2);
1242
1243        FileGroup {
1244            file_len: FileLen(0),
1245            file_hash,
1246            files: vec![
1247                Path::from(&file_1),
1248                Path::from(&file_2),
1249                Path::from(&file_3),
1250            ],
1251        }
1252    }
1253
1254    #[test]
1255    fn test_partition_selects_files_for_removal() {
1256        with_dir("dedupe/partition/basic", |root| {
1257            let group = make_group(root, FileHash::from_str("00").unwrap());
1258            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1259            let config = DedupeConfig::default();
1260            let partitioned = partition(group, &config, &StdLog::new()).unwrap();
1261            assert_eq!(partitioned.to_keep.len(), 1);
1262            assert_eq!(partitioned.to_drop.len(), 2);
1263        })
1264    }
1265
1266    #[test]
1267    fn test_partition_bails_out_if_file_modified_too_late() {
1268        with_dir("dedupe/partition/modification", |root| {
1269            let group = make_group(root, FileHash::from_str("00").unwrap());
1270            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1271            let config = DedupeConfig {
1272                modified_before: Some(DateTime::from(Local::now() - Duration::days(1))),
1273                ..DedupeConfig::default()
1274            };
1275            let partitioned = partition(group, &config, &StdLog::new());
1276            assert!(partitioned.is_err());
1277        })
1278    }
1279
1280    #[test]
1281    fn test_partition_skips_file_with_different_len() {
1282        with_dir("dedupe/partition/file_len", |root| {
1283            let group = make_group(root, FileHash::from_str("00").unwrap());
1284            let path = group.files[0].clone();
1285            write_file(&path.to_path_buf(), "foo");
1286            let config = DedupeConfig {
1287                priority: vec![Priority::MostRecentlyModified],
1288                ..DedupeConfig::default()
1289            };
1290            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1291            let partitioned = partition(group, &config, &StdLog::new()).unwrap();
1292            assert!(!partitioned.to_drop.iter().any(|m| m.path == path));
1293            assert!(!partitioned.to_keep.iter().any(|m| m.path == path));
1294        })
1295    }
1296
1297    fn path_set(v: &[PathAndMetadata]) -> HashSet<&Path> {
1298        v.iter().map(|f| &f.path).collect()
1299    }
1300
1301    #[test]
1302    fn test_partition_respects_creation_time_priority() {
1303        with_dir("dedupe/partition/ctime_priority", |root| {
1304            if fs::metadata(root).unwrap().created().is_err() {
1305                // can't run the test because the filesystem doesn't support fetching
1306                // file creation time
1307                return;
1308            }
1309            let group = make_group(root, FileHash::from_str("00").unwrap());
1310            let mut config = DedupeConfig {
1311                priority: vec![Priority::Newest],
1312                ..DedupeConfig::default()
1313            };
1314            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1315            let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap();
1316            config.priority = vec![Priority::Oldest];
1317            let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap();
1318
1319            assert_ne!(
1320                path_set(&partitioned_1.to_keep),
1321                path_set(&partitioned_2.to_keep)
1322            );
1323            assert_ne!(
1324                path_set(&partitioned_1.to_drop),
1325                path_set(&partitioned_2.to_drop)
1326            );
1327        });
1328    }
1329
1330    #[test]
1331    fn test_partition_respects_modification_time_priority() {
1332        with_dir("dedupe/partition/mtime_priority", |root| {
1333            let group = make_group(root, FileHash::from_str("00").unwrap());
1334
1335            thread::sleep(time::Duration::from_millis(10));
1336            let path = group.files[0].clone();
1337            write_file(&path.to_path_buf(), "foo");
1338
1339            // note that fetching metadata happens after we wrote the file
1340            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1341            let config = DedupeConfig {
1342                priority: vec![Priority::MostRecentlyModified],
1343                ..DedupeConfig::default()
1344            };
1345            let partitioned_1 = partition(group.clone(), &config, &StdLog::new()).unwrap();
1346
1347            let config = DedupeConfig {
1348                priority: vec![Priority::LeastRecentlyModified],
1349                ..DedupeConfig::default()
1350            };
1351            let partitioned_2 = partition(group, &config, &StdLog::new()).unwrap();
1352
1353            assert_ne!(
1354                path_set(&partitioned_1.to_keep),
1355                path_set(&partitioned_2.to_keep)
1356            );
1357            assert_ne!(
1358                path_set(&partitioned_1.to_drop),
1359                path_set(&partitioned_2.to_drop)
1360            );
1361        });
1362    }
1363
1364    #[test]
1365    fn test_partition_respects_keep_patterns() {
1366        with_dir("dedupe/partition/keep", |root| {
1367            let group = make_group(root, FileHash::from_str("00").unwrap());
1368            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1369            let mut config = DedupeConfig {
1370                priority: vec![Priority::LeastRecentlyModified],
1371                keep_name_patterns: vec![Pattern::glob("*_1").unwrap()],
1372                ..DedupeConfig::default()
1373            };
1374
1375            let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
1376            assert_eq!(p.to_keep.len(), 1);
1377            assert_eq!(&p.to_keep[0].path, &group.files[0].path);
1378
1379            config.keep_name_patterns = vec![];
1380            config.keep_path_patterns = vec![Pattern::glob("**/file_1").unwrap()];
1381            let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
1382            assert_eq!(p.to_keep.len(), 1);
1383            assert_eq!(&p.to_keep[0].path, &group.files[0].path);
1384        })
1385    }
1386
1387    #[test]
1388    fn test_partition_respects_drop_patterns() {
1389        with_dir("dedupe/partition/drop", |root| {
1390            let group = make_group(root, FileHash::from_str("00").unwrap());
1391            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1392            let mut config = DedupeConfig {
1393                priority: vec![Priority::LeastRecentlyModified],
1394                name_patterns: vec![Pattern::glob("*_3").unwrap()],
1395                ..DedupeConfig::default()
1396            };
1397            let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
1398            assert_eq!(p.to_drop.len(), 1);
1399            assert_eq!(&p.to_drop[0].path, &group.files[2].path);
1400
1401            config.name_patterns = vec![];
1402            config.path_patterns = vec![Pattern::glob("**/file_3").unwrap()];
1403            let p = partition(group.clone(), &config, &StdLog::new()).unwrap();
1404            assert_eq!(p.to_drop.len(), 1);
1405            assert_eq!(&p.to_drop[0].path, &group.files[2].path);
1406        })
1407    }
1408
1409    #[test]
1410    fn test_partition_respects_isolated_roots() {
1411        with_dir("dedupe/partition/isolated_roots", |root| {
1412            let root1 = root.join("root1");
1413            let root2 = root.join("root2");
1414            create_dir(&root1).unwrap();
1415            create_dir(&root2).unwrap();
1416
1417            let group1 = make_group(&root1, FileHash::from_str("00").unwrap());
1418            let group2 = make_group(&root2, FileHash::from_str("00").unwrap());
1419            let group = FileGroup {
1420                file_len: group1.file_len,
1421                file_hash: group1.file_hash,
1422                files: group1.files.into_iter().chain(group2.files).collect(),
1423            };
1424
1425            let config = DedupeConfig {
1426                isolated_roots: vec![Path::from(&root1), Path::from(&root2)],
1427                ..DedupeConfig::default()
1428            };
1429
1430            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1431            let p = partition(group, &config, &StdLog::new()).unwrap();
1432            assert_eq!(p.to_drop.len(), 3);
1433            assert!(p
1434                .to_drop
1435                .iter()
1436                .all(|f| f.path.to_path_buf().starts_with(&root2)));
1437            assert_eq!(p.to_keep.len(), 3);
1438            assert!(p
1439                .to_keep
1440                .iter()
1441                .all(|f| f.path.to_path_buf().starts_with(&root1)));
1442        })
1443    }
1444
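    // Hard links to the same file form one subgroup and are kept or dropped together.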
1445    #[test]
1446    fn test_partition_respects_links() {
1447        with_dir("dedupe/partition/links", |root| {
1448            let root_a = root.join("root_a");
1449            let root_b = root.join("root_b");
1450            create_dir(&root_a).unwrap();
1451            create_dir(&root_b).unwrap();
1452
1453            let file_a1 = root_a.join("file_a1");
1454            let file_a2 = root_a.join("file_a2");
1455            write_file(&file_a1, "aaa");
1456            fs::hard_link(&file_a1, &file_a2).unwrap();
1457
1458            let file_b1 = root_b.join("file_b1");
1459            let file_b2 = root_b.join("file_b2");
1460            write_file(&file_b1, "aaa");
1461            fs::hard_link(&file_b1, &file_b2).unwrap();
1462
1463            let group = FileGroup {
1464                file_len: FileLen(3),
1465                file_hash: FileHash::from_str("00").unwrap(),
1466                files: vec![
1467                    Path::from(&file_b1),
1468                    Path::from(&file_a2),
1469                    Path::from(&file_a1),
1470                    Path::from(&file_b2),
1471                ],
1472            };
1473
1474            let config = DedupeConfig::default();
1475            let group = group.map(|p| PathAndMetadata::new(p).unwrap());
1476            let p = partition(group, &config, &StdLog::new()).unwrap();
1477
1478            // drop the A files: hard links form one subgroup, and the B subgroup is kept because file_b1 precedes file_a2 in the files vector
1479            assert_eq!(p.to_drop.len(), 2);
1480            assert!(p
1481                .to_drop
1482                .iter()
1483                .all(|f| f.path.to_path_buf().starts_with(&root_a)));
1484            assert_eq!(p.to_keep.len(), 2);
1485            assert!(p
1486                .to_keep
1487                .iter()
1488                .all(|f| f.path.to_path_buf().starts_with(&root_b)));
1489        })
1490    }
1491
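    // Running the generated script must actually delete the redundant copies and keep one file.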
1492    #[test]
1493    fn test_run_dedupe_script() {
1494        with_dir("dedupe/partition/run_dedupe_script", |root| {
1495            let mut log = StdLog::new();
1496            log.no_progress = true;
1497            log.log_stderr_to_stdout = true;
1498
1499            let group = make_group(root, FileHash::from_str("00").unwrap());
1500            let config = DedupeConfig {
1501                priority: vec![Priority::LeastRecentlyModified],
1502                ..DedupeConfig::default()
1503            };
1504            let script = dedupe(vec![group], DedupeOp::Remove, &config, &log);
1505            let dedupe_result = run_script(script, !config.no_lock, &log);
1506            assert_eq!(dedupe_result.processed_count, 2);
1507            assert!(!root.join("file_1").exists());
1508            assert!(!root.join("file_2").exists());
1509            assert!(root.join("file_3").exists());
1510        });
1511    }
1512
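    // Logging the script instead of running it writes the generated commands in group order:
    // two remove commands for each group of three identical files, one line per command.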
1513    #[test]
1514    fn test_log_dedupe_script() {
1515        with_dir("dedupe/partition/log_dedupe_script", |root| {
1516            let mut log = StdLog::new();
1517            log.no_progress = true;
1518            log.log_stderr_to_stdout = true;
1519
1520            let group_1 = make_group(&root.join("group_1"), FileHash::from_str("00").unwrap());
1521            let group_2 = make_group(&root.join("group_2"), FileHash::from_str("01").unwrap());
1522            let group_3 = make_group(&root.join("group_3"), FileHash::from_str("02").unwrap());
1523            let groups = vec![group_1, group_2, group_3];
1524
1525            let config = DedupeConfig {
1526                priority: vec![Priority::LeastRecentlyModified],
1527                ..DedupeConfig::default()
1528            };
1529
1530            let script = dedupe(groups, DedupeOp::Remove, &config, &log);
1531
1532            let mut out = Vec::new();
1533            let dedupe_result = log_script(script, &mut out).unwrap();
1534            assert_eq!(dedupe_result.processed_count, 6);
1535
1536            let out = String::from_utf8(out).unwrap();
1537            let out_lines = out.lines().collect_vec();
1538            assert_eq!(out_lines.len(), 6);
1539            assert!(out_lines[0].contains("group_1"));
1540            assert!(out_lines[1].contains("group_1"));
1541            assert!(out_lines[2].contains("group_2"));
1542            assert!(out_lines[3].contains("group_2"));
1543            assert!(out_lines[4].contains("group_3"));
1544            assert!(out_lines[5].contains("group_3"));
1545        });
1546    }
1547
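    // Two separate pairs of hard links with identical content are merged into a single
    // hard-link subgroup: after dedupe all four paths share file_a1's FileId (inode).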
1548    #[test]
1549    fn test_hard_link_merges_subgroups_of_hard_links() {
1550        with_dir("dedupe/merge_subgroups_of_hardlinks", |root| {
1551            let mut log = StdLog::new();
1552            log.no_progress = true;
1553            log.log_stderr_to_stdout = true;
1554
1555            let file_a1 = root.join("file_a1");
1556            let file_a2 = root.join("file_a2");
1557            let file_b1 = root.join("file_b1");
1558            let file_b2 = root.join("file_b2");
1559
1560            write_file(&file_a1, "foo");
1561            write_file(&file_b1, "foo");
1562
1563            let file_id = FileId::new(&Path::from(&file_a1)).unwrap();
1564
1565            fs::hard_link(&file_a1, &file_a2).unwrap();
1566            fs::hard_link(&file_b1, &file_b2).unwrap();
1567
1568            let group_config = GroupConfig {
1569                paths: vec![Path::from(root)],
1570                ..GroupConfig::default()
1571            };
1572
1573            let groups = group_files(&group_config, &log).unwrap();
1574            let dedupe_config = DedupeConfig::default();
1575            let script = dedupe(groups, DedupeOp::HardLink, &dedupe_config, &log);
1576            let dedupe_result = run_script(script, false, &log);
1577            assert_eq!(dedupe_result.processed_count, 2);
1578            assert!(file_a1.exists());
1579            assert!(file_a2.exists());
1580            assert!(file_b1.exists());
1581            assert!(file_b2.exists());
1582
1583            assert_eq!(read_file(&file_a1), "foo");
1584
1585            assert_eq!(FileId::new(&Path::from(&file_a2)).unwrap(), file_id);
1586            assert_eq!(FileId::new(&Path::from(&file_b1)).unwrap(), file_id);
1587            assert_eq!(FileId::new(&Path::from(&file_b2)).unwrap(), file_id);
1588        })
1589    }
1590
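    // With `symbolic_links` enabled, a file and the symlinks resolving to it form one subgroup:
    // the b subgroup no longer resolves after the run, while the a subgroup still reads "foo".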
1591    #[test]
1592    #[cfg(unix)]
1593    fn test_remove_removes_subgroups_of_soft_links() {
1594        use std::os::unix::fs;
1595
1596        with_dir("dedupe/remove_subgroups_with_symlinks", |root| {
1597            let mut log = StdLog::new();
1598            log.no_progress = true;
1599            log.log_stderr_to_stdout = true;
1600
1601            let file_a1 = root.join("file_a1");
1602            let file_a2 = root.join("file_a2");
1603            let file_b1 = root.join("file_b1");
1604            let file_b2 = root.join("file_b2");
1605
1606            write_file(&file_a1, "foo");
1607            write_file(&file_b1, "foo");
1608
1609            fs::symlink(&file_a1, &file_a2).unwrap();
1610            fs::symlink(&file_b1, &file_b2).unwrap();
1611
1612            let group_config = GroupConfig {
1613                paths: vec![Path::from(root)],
1614                symbolic_links: true,
1615                ..GroupConfig::default()
1616            };
1617
1618            let groups = group_files(&group_config, &log).unwrap();
1619            let dedupe_config = DedupeConfig::default();
1620            let script = dedupe(groups, DedupeOp::Remove, &dedupe_config, &log);
1621            let dedupe_result = run_script(script, false, &log);
1622            assert_eq!(dedupe_result.processed_count, 2);
1623
1624            assert!(file_a1.exists());
1625            assert!(file_a2.exists());
1626            assert!(!file_b1.exists());
1627            assert!(!file_b2.exists());
1628
1629            assert_eq!(read_file(&file_a1), "foo");
1630            assert_eq!(read_file(&file_a2), "foo");
1631        })
1632    }
1633}