use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use rayon::prelude::*;
use crate::cache::{CacheEntry, HashCache};
use crate::scanner::{FileEntry, Hash, Hasher};
/// Configuration consumed by the prehash functions ([`phase2_prehash`] and
/// [`compute_prehashes`]).
#[derive(Clone)]
pub struct PrehashConfig {
/// Thread count handed to the rayon thread pool builder for hashing work.
pub io_threads: usize,
/// Optional hash cache; queried before hashing and updated after a fresh hash.
pub cache: Option<Arc<HashCache>>,
/// Optional flag; once it reads `true`, files not yet processed are skipped.
pub shutdown_flag: Option<Arc<AtomicBool>>,
/// Optional observer notified of phase start/end and per-file progress.
pub progress_callback: Option<Arc<dyn ProgressCallback>>,
/// Reference paths carried with the configuration. The prehash functions
/// visible in this file do not read this field.
pub reference_paths: Vec<PathBuf>,
}
impl std::fmt::Debug for PrehashConfig {
    /// Formats the configuration, printing fixed placeholders instead of the
    /// cache and callback contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only presence/absence of these two fields is shown.
        let cache_marker = self.cache.as_ref().map(|_| "<cache>");
        let callback_marker = self.progress_callback.as_ref().map(|_| "<callback>");

        let mut out = f.debug_struct("PrehashConfig");
        out.field("io_threads", &self.io_threads);
        out.field("cache", &cache_marker);
        out.field("shutdown_flag", &self.shutdown_flag);
        out.field("progress_callback", &callback_marker);
        out.field("reference_paths", &self.reference_paths);
        out.finish()
    }
}
impl Default for PrehashConfig {
    /// Four I/O threads, no cache, no shutdown flag, no progress callback and
    /// no reference paths.
    fn default() -> Self {
        let io_threads = 4;
        let reference_paths = Vec::new();
        Self {
            io_threads,
            cache: None,
            shutdown_flag: None,
            progress_callback: None,
            reference_paths,
        }
    }
}
impl PrehashConfig {
    /// Sets the number of I/O threads; values below 1 are raised to 1.
    #[must_use]
    pub fn with_io_threads(self, threads: usize) -> Self {
        Self {
            io_threads: threads.max(1),
            ..self
        }
    }

    /// Attaches a hash cache.
    #[must_use]
    pub fn with_cache(self, cache: Arc<HashCache>) -> Self {
        Self {
            cache: Some(cache),
            ..self
        }
    }

    /// Attaches a shutdown flag that is polled while hashing.
    #[must_use]
    pub fn with_shutdown_flag(self, flag: Arc<AtomicBool>) -> Self {
        Self {
            shutdown_flag: Some(flag),
            ..self
        }
    }

    /// Attaches a progress callback.
    #[must_use]
    pub fn with_progress_callback(self, callback: Arc<dyn ProgressCallback>) -> Self {
        Self {
            progress_callback: Some(callback),
            ..self
        }
    }

    /// Replaces the list of reference paths.
    #[must_use]
    pub fn with_reference_paths(self, paths: Vec<PathBuf>) -> Self {
        Self {
            reference_paths: paths,
            ..self
        }
    }

    /// Returns `true` when a shutdown flag is present and currently set.
    fn is_shutdown_requested(&self) -> bool {
        match &self.shutdown_flag {
            Some(flag) => flag.load(Ordering::SeqCst),
            None => false,
        }
    }
}
/// Observer for scan progress.
///
/// In this file `on_progress` and `on_item_completed` are invoked from inside
/// rayon parallel iterators, so implementations may be called concurrently
/// (hence the `Send + Sync` bound) and calls may arrive out of index order.
pub trait ProgressCallback: Send + Sync {
/// Called once when a phase begins, with the phase name and the number of
/// items it will process (the walking phase passes `0`).
fn on_phase_start(&self, phase: &str, total: usize);
/// Called per file with the file's 1-based index in the work list and its
/// path rendered as a (lossy) string.
fn on_progress(&self, current: usize, path: &str);
/// Called with a file's size after its full hash has been freshly computed.
/// The default implementation does nothing.
fn on_item_completed(&self, _bytes: u64) {}
/// Called once when a phase finishes, with the phase name.
fn on_phase_end(&self, phase: &str);
}
/// Counters produced by [`phase2_prehash`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PrehashStats {
    /// Number of files handed to the phase.
    pub input_files: usize,
    /// Files for which a prehash was obtained (computed or from the cache).
    pub hashed_files: usize,
    /// Files without a prehash: hashing failed or the file was skipped after
    /// a shutdown request.
    pub failed_files: usize,
    /// Prehashes served from the cache.
    pub cache_hits: usize,
    /// Prehashes that had to be computed.
    pub cache_misses: usize,
    /// Files whose prehash was shared with no other file (eliminated).
    pub unique_prehashes: usize,
    /// Files that share a prehash with at least one other file.
    pub potential_duplicates: usize,
    /// Number of prehash groups containing more than one file.
    pub duplicate_groups: usize,
    /// `true` if the shutdown flag was set when the hashing pass finished.
    pub interrupted: bool,
}

impl PrehashStats {
    /// Percentage (0–100) of input files that are *not* potential duplicates.
    ///
    /// Returns `0.0` when there are no input files. The fields are public, so
    /// `potential_duplicates` may exceed `input_files` in hand-built values;
    /// the subtraction saturates at zero instead of overflowing.
    #[must_use]
    pub fn elimination_rate(&self) -> f64 {
        if self.input_files == 0 {
            return 0.0;
        }
        let eliminated = self.input_files.saturating_sub(self.potential_duplicates);
        (eliminated as f64 / self.input_files as f64) * 100.0
    }
}
/// A file paired with its prehash, as returned by [`compute_prehashes`].
#[derive(Debug, Clone)]
pub struct PrehashEntry {
/// The file the prehash belongs to.
pub file: FileEntry,
/// The prehash obtained for `file` (from the cache or freshly computed).
pub prehash: Hash,
}
/// Phase 2: prehashes every file of the given size groups and keeps only the
/// prehash groups that contain more than one file.
///
/// For each file the cache (if configured) is consulted first; on a miss the
/// prehash is computed with `hasher` and written back to the cache. Cache
/// errors are logged and otherwise ignored. Files that cannot be hashed, or
/// that are skipped because the shutdown flag is set, are counted in
/// `failed_files`.
///
/// Hashing runs on a dedicated rayon pool with `config.io_threads` threads.
/// If that pool cannot be created the work runs on rayon's global pool
/// instead of panicking.
///
/// Returns the surviving groups keyed by prehash together with the phase
/// statistics. `stats.interrupted` is set when the shutdown flag is raised at
/// the end of the hashing pass.
#[must_use]
pub fn phase2_prehash(
    size_groups: HashMap<u64, Vec<FileEntry>>,
    hasher: Arc<Hasher>,
    config: PrehashConfig,
) -> (HashMap<Hash, Vec<FileEntry>>, PrehashStats) {
    let input_files: usize = size_groups.values().map(Vec::len).sum();
    let mut stats = PrehashStats {
        input_files,
        ..Default::default()
    };

    let all_files: Vec<FileEntry> = size_groups.into_values().flatten().collect();
    if all_files.is_empty() {
        log::debug!("Phase 2: No files to process");
        return (HashMap::new(), stats);
    }

    if let Some(ref callback) = config.progress_callback {
        callback.on_phase_start("prehash", all_files.len());
    }
    log::info!("Phase 2: Computing prehashes for {} files", all_files.len());

    // The parallel pass, expressed once so it can run either inside the
    // dedicated pool or directly on the global pool.
    // Result tuple: (file, prehash if obtained, whether it came from the cache).
    let hash_all = || -> Vec<(FileEntry, Option<Hash>, bool)> {
        all_files
            .into_par_iter()
            .enumerate()
            .map(|(idx, file)| {
                if config.is_shutdown_requested() {
                    log::debug!("Phase 2: Shutdown requested, skipping remaining files");
                    return (file, None, false);
                }
                if let Some(ref callback) = config.progress_callback {
                    callback.on_progress(idx + 1, file.path.to_string_lossy().as_ref());
                }
                if let Some(ref cache) = config.cache {
                    match cache.get_prehash(&file.path, file.size, file.modified) {
                        Ok(Some(hash)) => {
                            log::trace!("Prehash cache hit: {}", file.path.display());
                            return (file, Some(hash), true);
                        }
                        Ok(None) => {
                            log::trace!("Prehash cache miss: {}", file.path.display());
                        }
                        Err(e) => {
                            log::warn!("Failed to query cache for {}: {}", file.path.display(), e);
                        }
                    }
                }
                match hasher.prehash(&file.path) {
                    Ok(hash) => {
                        log::trace!("Prehash computed: {}", file.path.display());
                        if let Some(ref cache) = config.cache {
                            let entry = CacheEntry::from(file.clone());
                            if let Err(e) = cache.insert_prehash(&entry, hash) {
                                log::warn!(
                                    "Failed to update cache for {}: {}",
                                    file.path.display(),
                                    e
                                );
                            }
                        }
                        (file, Some(hash), false)
                    }
                    Err(e) => {
                        log::warn!("Failed to prehash {}: {}", file.path.display(), e);
                        (file, None, false)
                    }
                }
            })
            .collect()
    };

    let prehash_results = match rayon::ThreadPoolBuilder::new()
        .num_threads(config.io_threads)
        .build()
    {
        Ok(pool) => pool.install(hash_all),
        Err(err) => {
            // Outside `install`, parallel iterators execute on the global pool.
            log::warn!(
                "Failed to create custom thread pool ({}), using global pool with {} threads",
                err,
                rayon::current_num_threads()
            );
            hash_all()
        }
    };

    if config.is_shutdown_requested() {
        stats.interrupted = true;
        log::info!("Phase 2: Interrupted by shutdown signal");
    }

    let mut prehash_groups: HashMap<Hash, Vec<FileEntry>> = HashMap::new();
    for (file, prehash_opt, is_hit) in prehash_results {
        match prehash_opt {
            Some(prehash) => {
                stats.hashed_files += 1;
                if is_hit {
                    stats.cache_hits += 1;
                } else {
                    stats.cache_misses += 1;
                }
                prehash_groups.entry(prehash).or_default().push(file);
            }
            None => {
                stats.failed_files += 1;
            }
        }
    }

    // Drop groups with a single member: a file with a unique prehash cannot
    // have a duplicate.
    prehash_groups.retain(|hash, files| {
        if files.len() == 1 {
            stats.unique_prehashes += 1;
            log::trace!(
                "Eliminated unique prehash {}: {}",
                crate::scanner::hash_to_hex(hash),
                files[0].path.display()
            );
            false
        } else {
            stats.potential_duplicates += files.len();
            stats.duplicate_groups += 1;
            log::debug!(
                "Prehash group {}: {} potential duplicates",
                crate::scanner::hash_to_hex(hash),
                files.len()
            );
            true
        }
    });

    if let Some(ref callback) = config.progress_callback {
        callback.on_phase_end("prehash");
    }
    log::info!(
        "Phase 2 complete: {} files → {} potential duplicates ({:.1}% eliminated)",
        stats.input_files,
        stats.potential_duplicates,
        stats.elimination_rate()
    );
    (prehash_groups, stats)
}
/// Computes a prehash for every file in `size_groups` and returns the files
/// paired with their prehashes, without grouping or filtering.
///
/// The cache (if configured) is consulted first and updated after a fresh
/// hash. Cache failures are logged as warnings and never abort the run.
/// Files that fail to hash, and files reached after the shutdown flag is
/// set, are omitted from the result.
///
/// Hashing runs on a dedicated rayon pool with `config.io_threads` threads.
/// If that pool cannot be created the work runs on rayon's global pool
/// instead of panicking.
#[must_use]
pub fn compute_prehashes(
    size_groups: HashMap<u64, Vec<FileEntry>>,
    hasher: Arc<Hasher>,
    config: PrehashConfig,
) -> Vec<PrehashEntry> {
    let all_files: Vec<FileEntry> = size_groups.into_values().flatten().collect();
    if all_files.is_empty() {
        return Vec::new();
    }

    // The parallel pass, expressed once so it can run in either pool.
    let hash_all = || -> Vec<PrehashEntry> {
        all_files
            .into_par_iter()
            .filter_map(|file| {
                if config.is_shutdown_requested() {
                    return None;
                }
                if let Some(ref cache) = config.cache {
                    match cache.get_prehash(&file.path, file.size, file.modified) {
                        Ok(Some(prehash)) => return Some(PrehashEntry { file, prehash }),
                        Ok(None) => {}
                        Err(e) => {
                            log::warn!("Failed to query cache for {}: {}", file.path.display(), e);
                        }
                    }
                }
                match hasher.prehash(&file.path) {
                    Ok(prehash) => {
                        if let Some(ref cache) = config.cache {
                            let entry = CacheEntry::from(file.clone());
                            if let Err(e) = cache.insert_prehash(&entry, prehash) {
                                log::warn!(
                                    "Failed to update cache for {}: {}",
                                    file.path.display(),
                                    e
                                );
                            }
                        }
                        Some(PrehashEntry { file, prehash })
                    }
                    Err(e) => {
                        log::warn!("Failed to prehash {}: {}", file.path.display(), e);
                        None
                    }
                }
            })
            .collect()
    };

    match rayon::ThreadPoolBuilder::new()
        .num_threads(config.io_threads)
        .build()
    {
        Ok(pool) => pool.install(hash_all),
        Err(err) => {
            // Outside `install`, parallel iterators execute on the global pool.
            log::warn!(
                "Failed to create custom thread pool ({}), using global pool with {} threads",
                err,
                rayon::current_num_threads()
            );
            hash_all()
        }
    }
}
/// Returns an owned copy of each entry's path, in input order.
#[must_use]
pub fn extract_paths(files: &[FileEntry]) -> Vec<PathBuf> {
    let mut paths = Vec::with_capacity(files.len());
    for entry in files {
        paths.push(entry.path.clone());
    }
    paths
}
/// File size in bytes (100 MiB) above which `phase3_fullhash` emits a debug
/// log line before processing the file.
const LARGE_FILE_THRESHOLD: u64 = 100 * 1024 * 1024;
/// Configuration consumed by [`phase3_fullhash`].
#[derive(Clone)]
pub struct FullhashConfig {
/// Thread count handed to the rayon thread pool builder for hashing work.
pub io_threads: usize,
/// Optional hash cache; queried before hashing and updated after a fresh hash.
pub cache: Option<Arc<HashCache>>,
/// Optional flag; once it reads `true`, files not yet processed are skipped.
pub shutdown_flag: Option<Arc<AtomicBool>>,
/// Optional observer notified of phase start/end and per-file progress.
pub progress_callback: Option<Arc<dyn ProgressCallback>>,
/// Paths cloned into every `DuplicateGroup` created by `phase3_fullhash`.
pub reference_paths: Vec<PathBuf>,
}
impl std::fmt::Debug for FullhashConfig {
    /// Formats the configuration, printing fixed placeholders instead of the
    /// cache and callback contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only presence/absence of these two fields is shown.
        let cache_marker = self.cache.is_some().then_some("<cache>");
        let callback_marker = self.progress_callback.is_some().then_some("<callback>");

        let mut out = f.debug_struct("FullhashConfig");
        out.field("io_threads", &self.io_threads);
        out.field("cache", &cache_marker);
        out.field("shutdown_flag", &self.shutdown_flag);
        out.field("progress_callback", &callback_marker);
        out.field("reference_paths", &self.reference_paths);
        out.finish()
    }
}
impl Default for FullhashConfig {
    /// Four I/O threads, no cache, no shutdown flag, no progress callback and
    /// no reference paths.
    fn default() -> Self {
        let io_threads = 4;
        let reference_paths = Vec::new();
        Self {
            io_threads,
            cache: None,
            shutdown_flag: None,
            progress_callback: None,
            reference_paths,
        }
    }
}
impl FullhashConfig {
    /// Sets the number of I/O threads; values below 1 are raised to 1.
    #[must_use]
    pub fn with_io_threads(self, threads: usize) -> Self {
        Self {
            io_threads: threads.max(1),
            ..self
        }
    }

    /// Attaches a hash cache.
    #[must_use]
    pub fn with_cache(self, cache: Arc<HashCache>) -> Self {
        Self {
            cache: Some(cache),
            ..self
        }
    }

    /// Attaches a shutdown flag that is polled while hashing.
    #[must_use]
    pub fn with_shutdown_flag(self, flag: Arc<AtomicBool>) -> Self {
        Self {
            shutdown_flag: Some(flag),
            ..self
        }
    }

    /// Attaches a progress callback.
    #[must_use]
    pub fn with_progress_callback(self, callback: Arc<dyn ProgressCallback>) -> Self {
        Self {
            progress_callback: Some(callback),
            ..self
        }
    }

    /// Replaces the list of reference paths.
    #[must_use]
    pub fn with_reference_paths(self, paths: Vec<PathBuf>) -> Self {
        Self {
            reference_paths: paths,
            ..self
        }
    }

    /// Returns `true` when a shutdown flag is present and currently set.
    fn is_shutdown_requested(&self) -> bool {
        match &self.shutdown_flag {
            Some(flag) => flag.load(Ordering::SeqCst),
            None => false,
        }
    }
}
/// Counters produced by [`phase3_fullhash`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FullhashStats {
/// Number of files handed to the phase.
pub input_files: usize,
/// Files for which a full hash was obtained (computed or from the cache).
pub hashed_files: usize,
/// Files without a full hash: hashing failed or the file was skipped after
/// a shutdown request.
pub failed_files: usize,
/// Full hashes served from the cache.
pub cache_hits: usize,
/// Full hashes that had to be computed.
pub cache_misses: usize,
/// Sum of the sizes of all files counted in `hashed_files` (cache hits
/// included).
pub bytes_hashed: u64,
/// Number of duplicate groups found.
pub duplicate_groups: usize,
/// Sum of `duplicate_count()` over all duplicate groups.
pub duplicate_files: usize,
/// Sum of `wasted_space()` over all duplicate groups.
pub wasted_space: u64,
/// `true` if the shutdown flag was set when the hashing pass finished.
pub interrupted: bool,
}
impl FullhashStats {
    /// Overwrites the duplicate-related counters (`duplicate_groups`,
    /// `duplicate_files`, `wasted_space`) with totals taken from `groups`.
    pub fn calculate_wasted_space(&mut self, groups: &[super::DuplicateGroup]) {
        self.duplicate_groups = groups.len();

        let mut duplicates = 0;
        for group in groups {
            duplicates += group.duplicate_count();
        }
        self.duplicate_files = duplicates;

        let mut wasted = 0;
        for group in groups {
            wasted += group.wasted_space();
        }
        self.wasted_space = wasted;
    }
}
/// Phase 3: computes the full hash of every file in the prehash groups and
/// builds the final duplicate groups (full-hash groups with more than one
/// file).
///
/// For each file the cache (if configured) is consulted first; on a miss the
/// full hash is computed with `hasher` and written back to the cache together
/// with the file's prehash. Cache errors are logged and otherwise ignored.
/// Files that cannot be hashed, or that are skipped because the shutdown flag
/// is set, are counted in `failed_files`.
///
/// Hashing runs on a dedicated rayon pool with `config.io_threads` threads.
/// If that pool cannot be created the work runs on rayon's global pool
/// instead of panicking.
///
/// Returns the duplicate groups and the phase statistics. `stats.interrupted`
/// is set when the shutdown flag is raised at the end of the hashing pass.
#[must_use]
pub fn phase3_fullhash(
    prehash_groups: HashMap<Hash, Vec<FileEntry>>,
    hasher: Arc<Hasher>,
    config: FullhashConfig,
) -> (Vec<super::DuplicateGroup>, FullhashStats) {
    let input_files: usize = prehash_groups.values().map(Vec::len).sum();
    let mut stats = FullhashStats {
        input_files,
        ..Default::default()
    };

    // Flatten to (file, prehash) pairs so the prehash can be stored in the
    // cache entry next to the full hash.
    let all_files: Vec<(FileEntry, Hash)> = prehash_groups
        .into_iter()
        .flat_map(|(hash, files)| files.into_iter().map(move |f| (f, hash)))
        .collect();
    if all_files.is_empty() {
        log::debug!("Phase 3: No files to process");
        return (Vec::new(), stats);
    }

    if let Some(ref callback) = config.progress_callback {
        callback.on_phase_start("fullhash", all_files.len());
    }
    log::info!(
        "Phase 3: Computing full hashes for {} files",
        all_files.len()
    );

    // The parallel pass, expressed once so it can run either inside the
    // dedicated pool or directly on the global pool.
    // Result tuple: (file, full hash if obtained, whether it came from the cache).
    let hash_all = || -> Vec<(FileEntry, Option<Hash>, bool)> {
        all_files
            .into_par_iter()
            .enumerate()
            .map(|(idx, (file, prehash))| {
                if config.is_shutdown_requested() {
                    log::debug!("Phase 3: Shutdown requested, skipping remaining files");
                    return (file, None, false);
                }
                if file.size > LARGE_FILE_THRESHOLD {
                    log::debug!(
                        "Hashing large file ({} MB): {}",
                        file.size / (1024 * 1024),
                        file.path.display()
                    );
                }
                if let Some(ref callback) = config.progress_callback {
                    callback.on_progress(idx + 1, file.path.to_string_lossy().as_ref());
                }
                if let Some(ref cache) = config.cache {
                    match cache.get_fullhash(&file.path, file.size, file.modified) {
                        Ok(Some(hash)) => {
                            log::trace!("Full hash cache hit: {}", file.path.display());
                            return (file, Some(hash), true);
                        }
                        Ok(None) => {
                            log::trace!("Full hash cache miss: {}", file.path.display());
                        }
                        Err(e) => {
                            log::warn!("Failed to query cache for {}: {}", file.path.display(), e);
                        }
                    }
                }
                match hasher.full_hash(&file.path) {
                    Ok(hash) => {
                        log::trace!("Full hash computed: {}", file.path.display());
                        // Only freshly computed hashes report completed bytes;
                        // cache hits return earlier without this call.
                        if let Some(ref callback) = config.progress_callback {
                            callback.on_item_completed(file.size);
                        }
                        if let Some(ref cache) = config.cache {
                            let mut entry = CacheEntry::from(file.clone());
                            entry.prehash = prehash;
                            if let Err(e) = cache.insert_fullhash(&entry, hash) {
                                log::warn!(
                                    "Failed to update cache for {}: {}",
                                    file.path.display(),
                                    e
                                );
                            }
                        }
                        (file, Some(hash), false)
                    }
                    Err(e) => {
                        log::warn!("Failed to hash {}: {}", file.path.display(), e);
                        (file, None, false)
                    }
                }
            })
            .collect()
    };

    let hash_results = match rayon::ThreadPoolBuilder::new()
        .num_threads(config.io_threads)
        .build()
    {
        Ok(pool) => pool.install(hash_all),
        Err(err) => {
            // Outside `install`, parallel iterators execute on the global pool.
            log::warn!(
                "Failed to create custom thread pool ({}), using global pool with {} threads",
                err,
                rayon::current_num_threads()
            );
            hash_all()
        }
    };

    if config.is_shutdown_requested() {
        stats.interrupted = true;
        log::info!("Phase 3: Interrupted by shutdown signal");
    }

    let mut fullhash_groups: HashMap<Hash, Vec<FileEntry>> = HashMap::new();
    for (file, fullhash_opt, is_hit) in hash_results {
        match fullhash_opt {
            Some(fullhash) => {
                stats.hashed_files += 1;
                stats.bytes_hashed += file.size;
                if is_hit {
                    stats.cache_hits += 1;
                } else {
                    stats.cache_misses += 1;
                }
                fullhash_groups.entry(fullhash).or_default().push(file);
            }
            None => {
                stats.failed_files += 1;
            }
        }
    }

    let duplicate_groups: Vec<super::DuplicateGroup> = fullhash_groups
        .into_iter()
        .filter(|(_, files)| files.len() > 1)
        .map(|(hash, files)| {
            // The group size is taken from its first member.
            let size = files.first().map_or(0, |f| f.size);
            log::debug!(
                "Duplicate group {}: {} files, {} bytes each",
                crate::scanner::hash_to_hex(&hash),
                files.len(),
                size
            );
            super::DuplicateGroup::new(hash, size, files, config.reference_paths.clone())
        })
        .collect();
    stats.calculate_wasted_space(&duplicate_groups);

    if let Some(ref callback) = config.progress_callback {
        callback.on_phase_end("fullhash");
    }
    log::info!(
        "Phase 3 complete: {} groups, {} duplicates, {} bytes reclaimable",
        stats.duplicate_groups,
        stats.duplicate_files,
        stats.wasted_space
    );
    (duplicate_groups, stats)
}
/// Configuration for [`DuplicateFinder`].
#[derive(Clone)]
pub struct FinderConfig {
/// Thread count forwarded to the prehash and full-hash phases.
pub io_threads: usize,
/// Optional hash cache forwarded to the prehash and full-hash phases.
pub cache: Option<Arc<HashCache>>,
/// Paranoid-mode switch. NOTE(review): not read by any code visible in this
/// file — confirm where it takes effect.
pub paranoid: bool,
/// Configuration passed to the directory walker.
pub walker_config: crate::scanner::WalkerConfig,
/// Optional flag; when it reads `true` the scan stops with
/// `FinderError::Interrupted`.
pub shutdown_flag: Option<Arc<AtomicBool>>,
/// Optional observer notified of phase start/end and per-file progress.
pub progress_callback: Option<Arc<dyn ProgressCallback>>,
/// Reference paths forwarded to the prehash and full-hash phase configs.
pub reference_paths: Vec<PathBuf>,
}
impl std::fmt::Debug for FinderConfig {
    /// Formats the configuration, printing fixed placeholders instead of the
    /// cache and callback contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only presence/absence of these two fields is shown.
        let cache_marker = match self.cache {
            Some(_) => Some("<cache>"),
            None => None,
        };
        let callback_marker = match self.progress_callback {
            Some(_) => Some("<callback>"),
            None => None,
        };

        let mut out = f.debug_struct("FinderConfig");
        out.field("io_threads", &self.io_threads);
        out.field("cache", &cache_marker);
        out.field("paranoid", &self.paranoid);
        out.field("walker_config", &self.walker_config);
        out.field("shutdown_flag", &self.shutdown_flag);
        out.field("progress_callback", &callback_marker);
        out.field("reference_paths", &self.reference_paths);
        out.finish()
    }
}
impl Default for FinderConfig {
    /// Four I/O threads, paranoid mode off, default walker configuration and
    /// no cache, shutdown flag, progress callback or reference paths.
    fn default() -> Self {
        let io_threads = 4;
        let walker_config = crate::scanner::WalkerConfig::default();
        let reference_paths = Vec::new();
        Self {
            io_threads,
            cache: None,
            paranoid: false,
            walker_config,
            shutdown_flag: None,
            progress_callback: None,
            reference_paths,
        }
    }
}
impl FinderConfig {
    /// Sets the number of I/O threads; values below 1 are raised to 1.
    #[must_use]
    pub fn with_io_threads(self, threads: usize) -> Self {
        Self {
            io_threads: threads.max(1),
            ..self
        }
    }

    /// Attaches a hash cache.
    #[must_use]
    pub fn with_cache(self, cache: Arc<HashCache>) -> Self {
        Self {
            cache: Some(cache),
            ..self
        }
    }

    /// Turns paranoid mode on or off.
    #[must_use]
    pub fn with_paranoid(self, enabled: bool) -> Self {
        Self {
            paranoid: enabled,
            ..self
        }
    }

    /// Replaces the directory walker configuration.
    #[must_use]
    pub fn with_walker_config(self, config: crate::scanner::WalkerConfig) -> Self {
        Self {
            walker_config: config,
            ..self
        }
    }

    /// Attaches a shutdown flag that is polled between and during phases.
    #[must_use]
    pub fn with_shutdown_flag(self, flag: Arc<AtomicBool>) -> Self {
        Self {
            shutdown_flag: Some(flag),
            ..self
        }
    }

    /// Attaches a progress callback.
    #[must_use]
    pub fn with_progress_callback(self, callback: Arc<dyn ProgressCallback>) -> Self {
        Self {
            progress_callback: Some(callback),
            ..self
        }
    }

    /// Replaces the list of reference paths.
    #[must_use]
    pub fn with_reference_paths(self, paths: Vec<PathBuf>) -> Self {
        Self {
            reference_paths: paths,
            ..self
        }
    }

    /// Returns `true` when a shutdown flag is present and currently set.
    fn is_shutdown_requested(&self) -> bool {
        match &self.shutdown_flag {
            Some(flag) => flag.load(Ordering::SeqCst),
            None => false,
        }
    }
}
/// Aggregate result of a duplicate scan, filled in by `DuplicateFinder`.
#[derive(Debug, Clone, Default)]
pub struct ScanSummary {
/// Number of files that entered the pipeline.
pub total_files: usize,
/// Sum of the sizes of those files, in bytes.
pub total_size: u64,
/// Files eliminated by size grouping (`eliminated_unique` of the size stats).
pub eliminated_by_size: usize,
/// Files eliminated because their prehash was unique.
pub eliminated_by_prehash: usize,
/// Prehash cache hits.
pub cache_prehash_hits: usize,
/// Prehash cache misses.
pub cache_prehash_misses: usize,
/// Full-hash cache hits.
pub cache_fullhash_hits: usize,
/// Full-hash cache misses.
pub cache_fullhash_misses: usize,
/// Number of duplicate groups found.
pub duplicate_groups: usize,
/// Number of duplicate files reported by the full-hash phase.
pub duplicate_files: usize,
/// Bytes reported as wasted by the full-hash phase.
pub reclaimable_space: u64,
/// Wall-clock time from the start of the scan to its completion.
pub scan_duration: std::time::Duration,
/// Interruption marker. NOTE(review): never set to `true` by code visible in
/// this file (interrupted scans return an error instead) — confirm usage.
pub interrupted: bool,
}
impl ScanSummary {
    /// Reclaimable space as a percentage of the total scanned size.
    ///
    /// Returns `0.0` when the total size is zero.
    #[must_use]
    pub fn wasted_percentage(&self) -> f64 {
        match self.total_size {
            0 => 0.0,
            total => (self.reclaimable_space as f64 / total as f64) * 100.0,
        }
    }

    /// Reclaimable space formatted for display (e.g. `"1.50 MB"`).
    #[must_use]
    pub fn reclaimable_display(&self) -> String {
        let bytes = self.reclaimable_space;
        format_size(bytes)
    }

    /// Total scanned size formatted for display (e.g. `"1.50 MB"`).
    #[must_use]
    pub fn total_size_display(&self) -> String {
        let bytes = self.total_size;
        format_size(bytes)
    }
}
/// Renders a byte count using binary (1024-based) units.
///
/// Values below 1024 are printed as whole bytes (`"512 B"`); larger values
/// use the largest unit of KB/MB/GB/TB that fits, with two decimals.
fn format_size(bytes: u64) -> String {
    // Largest unit first so the first match is the best fit.
    const UNITS: [(u64, &str); 4] = [
        (1 << 40, "TB"),
        (1 << 30, "GB"),
        (1 << 20, "MB"),
        (1 << 10, "KB"),
    ];

    for (threshold, unit) in UNITS {
        if bytes >= threshold {
            return format!("{:.2} {}", bytes as f64 / threshold as f64, unit);
        }
    }
    format!("{} B", bytes)
}
/// Errors returned by [`DuplicateFinder`].
#[derive(thiserror::Error, Debug)]
pub enum FinderError {
/// The shutdown flag was set, or a hashing phase reported an interruption.
#[error("Scan interrupted by user")]
Interrupted,
/// The path passed to `find_duplicates` does not exist.
#[error("Path not found: {0}")]
PathNotFound(PathBuf),
/// The path passed to `find_duplicates` exists but is not a directory.
#[error("Not a directory: {0}")]
NotADirectory(PathBuf),
/// Wrapper for an I/O error; convertible from `std::io::Error` via `From`.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
}
/// Runs the duplicate-detection pipeline (size grouping, prehash, full hash)
/// using a fixed configuration and a shared hasher.
pub struct DuplicateFinder {
// Settings forwarded to the walker and to each hashing phase.
config: FinderConfig,
// Hasher shared with the hashing phases; built in `new`.
hasher: Arc<Hasher>,
}
impl DuplicateFinder {
#[must_use]
pub fn new(config: FinderConfig) -> Self {
let mut hasher = Hasher::new();
if let Some(ref flag) = config.shutdown_flag {
hasher = hasher.with_shutdown_flag(flag.clone());
}
Self {
config,
hasher: Arc::new(hasher),
}
}
#[must_use]
pub fn with_defaults() -> Self {
Self::new(FinderConfig::default())
}
pub fn find_duplicates(
&self,
path: &std::path::Path,
) -> Result<(Vec<super::DuplicateGroup>, ScanSummary), FinderError> {
let start_time = std::time::Instant::now();
let mut summary = ScanSummary::default();
if !path.exists() {
return Err(FinderError::PathNotFound(path.to_path_buf()));
}
if !path.is_dir() {
return Err(FinderError::NotADirectory(path.to_path_buf()));
}
log::info!("Starting duplicate scan of {}", path.display());
if self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
log::info!("Phase 0: Walking directory...");
if let Some(ref callback) = self.config.progress_callback {
callback.on_phase_start("walking", 0); }
let walker = crate::scanner::Walker::new(path, self.config.walker_config.clone());
let walker = if let Some(ref flag) = self.config.shutdown_flag {
walker.with_shutdown_flag(flag.clone())
} else {
walker
};
let files: Vec<FileEntry> = walker.walk().filter_map(|r| r.ok()).collect();
if let Some(ref callback) = self.config.progress_callback {
callback.on_phase_end("walking");
}
summary.total_files = files.len();
summary.total_size = files.iter().map(|f| f.size).sum();
log::info!(
"Found {} files ({} total)",
files.len(),
format_size(summary.total_size)
);
if self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if files.is_empty() {
log::info!("No files found, scan complete");
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
log::info!("Phase 1: Grouping by size...");
let (size_groups, size_stats) = super::group_by_size(files);
summary.eliminated_by_size = size_stats.eliminated_unique;
log::info!(
"Phase 1 complete: {} → {} files ({:.1}% eliminated)",
size_stats.total_files,
size_stats.potential_duplicates,
size_stats.elimination_rate()
);
if self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if size_groups.is_empty() {
log::info!("No potential duplicates found after size grouping");
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
log::info!("Phase 2: Computing prehashes...");
let prehash_config = PrehashConfig {
io_threads: self.config.io_threads,
cache: self.config.cache.clone(),
shutdown_flag: self.config.shutdown_flag.clone(),
progress_callback: self.config.progress_callback.clone(),
reference_paths: self.config.reference_paths.clone(),
};
let (prehash_groups, prehash_stats) =
phase2_prehash(size_groups, self.hasher.clone(), prehash_config);
summary.eliminated_by_prehash = prehash_stats.unique_prehashes;
summary.cache_prehash_hits = prehash_stats.cache_hits;
summary.cache_prehash_misses = prehash_stats.cache_misses;
if prehash_stats.interrupted || self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if prehash_groups.is_empty() {
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
let fullhash_config = FullhashConfig {
io_threads: self.config.io_threads,
cache: self.config.cache.clone(),
shutdown_flag: self.config.shutdown_flag.clone(),
progress_callback: self.config.progress_callback.clone(),
reference_paths: self.config.reference_paths.clone(),
};
let (duplicate_groups, fullhash_stats) =
phase3_fullhash(prehash_groups, self.hasher.clone(), fullhash_config);
if fullhash_stats.interrupted || self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
summary.duplicate_groups = fullhash_stats.duplicate_groups;
summary.duplicate_files = fullhash_stats.duplicate_files;
summary.reclaimable_space = fullhash_stats.wasted_space;
summary.cache_fullhash_hits = fullhash_stats.cache_hits;
summary.cache_fullhash_misses = fullhash_stats.cache_misses;
summary.scan_duration = start_time.elapsed();
log::info!(
"Scan complete: {} duplicate groups, {} duplicate files, {} reclaimable, {} cache hits",
summary.duplicate_groups,
summary.duplicate_files,
summary.reclaimable_display(),
summary.cache_prehash_hits + summary.cache_fullhash_hits
);
Ok((duplicate_groups, summary))
}
pub fn find_duplicates_from_files(
&self,
files: Vec<FileEntry>,
) -> Result<(Vec<super::DuplicateGroup>, ScanSummary), FinderError> {
let start_time = std::time::Instant::now();
let total_files = files.len();
let total_size: u64 = files.iter().map(|f| f.size).sum();
let mut summary = ScanSummary {
total_files,
total_size,
..Default::default()
};
log::info!(
"Processing {} files ({})",
files.len(),
format_size(summary.total_size)
);
if self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if files.is_empty() {
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
let (size_groups, size_stats) = super::group_by_size(files);
summary.eliminated_by_size = size_stats.eliminated_unique;
if self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if size_groups.is_empty() {
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
let prehash_config = PrehashConfig {
io_threads: self.config.io_threads,
cache: self.config.cache.clone(),
shutdown_flag: self.config.shutdown_flag.clone(),
progress_callback: self.config.progress_callback.clone(),
reference_paths: self.config.reference_paths.clone(),
};
let (prehash_groups, prehash_stats) =
phase2_prehash(size_groups, self.hasher.clone(), prehash_config);
summary.eliminated_by_prehash = prehash_stats.unique_prehashes;
summary.cache_prehash_hits = prehash_stats.cache_hits;
summary.cache_prehash_misses = prehash_stats.cache_misses;
if prehash_stats.interrupted || self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
if prehash_groups.is_empty() {
summary.scan_duration = start_time.elapsed();
return Ok((Vec::new(), summary));
}
let fullhash_config = FullhashConfig {
io_threads: self.config.io_threads,
cache: self.config.cache.clone(),
shutdown_flag: self.config.shutdown_flag.clone(),
progress_callback: self.config.progress_callback.clone(),
reference_paths: self.config.reference_paths.clone(),
};
let (duplicate_groups, fullhash_stats) =
phase3_fullhash(prehash_groups, self.hasher.clone(), fullhash_config);
if fullhash_stats.interrupted || self.config.is_shutdown_requested() {
return Err(FinderError::Interrupted);
}
summary.duplicate_groups = fullhash_stats.duplicate_groups;
summary.duplicate_files = fullhash_stats.duplicate_files;
summary.reclaimable_space = fullhash_stats.wasted_space;
summary.cache_fullhash_hits = fullhash_stats.cache_hits;
summary.cache_fullhash_misses = fullhash_stats.cache_misses;
summary.scan_duration = start_time.elapsed();
Ok((duplicate_groups, summary))
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::Write;
use std::time::SystemTime;
use tempfile::TempDir;
fn make_file_entry(path: &str, size: u64) -> FileEntry {
FileEntry::new(std::path::PathBuf::from(path), size, SystemTime::now())
}
fn create_test_file(dir: &TempDir, name: &str, content: &[u8]) -> FileEntry {
let path = dir.path().join(name);
let mut f = File::create(&path).unwrap();
f.write_all(content).unwrap();
FileEntry::new(path, content.len() as u64, SystemTime::now())
}
#[test]
fn test_prehash_config_default() {
let config = PrehashConfig::default();
assert_eq!(config.io_threads, 4);
assert!(config.shutdown_flag.is_none());
assert!(config.progress_callback.is_none());
}
#[test]
fn test_prehash_config_builder() {
let shutdown = Arc::new(AtomicBool::new(false));
let config = PrehashConfig::default()
.with_io_threads(8)
.with_shutdown_flag(shutdown.clone());
assert_eq!(config.io_threads, 8);
assert!(config.shutdown_flag.is_some());
}
#[test]
fn test_prehash_stats_default() {
let stats = PrehashStats::default();
assert_eq!(stats.input_files, 0);
assert_eq!(stats.hashed_files, 0);
assert_eq!(stats.elimination_rate(), 0.0);
}
#[test]
fn test_prehash_stats_elimination_rate() {
let stats = PrehashStats {
input_files: 100,
hashed_files: 100,
failed_files: 0,
cache_hits: 0,
cache_misses: 100,
unique_prehashes: 80,
potential_duplicates: 20,
duplicate_groups: 5,
interrupted: false,
};
assert!((stats.elimination_rate() - 80.0).abs() < 0.1);
}
#[test]
fn test_phase2_empty_input() {
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(HashMap::new(), hasher, config);
assert!(groups.is_empty());
assert_eq!(stats.input_files, 0);
assert_eq!(stats.potential_duplicates, 0);
}
#[test]
fn test_phase2_identical_files() {
let dir = TempDir::new().unwrap();
let content = b"identical content for testing";
let file1 = create_test_file(&dir, "file1.txt", content);
let file2 = create_test_file(&dir, "file2.txt", content);
let mut size_groups = HashMap::new();
size_groups.insert(content.len() as u64, vec![file1, file2]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(size_groups, hasher, config);
assert_eq!(groups.len(), 1);
assert_eq!(stats.input_files, 2);
assert_eq!(stats.potential_duplicates, 2);
assert_eq!(stats.duplicate_groups, 1);
}
#[test]
fn test_phase2_different_files() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "file1.txt", b"content A is here");
let file2 = create_test_file(&dir, "file2.txt", b"content B is here");
let mut size_groups = HashMap::new();
size_groups.insert(17, vec![file1, file2]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(size_groups, hasher, config);
assert!(groups.is_empty());
assert_eq!(stats.input_files, 2);
assert_eq!(stats.potential_duplicates, 0);
assert_eq!(stats.unique_prehashes, 2);
}
#[test]
fn test_phase2_mixed_files() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "dup1.txt", b"duplicate content");
let file2 = create_test_file(&dir, "dup2.txt", b"duplicate content");
let file3 = create_test_file(&dir, "unique.txt", b"uniqueee content");
let mut size_groups = HashMap::new();
size_groups.insert(17, vec![file1, file2, file3]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(size_groups, hasher, config);
assert_eq!(groups.len(), 1);
assert_eq!(stats.input_files, 3);
assert_eq!(stats.potential_duplicates, 2);
assert_eq!(stats.unique_prehashes, 1);
}
#[test]
fn test_phase2_handles_missing_file() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "exists.txt", b"real content");
let file2 = make_file_entry(dir.path().join("missing.txt").to_str().unwrap(), 12);
let mut size_groups = HashMap::new();
size_groups.insert(12, vec![file1, file2]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(size_groups, hasher, config);
assert!(groups.is_empty());
assert_eq!(stats.input_files, 2);
assert_eq!(stats.hashed_files, 1);
assert_eq!(stats.failed_files, 1);
}
#[test]
fn test_phase2_shutdown_flag() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "file1.txt", b"content");
let file2 = create_test_file(&dir, "file2.txt", b"content");
let mut size_groups = HashMap::new();
size_groups.insert(7, vec![file1, file2]);
let shutdown = Arc::new(AtomicBool::new(true)); let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default().with_shutdown_flag(shutdown);
let (_, stats) = phase2_prehash(size_groups, hasher, config);
assert!(stats.interrupted);
}
#[test]
fn test_phase2_multiple_size_groups() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "a1.txt", b"1234567890");
let file2 = create_test_file(&dir, "a2.txt", b"1234567890");
let file3 = create_test_file(&dir, "b1.txt", b"12345");
let file4 = create_test_file(&dir, "b2.txt", b"12345");
let mut size_groups = HashMap::new();
size_groups.insert(10, vec![file1, file2]);
size_groups.insert(5, vec![file3, file4]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let (groups, stats) = phase2_prehash(size_groups, hasher, config);
assert_eq!(groups.len(), 2);
assert_eq!(stats.input_files, 4);
assert_eq!(stats.potential_duplicates, 4);
assert_eq!(stats.duplicate_groups, 2);
}
#[test]
fn test_compute_prehashes() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "file1.txt", b"test content");
let file2 = create_test_file(&dir, "file2.txt", b"test content");
let mut size_groups = HashMap::new();
size_groups.insert(12, vec![file1, file2]);
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default();
let entries = compute_prehashes(size_groups, hasher, config);
assert_eq!(entries.len(), 2);
assert_eq!(entries[0].prehash, entries[1].prehash);
}
#[test]
fn test_extract_paths() {
let files = vec![
make_file_entry("/path/a.txt", 100),
make_file_entry("/path/b.txt", 100),
];
let paths = extract_paths(&files);
assert_eq!(paths.len(), 2);
assert_eq!(paths[0], std::path::PathBuf::from("/path/a.txt"));
assert_eq!(paths[1], std::path::PathBuf::from("/path/b.txt"));
}
struct TestProgressCallback {
phase_started: std::sync::Mutex<bool>,
progress_count: std::sync::atomic::AtomicUsize,
phase_ended: std::sync::Mutex<bool>,
}
impl TestProgressCallback {
fn new() -> Self {
Self {
phase_started: std::sync::Mutex::new(false),
progress_count: std::sync::atomic::AtomicUsize::new(0),
phase_ended: std::sync::Mutex::new(false),
}
}
}
impl ProgressCallback for TestProgressCallback {
fn on_phase_start(&self, _phase: &str, _total: usize) {
*self.phase_started.lock().unwrap() = true;
}
fn on_progress(&self, _current: usize, _path: &str) {
self.progress_count
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
}
fn on_phase_end(&self, _phase: &str) {
*self.phase_ended.lock().unwrap() = true;
}
}
#[test]
fn test_phase2_progress_callback() {
let dir = TempDir::new().unwrap();
let file1 = create_test_file(&dir, "file1.txt", b"content");
let file2 = create_test_file(&dir, "file2.txt", b"content");
let mut size_groups = HashMap::new();
size_groups.insert(7, vec![file1, file2]);
let callback = Arc::new(TestProgressCallback::new());
let hasher = Arc::new(Hasher::new());
let config = PrehashConfig::default().with_progress_callback(callback.clone());
let (_, _) = phase2_prehash(size_groups, hasher, config);
assert!(*callback.phase_started.lock().unwrap());
assert!(callback.progress_count.load(Ordering::SeqCst) > 0);
assert!(*callback.phase_ended.lock().unwrap());
}
#[test]
fn test_fullhash_config_default() {
let config = FullhashConfig::default();
assert_eq!(config.io_threads, 4);
assert!(config.shutdown_flag.is_none());
assert!(config.progress_callback.is_none());
}
/// The `FullhashConfig` builder methods must store the values they receive.
#[test]
fn test_fullhash_config_builder() {
    let stop_flag = Arc::new(AtomicBool::new(false));

    let built = FullhashConfig::default()
        .with_io_threads(8)
        .with_shutdown_flag(Arc::clone(&stop_flag));

    assert_eq!(built.io_threads, 8);
    assert!(built.shutdown_flag.is_some());
}
/// Freshly defaulted `FullhashStats` counters must all be zero.
#[test]
fn test_fullhash_stats_default() {
    let fresh = FullhashStats::default();

    assert_eq!(fresh.input_files, 0);
    assert_eq!(fresh.hashed_files, 0);
    assert_eq!(fresh.bytes_hashed, 0);
    assert_eq!(fresh.duplicate_groups, 0);
}
/// Phase 3 on an empty prehash map must return no groups and zeroed stats.
#[test]
fn test_phase3_empty_input() {
    let (groups, stats) = phase3_fullhash(
        HashMap::new(),
        Arc::new(Hasher::new()),
        FullhashConfig::default(),
    );

    assert!(groups.is_empty());
    assert_eq!(stats.input_files, 0);
    assert_eq!(stats.duplicate_groups, 0);
}
/// Two byte-identical files under one prehash key must come out of phase 3 as
/// a single duplicate group.
#[test]
fn test_phase3_identical_files() {
    let tmp = TempDir::new().unwrap();
    let payload = b"identical content for testing duplicates";
    let first = create_test_file(&tmp, "file1.txt", payload);
    let second = create_test_file(&tmp, "file2.txt", payload);

    let hasher = Arc::new(Hasher::new());
    // Key the group by the real prehash of the first file.
    let key = hasher.prehash(&first.path).unwrap();
    let prehash_groups = HashMap::from([(key, vec![first, second])]);

    let (groups, stats) = phase3_fullhash(prehash_groups, hasher, FullhashConfig::default());

    assert_eq!(groups.len(), 1);
    assert_eq!(stats.input_files, 2);
    assert_eq!(stats.hashed_files, 2);
    assert_eq!(stats.duplicate_groups, 1);
    // Expected: one surplus copy, wasting exactly one payload's worth of bytes.
    assert_eq!(stats.duplicate_files, 1);
    assert_eq!(stats.wasted_space, payload.len() as u64);
}
/// Files grouped under the same (fabricated) prehash key but with different
/// bytes must not be reported as duplicates.
#[test]
fn test_phase3_different_content_same_prehash_size() {
    let tmp = TempDir::new().unwrap();
    let variant_a = create_test_file(&tmp, "file1.txt", b"content A for test");
    let variant_b = create_test_file(&tmp, "file2.txt", b"content B for test");

    // An all-zero key stands in for a prehash collision.
    let colliding_key = [0u8; 32];
    let prehash_groups = HashMap::from([(colliding_key, vec![variant_a, variant_b])]);

    let (groups, stats) = phase3_fullhash(
        prehash_groups,
        Arc::new(Hasher::new()),
        FullhashConfig::default(),
    );

    assert!(groups.is_empty());
    assert_eq!(stats.input_files, 2);
    assert_eq!(stats.hashed_files, 2);
    assert_eq!(stats.duplicate_groups, 0);
}
/// An entry whose file was never created must be counted as failed while the
/// existing file is still hashed.
#[test]
fn test_phase3_handles_missing_file() {
    let tmp = TempDir::new().unwrap();
    let present = create_test_file(&tmp, "exists.txt", b"real content here");

    // Entry for a path inside the temp dir that is never written to disk.
    let absent_path = tmp.path().join("missing.txt");
    let absent = make_file_entry(absent_path.to_str().unwrap(), 17);

    let shared_key = [0u8; 32];
    let prehash_groups = HashMap::from([(shared_key, vec![present, absent])]);

    let (groups, stats) = phase3_fullhash(
        prehash_groups,
        Arc::new(Hasher::new()),
        FullhashConfig::default(),
    );

    assert!(groups.is_empty());
    assert_eq!(stats.input_files, 2);
    assert_eq!(stats.hashed_files, 1);
    assert_eq!(stats.failed_files, 1);
}
/// With the shutdown flag already raised, phase 3 must report the run as
/// interrupted.
#[test]
fn test_phase3_shutdown_flag() {
    let tmp = TempDir::new().unwrap();
    let first = create_test_file(&tmp, "file1.txt", b"content");
    let second = create_test_file(&tmp, "file2.txt", b"content");

    let shared_key = [0u8; 32];
    let prehash_groups = HashMap::from([(shared_key, vec![first, second])]);

    // The flag is set before the phase ever starts.
    let stop_flag = Arc::new(AtomicBool::new(true));
    let hasher = Arc::new(Hasher::new());
    let config = FullhashConfig::default().with_shutdown_flag(stop_flag);

    let (_, stats) = phase3_fullhash(prehash_groups, hasher, config);

    assert!(stats.interrupted);
}
/// Two independent sets of identical files must yield two duplicate groups.
#[test]
fn test_phase3_multiple_duplicate_groups() {
    let tmp = TempDir::new().unwrap();
    let a1 = create_test_file(&tmp, "a1.txt", b"content group A");
    let a2 = create_test_file(&tmp, "a2.txt", b"content group A");
    let b1 = create_test_file(&tmp, "b1.txt", b"content group B");
    let b2 = create_test_file(&tmp, "b2.txt", b"content group B");
    let b3 = create_test_file(&tmp, "b3.txt", b"content group B");

    let hasher = Arc::new(Hasher::new());
    let key_a = hasher.prehash(&a1.path).unwrap();
    let key_b = hasher.prehash(&b1.path).unwrap();
    let prehash_groups = HashMap::from([
        (key_a, vec![a1, a2]),
        (key_b, vec![b1, b2, b3]),
    ]);

    let (groups, stats) = phase3_fullhash(prehash_groups, hasher, FullhashConfig::default());

    assert_eq!(groups.len(), 2);
    assert_eq!(stats.input_files, 5);
    assert_eq!(stats.hashed_files, 5);
    assert_eq!(stats.duplicate_groups, 2);
    // Expected surplus copies: one in group A plus two in group B.
    assert_eq!(stats.duplicate_files, 3);
}
/// `bytes_hashed` must equal the combined length of every hashed file.
#[test]
fn test_phase3_bytes_hashed_tracking() {
    let tmp = TempDir::new().unwrap();
    let payload = b"test content for byte tracking";
    let first = create_test_file(&tmp, "file1.txt", payload);
    let second = create_test_file(&tmp, "file2.txt", payload);

    let hasher = Arc::new(Hasher::new());
    let key = hasher.prehash(&first.path).unwrap();
    let prehash_groups = HashMap::from([(key, vec![first, second])]);

    let (_, stats) = phase3_fullhash(prehash_groups, hasher, FullhashConfig::default());

    // Two files of identical length were hashed in full.
    let expected_bytes = (payload.len() * 2) as u64;
    assert_eq!(stats.bytes_hashed, expected_bytes);
}
/// Phase 3 must fire the start, progress and end hooks of an installed
/// progress callback.
#[test]
fn test_phase3_progress_callback() {
    let tmp = TempDir::new().unwrap();
    let first = create_test_file(&tmp, "file1.txt", b"content");
    let second = create_test_file(&tmp, "file2.txt", b"content");

    let hasher = Arc::new(Hasher::new());
    let key = hasher.prehash(&first.path).unwrap();
    let prehash_groups = HashMap::from([(key, vec![first, second])]);

    let recorder = Arc::new(TestProgressCallback::new());
    let config = FullhashConfig::default().with_progress_callback(recorder.clone());
    let _ = phase3_fullhash(prehash_groups, hasher, config);

    assert!(*recorder.phase_started.lock().unwrap());
    let notifications = recorder.progress_count.load(Ordering::SeqCst);
    assert!(notifications > 0);
    assert!(*recorder.phase_ended.lock().unwrap());
}
/// A default `FinderConfig` uses 4 I/O threads, is not paranoid, and has
/// neither a shutdown flag nor a progress callback.
#[test]
fn test_finder_config_default() {
    let defaults = FinderConfig::default();

    assert_eq!(defaults.io_threads, 4);
    assert!(!defaults.paranoid);
    assert!(defaults.shutdown_flag.is_none());
    assert!(defaults.progress_callback.is_none());
}
/// The `FinderConfig` builder methods must store the values they receive.
#[test]
fn test_finder_config_builder() {
    let stop_flag = Arc::new(AtomicBool::new(false));

    let built = FinderConfig::default()
        .with_io_threads(8)
        .with_paranoid(true)
        .with_shutdown_flag(Arc::clone(&stop_flag));

    assert_eq!(built.io_threads, 8);
    assert!(built.paranoid);
    assert!(built.shutdown_flag.is_some());
}
/// Requesting zero I/O threads must leave the config with exactly one.
#[test]
fn test_finder_config_io_threads_min() {
    let clamped = FinderConfig::default().with_io_threads(0);

    assert_eq!(clamped.io_threads, 1);
}
/// A default `ScanSummary` has zeroed counters and is not interrupted.
#[test]
fn test_scan_summary_default() {
    let blank = ScanSummary::default();

    assert_eq!(blank.total_files, 0);
    assert_eq!(blank.total_size, 0);
    assert_eq!(blank.duplicate_groups, 0);
    assert_eq!(blank.reclaimable_space, 0);
    assert!(!blank.interrupted);
}
/// 250 reclaimable bytes out of 1000 total must be reported as roughly 25%.
#[test]
fn test_scan_summary_wasted_percentage() {
    let summary = ScanSummary {
        total_size: 1000,
        reclaimable_space: 250,
        ..ScanSummary::default()
    };

    let deviation = (summary.wasted_percentage() - 25.0).abs();
    assert!(deviation < 0.1);
}
/// With a default (zero-sized) summary the wasted percentage must be 0.0.
#[test]
fn test_scan_summary_wasted_percentage_zero_size() {
    let empty = ScanSummary::default();

    let pct = empty.wasted_percentage();
    assert_eq!(pct, 0.0);
}
/// The display helpers must render 1,500,000 bytes with an "MB" unit and
/// 500,000 bytes with a "KB" unit.
#[test]
fn test_scan_summary_display() {
    let summary = ScanSummary {
        total_size: 1_500_000,
        reclaimable_space: 500_000,
        ..ScanSummary::default()
    };

    let total_text = summary.total_size_display();
    assert!(total_text.contains("MB"));

    let reclaimable_text = summary.reclaimable_display();
    assert!(reclaimable_text.contains("KB"));
}
/// Values below one kilobyte are rendered as a plain byte count.
#[test]
fn test_format_size_bytes() {
    let rendered = format_size(500);

    assert_eq!(rendered, "500 B");
}
/// 1024 and 2048 bytes must both be rendered with a "KB" unit.
#[test]
fn test_format_size_kilobytes() {
    for bytes in [1024, 2048] {
        assert!(format_size(bytes).contains("KB"));
    }
}
/// 1024 * 1024 bytes must be rendered with an "MB" unit.
#[test]
fn test_format_size_megabytes() {
    let rendered = format_size(1024 * 1024);

    assert!(rendered.contains("MB"));
}
/// 1024^3 bytes must be rendered with a "GB" unit.
#[test]
fn test_format_size_gigabytes() {
    let rendered = format_size(1024 * 1024 * 1024);

    assert!(rendered.contains("GB"));
}
/// 1024^4 bytes must be rendered with a "TB" unit.
#[test]
fn test_format_size_terabytes() {
    let rendered = format_size(1024 * 1024 * 1024 * 1024);

    assert!(rendered.contains("TB"));
}
/// Constructing a finder from an explicit config must leave it holding a
/// live hasher handle.
#[test]
fn test_duplicate_finder_new() {
    let finder = DuplicateFinder::new(FinderConfig::default());

    let owners = Arc::strong_count(&finder.hasher);
    assert!(owners >= 1);
}
/// `with_defaults` must leave the finder holding a live hasher handle.
#[test]
fn test_duplicate_finder_with_defaults() {
    let finder = DuplicateFinder::with_defaults();

    let owners = Arc::strong_count(&finder.hasher);
    assert!(owners >= 1);
}
/// Scanning a root that does not exist must fail with
/// `FinderError::PathNotFound` carrying the offending path.
#[test]
fn test_find_duplicates_path_not_found() {
    let finder = DuplicateFinder::with_defaults();
    let result = finder.find_duplicates(std::path::Path::new("/nonexistent/path/12345"));

    match result {
        Err(FinderError::PathNotFound(p)) => {
            assert!(p.to_string_lossy().contains("nonexistent"));
        }
        // Surface whatever actually came back (an `Ok` or a different error)
        // so a failing run is diagnosable from its message alone.
        other => panic!("Expected PathNotFound error, got {other:?}"),
    }
}
/// Scanning a regular file instead of a directory must fail with
/// `FinderError::NotADirectory`.
#[test]
fn test_find_duplicates_not_a_directory() {
    let dir = TempDir::new().unwrap();
    let file = dir.path().join("file.txt");
    std::fs::write(&file, "content").unwrap();

    let finder = DuplicateFinder::with_defaults();

    match finder.find_duplicates(&file) {
        Err(FinderError::NotADirectory(_)) => {}
        // Surface whatever actually came back (an `Ok` or a different error)
        // so a failing run is diagnosable from its message alone.
        other => panic!("Expected NotADirectory error, got {other:?}"),
    }
}
/// Scanning an empty directory succeeds with no groups and a zeroed,
/// non-interrupted summary.
#[test]
fn test_find_duplicates_empty_directory() {
    let tmp = TempDir::new().unwrap();

    let (groups, summary) = DuplicateFinder::with_defaults()
        .find_duplicates(tmp.path())
        .unwrap();

    assert!(groups.is_empty());
    assert_eq!(summary.total_files, 0);
    assert_eq!(summary.duplicate_groups, 0);
    assert!(!summary.interrupted);
}
/// Three files with distinct contents must produce no duplicate groups and
/// no reclaimable space.
#[test]
fn test_find_duplicates_no_duplicates() {
    let tmp = TempDir::new().unwrap();
    let fixtures = [
        ("a.txt", "content a"),
        ("b.txt", "content b"),
        ("c.txt", "content c"),
    ];
    for (name, body) in fixtures {
        std::fs::write(tmp.path().join(name), body).unwrap();
    }

    let (groups, summary) = DuplicateFinder::with_defaults()
        .find_duplicates(tmp.path())
        .unwrap();

    assert!(groups.is_empty());
    assert_eq!(summary.total_files, 3);
    assert_eq!(summary.duplicate_groups, 0);
    assert_eq!(summary.reclaimable_space, 0);
}
/// Two identical files plus one unique file must produce exactly one
/// duplicate group.
#[test]
fn test_find_duplicates_with_duplicates() {
    let tmp = TempDir::new().unwrap();
    let payload = b"identical content for duplicate detection";
    for name in ["dup1.txt", "dup2.txt"] {
        std::fs::write(tmp.path().join(name), payload).unwrap();
    }
    std::fs::write(tmp.path().join("unique.txt"), "different content").unwrap();

    let (groups, summary) = DuplicateFinder::with_defaults()
        .find_duplicates(tmp.path())
        .unwrap();

    assert_eq!(groups.len(), 1);
    assert_eq!(summary.total_files, 3);
    assert_eq!(summary.duplicate_groups, 1);
    // Expected: one surplus copy, reclaiming one payload's worth of bytes.
    assert_eq!(summary.duplicate_files, 1);
    assert_eq!(summary.reclaimable_space, payload.len() as u64);
}
/// Two sets of identical files in one directory must be reported as two
/// separate duplicate groups.
#[test]
fn test_find_duplicates_multiple_groups() {
    let tmp = TempDir::new().unwrap();
    let fixtures = [
        ("a1.txt", "group A content"),
        ("a2.txt", "group A content"),
        ("b1.txt", "group B content"),
        ("b2.txt", "group B content"),
        ("b3.txt", "group B content"),
    ];
    for (name, body) in fixtures {
        std::fs::write(tmp.path().join(name), body).unwrap();
    }

    let (groups, summary) = DuplicateFinder::with_defaults()
        .find_duplicates(tmp.path())
        .unwrap();

    assert_eq!(groups.len(), 2);
    assert_eq!(summary.total_files, 5);
    assert_eq!(summary.duplicate_groups, 2);
    // Expected surplus copies: one in group A plus two in group B.
    assert_eq!(summary.duplicate_files, 3);
}
/// With the shutdown flag already raised, a scan must fail with
/// `FinderError::Interrupted`.
#[test]
fn test_find_duplicates_shutdown_flag() {
    let dir = TempDir::new().unwrap();
    std::fs::write(dir.path().join("a.txt"), "content").unwrap();

    // The flag is set before the scan ever starts.
    let shutdown = Arc::new(AtomicBool::new(true));
    let config = FinderConfig::default().with_shutdown_flag(shutdown);
    let finder = DuplicateFinder::new(config);

    match finder.find_duplicates(dir.path()) {
        Err(FinderError::Interrupted) => {}
        // Surface whatever actually came back (an `Ok` or a different error)
        // so a failing run is diagnosable from its message alone.
        other => panic!("Expected Interrupted error, got {other:?}"),
    }
}
/// An empty file list must succeed with no groups and zero total files.
#[test]
fn test_find_duplicates_from_files_empty() {
    let (groups, summary) = DuplicateFinder::with_defaults()
        .find_duplicates_from_files(vec![])
        .unwrap();

    assert!(groups.is_empty());
    assert_eq!(summary.total_files, 0);
}
/// Two identical pre-collected entries must be reported as one duplicate
/// group.
#[test]
fn test_find_duplicates_from_files_with_duplicates() {
    let tmp = TempDir::new().unwrap();
    let payload = b"identical content";
    let entries = vec![
        create_test_file(&tmp, "dup1.txt", payload),
        create_test_file(&tmp, "dup2.txt", payload),
    ];

    let finder = DuplicateFinder::with_defaults();
    let (groups, summary) = finder.find_duplicates_from_files(entries).unwrap();

    assert_eq!(groups.len(), 1);
    assert_eq!(summary.duplicate_groups, 1);
    assert_eq!(summary.reclaimable_space, payload.len() as u64);
}
/// A completed scan must report a non-zero scan duration.
#[test]
fn test_find_duplicates_summary_timing() {
    let tmp = TempDir::new().unwrap();
    std::fs::write(tmp.path().join("a.txt"), "content").unwrap();

    let (_, summary) = DuplicateFinder::with_defaults()
        .find_duplicates(tmp.path())
        .unwrap();

    let elapsed_nanos = summary.scan_duration.as_nanos();
    assert!(elapsed_nanos > 0);
}
/// A full scan with a progress callback installed must find the duplicate
/// pair and fire both the phase-start and phase-end hooks.
#[test]
fn test_find_duplicates_with_progress_callback() {
    let tmp = TempDir::new().unwrap();
    let payload = b"identical content for testing";
    for name in ["dup1.txt", "dup2.txt"] {
        std::fs::write(tmp.path().join(name), payload).unwrap();
    }

    let recorder = Arc::new(TestProgressCallback::new());
    let finder =
        DuplicateFinder::new(FinderConfig::default().with_progress_callback(recorder.clone()));

    let (groups, _) = finder.find_duplicates(tmp.path()).unwrap();

    assert_eq!(groups.len(), 1);
    assert!(*recorder.phase_started.lock().unwrap());
    assert!(*recorder.phase_ended.lock().unwrap());
}
}