use anyhow::Result;
use glob::{Pattern, PatternError};
use num_cpus;
use rayon::prelude::*;
use sha1::Digest;
use std::collections::HashMap;
use std::fs::{self, File};
use std::hash::Hasher;
use std::io::{BufRead, BufReader, Read};
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::SystemTime;
use walkdir::WalkDir;
use crate::tui_app::ScanMessage;
use crate::Cli;
use std::sync::mpsc::Sender as StdMpscSender;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortCriterion {
FileName,
FileSize,
CreatedAt,
ModifiedAt,
PathLength,
}
impl FromStr for SortCriterion {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"name" => Ok(Self::FileName),
"size" => Ok(Self::FileSize),
"created" | "created_at" => Ok(Self::CreatedAt),
"modified" | "modified_at" => Ok(Self::ModifiedAt),
"path_length" => Ok(Self::PathLength),
_ => Err(anyhow::anyhow!("Invalid sort criterion: {}", s)),
}
}
}
impl std::fmt::Display for SortCriterion {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::FileName => write!(f, "name"),
Self::FileSize => write!(f, "size"),
Self::CreatedAt => write!(f, "createdat"),
Self::ModifiedAt => write!(f, "modified"),
Self::PathLength => write!(f, "path"),
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SortOrder {
Ascending,
Descending,
}
impl FromStr for SortOrder {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"asc" | "ascending" => Ok(Self::Ascending),
"desc" | "descending" => Ok(Self::Descending),
_ => Err(anyhow::anyhow!("Invalid sort order: {}", s)),
}
}
}
impl std::fmt::Display for SortOrder {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Ascending => write!(f, "ascending"),
Self::Descending => write!(f, "descending"),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub struct FileInfo {
pub path: PathBuf,
pub size: u64,
pub hash: Option<String>,
pub modified_at: Option<SystemTime>,
pub created_at: Option<SystemTime>,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct DuplicateSet {
pub files: Vec<FileInfo>,
pub size: u64,
pub hash: String,
}
#[derive(serde::Serialize, Debug)] struct HashEntryContent {
size: u64,
files: Vec<PathBuf>,
}
#[derive(Debug, Default)]
pub struct FilterRules {
includes: Vec<Pattern>,
excludes: Vec<Pattern>,
}
impl FilterRules {
pub fn new(cli: &Cli) -> Result<Self> {
let mut rules = FilterRules::default();
if let Some(filter_file_path) = &cli.filter_from {
log::info!("Loading filter rules from: {:?}", filter_file_path);
let file = File::open(filter_file_path).map_err(|e| {
anyhow::anyhow!("Failed to open filter file {:?}: {}", filter_file_path, e)
})?;
let reader = BufReader::new(file);
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result
.map_err(|e| anyhow::anyhow!("Failed to read line from filter file: {}", e))?;
let trimmed_line = line.trim();
if trimmed_line.is_empty()
|| trimmed_line.starts_with('#')
|| trimmed_line.starts_with(';')
{
continue;
}
if let Some(pattern_str) = trimmed_line.strip_prefix("+ ") {
rules.add_include(pattern_str.trim())?;
} else if let Some(pattern_str) = trimmed_line.strip_prefix("- ") {
rules.add_exclude(pattern_str.trim())?;
} else {
log::warn!(
"Invalid line in filter file {:?} at line {}: {}",
filter_file_path,
line_num + 1,
trimmed_line
);
}
}
}
for pattern_str in &cli.include {
rules.add_include(pattern_str)?;
}
for pattern_str in &cli.exclude {
rules.add_exclude(pattern_str)?;
}
if !rules.includes.is_empty() {
log::info!(
"Include rules active: {}",
rules
.includes
.iter()
.map(|p| p.as_str())
.collect::<Vec<_>>()
.join(", ")
);
}
if !rules.excludes.is_empty() {
log::info!(
"Exclude rules active: {}",
rules
.excludes
.iter()
.map(|p| p.as_str())
.collect::<Vec<_>>()
.join(", ")
);
}
Ok(rules)
}
fn add_include(&mut self, pattern_str: &str) -> Result<(), PatternError> {
match Pattern::new(pattern_str) {
Ok(p) => {
self.includes.push(p);
Ok(())
}
Err(e) => {
log::error!("Invalid include glob pattern '{}': {}", pattern_str, e);
Err(e)
}
}
}
fn add_exclude(&mut self, pattern_str: &str) -> Result<(), PatternError> {
match Pattern::new(pattern_str) {
Ok(p) => {
self.excludes.push(p);
Ok(())
}
Err(e) => {
log::error!("Invalid exclude glob pattern '{}': {}", pattern_str, e);
Err(e)
}
}
}
pub fn is_match(&self, path_str: &str) -> bool {
if self.excludes.iter().any(|p| p.matches(path_str)) {
return false;
}
if !self.includes.is_empty() {
return self.includes.iter().any(|p| p.matches(path_str));
}
true }
}
pub fn calculate_hash(path: &Path, algorithm: &str) -> Result<String> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)?;
match algorithm {
"md5" => {
let digest = md5::compute(&buffer);
Ok(format!("{:x}", digest))
}
"sha1" => {
let mut hasher = sha1::Sha1::new();
hasher.update(&buffer);
Ok(format!("{:x}", hasher.finalize()))
}
"sha256" => {
let mut hasher = sha2::Sha256::new();
hasher.update(&buffer);
Ok(format!("{:x}", hasher.finalize()))
}
"blake3" => {
let hash = blake3::hash(&buffer);
Ok(hash.to_hex().to_string())
}
"xxhash" => {
let mut hasher = twox_hash::XxHash64::default();
hasher.write(&buffer);
Ok(format!("{:016x}", hasher.finish()))
}
#[cfg(feature = "linux")]
"gxhash" => {
let mut hasher = gxhash::GxHasher::default();
hasher.write(&buffer);
Ok(format!("{:016x}", hasher.finish()))
}
#[cfg(not(feature = "linux"))]
"gxhash" => Err(anyhow::anyhow!(
"gxhash is only available on Linux platforms"
)),
"fnv1a" => {
let mut hasher = fnv::FnvHasher::default();
hasher.write(&buffer);
Ok(format!("{:016x}", hasher.finish()))
}
"crc32" => {
let result = crc32fast::hash(&buffer);
Ok(format!("{:08x}", result))
}
_ => Err(anyhow::anyhow!("Invalid hash algorithm: {}", algorithm)),
}
}
pub fn find_duplicate_files_with_progress(
cli: &Cli,
tx_progress: StdMpscSender<ScanMessage>,
) -> Result<Vec<DuplicateSet>> {
let tx_progress_for_media = tx_progress.clone();
log::info!(
"[ScanThread] Starting scan with progress updates for directory: {:?}",
cli.directories[0]
);
let filter_rules = FilterRules::new(cli)?;
let file_cache = if cli.fast_mode && cli.cache_location.is_some() {
let cache_dir = cli.cache_location.as_ref().unwrap();
match crate::file_cache::FileCache::new(cache_dir, &cli.algorithm) {
Ok(cache) => {
log::info!(
"[ScanThread] Using file cache at {:?} with {} entries",
cache_dir,
cache.len()
);
Some(std::sync::Arc::new(std::sync::Mutex::new(cache)))
}
Err(e) => {
log::warn!("[ScanThread] Failed to initialize file cache: {}", e);
None
}
}
} else {
None
};
let cache_hits = std::sync::atomic::AtomicUsize::new(0);
let send_status = move |stage: u8, msg: String| {
if tx_progress
.send(ScanMessage::StatusUpdate(stage, msg))
.is_err()
{
log::warn!("[ScanThread] Failed to send status update to TUI (channel closed).");
}
};
send_status(
0,
format!(
"Pre-scan: Counting files in {}",
cli.directories[0].display()
),
);
let total_files = match count_files_in_directory(&cli.directories[0], &filter_rules) {
Ok(count) => {
send_status(0, format!("Pre-scan complete: Found {} total files", count));
count
}
Err(e) => {
log::warn!("[ScanThread] Failed to count files: {}", e);
send_status(0, format!("Pre-scan failed: {}", e));
0 }
};
send_status(
1,
format!(
"Stage 1/3: 📁 Starting file discovery in {} (0/{} files)",
cli.directories[0].display(),
if total_files > 0 {
total_files.to_string()
} else {
"?".to_string()
}
),
);
let mut files_by_size: HashMap<u64, Vec<PathBuf>> = HashMap::new();
let walker = WalkDir::new(&cli.directories[0]).into_iter();
let mut files_scanned_count = 0;
let mut last_update_time = std::time::Instant::now();
let update_interval = std::time::Duration::from_millis(400);
for entry in walker
.filter_entry(|e| {
if is_hidden(e) || is_symlink(e) {
return false;
}
if let Some(path_str) = e.path().to_str() {
filter_rules.is_match(path_str)
} else {
log::warn!(
"[ScanThread] Path {:?} is not valid UTF-8, excluding.",
e.path()
);
false
}
})
.flatten()
{
if entry.file_type().is_file() {
let path = entry.path().to_path_buf();
files_scanned_count += 1;
let should_update = if files_scanned_count < 100 {
files_scanned_count % 10 == 0
} else if files_scanned_count < 500 {
files_scanned_count % 20 == 0
} else if files_scanned_count < 1000 {
files_scanned_count % 50 == 0
} else if files_scanned_count < 5000 {
files_scanned_count % 100 == 0
} else if files_scanned_count < 10000 {
files_scanned_count % 200 == 0
} else if files_scanned_count < 50000 {
files_scanned_count % 500 == 0
} else {
files_scanned_count % 1000 == 0
};
if should_update || last_update_time.elapsed() >= update_interval {
last_update_time = std::time::Instant::now();
if total_files > 0 {
let percent = (files_scanned_count as f64 / total_files as f64) * 100.0;
send_status(
1,
format!(
"Stage 1/3: 📁 Scanning files: {}/{} ({:.1}%)",
files_scanned_count, total_files, percent
),
);
} else {
send_status(
1,
format!("Stage 1/3: 📁 Found {} files...", files_scanned_count),
);
}
}
match fs::metadata(&path) {
Ok(metadata) => {
if metadata.len() > 0 {
files_by_size.entry(metadata.len()).or_default().push(path);
}
}
Err(e) => {
log::warn!("[ScanThread] Failed to get metadata for {:?}: {}", path, e)
}
}
}
}
let file_count = files_by_size.values().map(|v| v.len()).sum::<usize>();
let size_group_count = files_by_size.len();
if total_files > 0 {
let percent_found = (files_scanned_count as f64 / total_files as f64) * 100.0;
send_status(
1,
format!(
"Stage 1/3: 📁 File discovery complete. Found {} files ({:.1}%) in {} size groups.",
file_count, percent_found, size_group_count
),
);
} else {
send_status(
1,
format!(
"Stage 1/3: 📁 File discovery complete. Found {} files in {} size groups.",
file_count, size_group_count
),
);
}
log::info!(
"[ScanThread] Found {} files matching criteria, grouped into {} unique file sizes.",
file_count,
size_group_count
);
let mut duplicate_sets: Vec<DuplicateSet> = Vec::new();
let potential_duplicates: Vec<_> = files_by_size
.into_iter()
.filter(|(_, paths)| paths.len() > 1)
.collect();
let _potential_duplicate_count = potential_duplicates.len();
if potential_duplicates.is_empty() {
send_status(
3,
"Scan complete. No potential duplicates found.".to_string(),
);
log::info!("[ScanThread] No potential duplicates found after size grouping.");
if cli.media_mode && cli.media_dedup_options.enabled {
let tx_clone = tx_progress_for_media.clone();
return find_similar_media_files_with_progress(cli, tx_clone);
}
return Ok(Vec::new());
}
let potential_groups = potential_duplicates.len();
let potential_files: usize = potential_duplicates
.iter()
.map(|(_, paths)| paths.len())
.sum();
send_status(
2,
format!(
"Stage 2/3: 🔍 Found {} size groups with {} potential duplicate files",
potential_groups, potential_files
),
);
log::info!(
"[ScanThread] Found {} sizes with potential duplicates. Calculating hashes...",
potential_groups
);
let num_threads = cli.parallel.unwrap_or_else(num_cpus::get);
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.build()?;
log::info!("[ScanThread] Using {} threads for hashing.", num_threads);
let (local_tx, local_rx) = std::sync::mpsc::channel::<Result<HashMap<String, Vec<FileInfo>>>>();
let total_groups_to_hash = potential_duplicates.len();
let mut groups_hashed_count = 0;
let total_files_to_hash = potential_duplicates
.iter()
.map(|(_, paths)| paths.len())
.sum::<usize>();
send_status(
3,
format!(
"Stage 3/3: 🔄 Hashing {} files across {} size groups (using {} threads)...",
total_files_to_hash, total_groups_to_hash, num_threads
),
);
let mut all_file_infos = Vec::new();
pool.install(|| {
potential_duplicates
.into_par_iter()
.for_each_with(local_tx, |thread_local_tx, (size, paths)| {
let mut hashes_in_group: HashMap<String, Vec<FileInfo>> = HashMap::new();
let mut thread_cache_hits = 0;
for path in paths {
let mut hash_from_cache = None;
if let Some(cache) = file_cache.as_ref() {
if let Ok(cache_guard) = cache.lock() {
hash_from_cache = cache_guard.get_file_info(&path);
if hash_from_cache.is_some() {
thread_cache_hits += 1;
}
}
}
match hash_from_cache {
Some(file_info) => {
if let Some(hash_str) = &file_info.hash {
hashes_in_group.entry(hash_str.clone()).or_default().push(file_info);
}
},
None => match calculate_hash(&path, &cli.algorithm) {
Ok(hash_str) => {
let metadata = match fs::metadata(&path) {
Ok(m) => m,
Err(e) => {
log::warn!("Failed to get metadata for {:?}: {}", path, e);
continue;
}
};
let file_info = FileInfo {
path: path.clone(),
size,
hash: Some(hash_str.clone()),
modified_at: metadata.modified().ok(),
created_at: metadata.created().ok(),
};
if let Some(cache) = &file_cache {
if let Ok(mut cache_guard) = cache.lock() {
let _ = cache_guard.store(&file_info, &cli.algorithm);
}
}
hashes_in_group.entry(hash_str).or_default().push(file_info);
}
Err(e) => {
log::warn!("[ScanThread] Failed to hash {:?}: {}", path, e);
if thread_local_tx.send(Err(e)).is_err() {
log::error!("[ScanThread] Hashing thread failed to send error (channel closed).");
}
return;
}
}
}
}
if thread_cache_hits > 0 {
cache_hits.fetch_add(thread_cache_hits, std::sync::atomic::Ordering::Relaxed);
}
if thread_local_tx.send(Ok(hashes_in_group)).is_err() {
log::error!("[ScanThread] Hashing thread failed to send result (channel closed).");
}
});
});
let mut actual_duplicate_sets = 0;
for i in 0..total_groups_to_hash {
match local_rx.recv() {
Ok(Ok(hashed_group)) => {
for (hash, file_infos_vec) in hashed_group {
if cli.media_mode {
all_file_infos.extend(file_infos_vec.iter().cloned());
}
if file_infos_vec.len() > 1 {
actual_duplicate_sets += 1;
let first_file_size = file_infos_vec[0].size; duplicate_sets.push(DuplicateSet {
files: file_infos_vec, size: first_file_size,
hash,
});
}
}
}
Ok(Err(e)) => {
log::error!("[ScanThread] Error hashing a file group: {}", e);
}
Err(e) => {
log::error!(
"[ScanThread] Failed to receive all hash results: {}. Processed {} of {}.",
e,
i,
total_groups_to_hash
);
return Err(anyhow::anyhow!(
"Hashing phase failed due to channel error: {}",
e
));
}
}
groups_hashed_count += 1;
let should_update = if total_groups_to_hash < 20 {
true } else if total_groups_to_hash < 100 {
groups_hashed_count % 5 == 0 || groups_hashed_count == total_groups_to_hash
} else if total_groups_to_hash < 500 {
groups_hashed_count % 10 == 0 || groups_hashed_count == total_groups_to_hash
} else {
groups_hashed_count % 20 == 0 || groups_hashed_count == total_groups_to_hash
};
if should_update || last_update_time.elapsed() >= update_interval {
last_update_time = std::time::Instant::now();
let progress_percent =
(groups_hashed_count as f64 / total_groups_to_hash as f64) * 100.0;
let cache_status = if cache_hits.load(std::sync::atomic::Ordering::Relaxed) > 0 {
format!(
" ({} from cache)",
cache_hits.load(std::sync::atomic::Ordering::Relaxed)
)
} else {
"".to_string()
};
send_status(
3,
format!(
"Stage 3/3: 🔄 Hashed {}/{} groups ({:.1}%){}... Found {} duplicate sets",
groups_hashed_count,
total_groups_to_hash,
progress_percent,
cache_status,
actual_duplicate_sets
),
);
}
}
if let Some(cache) = &file_cache {
if let Ok(mut cache_guard) = cache.lock() {
if let Err(e) = cache_guard.save() {
log::warn!("[ScanThread] Failed to save file cache: {}", e);
} else if cache_hits.load(std::sync::atomic::Ordering::Relaxed) > 0 {
log::info!(
"[ScanThread] Saved cache with {} entries ({} cache hits during scan)",
cache_guard.len(),
cache_hits.load(std::sync::atomic::Ordering::Relaxed)
);
}
}
}
let message = if cache_hits.load(std::sync::atomic::Ordering::Relaxed) > 0 {
format!(
"All stages complete. Found {} sets of duplicate files. Used {} cached hashes.",
duplicate_sets.len(),
cache_hits.load(std::sync::atomic::Ordering::Relaxed)
)
} else {
format!(
"All stages complete. Found {} sets of duplicate files.",
duplicate_sets.len()
)
};
send_status(3, message);
log::info!(
"[ScanThread] Found {} sets of duplicate files.",
duplicate_sets.len()
);
if cli.media_mode && cli.media_dedup_options.enabled {
log::info!("Media mode is enabled but placeholder implementation");
}
Ok(duplicate_sets)
}
fn find_similar_media_files_with_progress(
cli: &Cli,
tx_progress: StdMpscSender<ScanMessage>,
) -> Result<Vec<DuplicateSet>> {
let send_status = move |stage: u8, msg: String| {
if tx_progress
.send(ScanMessage::StatusUpdate(stage, msg))
.is_err()
{
log::warn!("[ScanThread] Failed to send status update to TUI (channel closed).");
}
};
send_status(4, "Starting media similarity detection...".to_string());
let filter_rules = FilterRules::new(cli)?;
send_status(
4,
format!(
"Scanning directory for media files: {}",
cli.directories[0].display()
),
);
let mut file_infos = Vec::new();
let walker = WalkDir::new(&cli.directories[0]).into_iter();
for entry in walker
.filter_entry(|e| {
if is_hidden(e) || is_symlink(e) {
return false;
}
if let Some(path_str) = e.path().to_str() {
filter_rules.is_match(path_str)
} else {
log::warn!(
"[ScanThread] Path {:?} is not valid UTF-8, excluding.",
e.path()
);
false
}
})
.flatten()
{
if entry.file_type().is_file() {
let path = entry.path().to_path_buf();
match fs::metadata(&path) {
Ok(metadata) => {
if metadata.len() > 0 {
let file_info = FileInfo {
path: path.clone(),
size: metadata.len(),
hash: None, modified_at: metadata.modified().ok(),
created_at: metadata.created().ok(),
};
file_infos.push(file_info);
}
}
Err(e) => {
log::warn!("[ScanThread] Failed to get metadata for {:?}: {}", path, e)
}
}
}
}
let mut media_files: Vec<crate::media_dedup::MediaFileInfo> = Vec::new();
let mut processed = 0;
let total_files = file_infos.len();
for file_info in &file_infos {
let mut media_file = crate::media_dedup::MediaFileInfo::from(file_info.clone());
let media_kind = crate::media_dedup::detect_media_type(&file_info.path);
if media_kind != crate::media_dedup::MediaKind::Unknown {
media_file.metadata = match crate::media_dedup::extract_media_metadata(&file_info.path)
{
Ok(metadata) => Some(metadata),
Err(e) => {
log::warn!(
"[ScanThread] Failed to extract media metadata for {:?}: {}",
file_info.path,
e
);
None
}
};
}
processed += 1;
send_status(
4,
format!(
"Processing media files: {}/{} ({:.1}%)",
processed,
total_files,
(processed as f64 / total_files as f64) * 100.0
),
);
if media_file.metadata.is_some() {
media_files.push(media_file);
}
}
log::info!(
"[ScanThread] Extracted metadata for {} media files",
media_files.len()
);
let mut similar_groups: Vec<Vec<crate::media_dedup::MediaFileInfo>> = Vec::new();
crate::media_dedup::process_media_type_similarity(
&media_files.iter().collect::<Vec<_>>(),
&cli.media_dedup_options,
&mut similar_groups,
)?;
let duplicate_sets =
crate::media_dedup::convert_to_duplicate_sets(&similar_groups, &cli.media_dedup_options);
log::info!(
"[ScanThread] Found {} sets of similar media files.",
duplicate_sets.len()
);
send_status(
4,
format!(
"Media analysis complete. Found {} sets of similar media files.",
duplicate_sets.len()
),
);
Ok(duplicate_sets)
}
fn is_hidden(entry: &walkdir::DirEntry) -> bool {
entry
.file_name()
.to_str()
.map(|s| s.starts_with('.'))
.unwrap_or(false)
}
fn is_symlink(entry: &walkdir::DirEntry) -> bool {
entry.file_type().is_symlink()
}
pub fn output_duplicates(
duplicate_sets: &[DuplicateSet],
output_path: &Path,
format: &str,
) -> Result<()> {
log::info!(
"Preparing to write {} duplicate sets to {:?} in {} format",
duplicate_sets.len(),
output_path,
format
);
let mut output_map: HashMap<String, HashEntryContent> = HashMap::new();
for set in duplicate_sets {
if set.files.len() >= 2 {
let file_paths: Vec<PathBuf> = set.files.iter().map(|f| f.path.clone()).collect();
output_map.insert(
set.hash.clone(),
HashEntryContent {
size: set.size,
files: file_paths,
},
);
}
}
if output_map.is_empty() {
log::info!("No duplicate sets with 2 or more files to output.");
return Ok(());
}
let output_content = match format {
"json" => serde_json::to_string_pretty(&output_map)?,
"toml" => toml::to_string_pretty(&output_map)?,
_ => {
return Err(anyhow::anyhow!(
"Unsupported output format: {}. Supported formats are json, toml.",
format
));
}
};
if let Some(parent) = output_path.parent() {
if !parent.exists() {
fs::create_dir_all(parent)?;
log::info!("Created parent directory for output file: {:?}", parent);
}
}
fs::write(output_path, output_content)?;
log::info!(
"Successfully wrote duplicate information to {:?}",
output_path
);
Ok(())
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SelectionStrategy {
ShortestPath,
LongestPath,
NewestModified,
OldestModified,
}
impl FromStr for SelectionStrategy {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"shortest_path" => Ok(Self::ShortestPath),
"longest_path" => Ok(Self::LongestPath),
"newest_modified" => Ok(Self::NewestModified),
"oldest_modified" => Ok(Self::OldestModified),
_ => Err(anyhow::anyhow!("Invalid selection strategy: {}", s)),
}
}
}
pub fn determine_action_targets(
set: &DuplicateSet,
strategy: SelectionStrategy,
) -> Result<(FileInfo, Vec<FileInfo>)> {
if set.files.len() < 2 {
return Err(anyhow::anyhow!(
"Cannot determine action targets for a set with less than 2 files."
));
}
let mut files = set.files.clone();
let kept_file_info = match strategy {
SelectionStrategy::ShortestPath => files
.into_iter()
.min_by_key(|f| f.path.as_os_str().len())
.unwrap(), SelectionStrategy::LongestPath => files
.into_iter()
.max_by_key(|f| f.path.as_os_str().len())
.unwrap(), SelectionStrategy::NewestModified => {
files.sort_by_key(|f| {
fs::metadata(&f.path)
.and_then(|m| m.modified())
.map(std::cmp::Reverse)
.unwrap_or_else(|_| std::cmp::Reverse(std::time::SystemTime::UNIX_EPOCH))
});
files.remove(0) }
SelectionStrategy::OldestModified => {
files.sort_by_key(|f| {
fs::metadata(&f.path)
.and_then(|m| m.modified())
.unwrap_or_else(|_| std::time::SystemTime::now())
});
files.remove(0) }
};
let mut files_to_process: Vec<FileInfo> = Vec::new();
for file_info in &set.files {
if file_info.path != kept_file_info.path {
files_to_process.push(file_info.clone());
}
}
Ok((kept_file_info, files_to_process))
}
pub fn delete_files(files_to_delete: &[FileInfo], dry_run: bool) -> Result<(usize, Vec<String>)> {
let mut count = 0;
let mut logs = Vec::new();
if dry_run {
logs.push("[DRY RUN] Would delete the following files:".to_string());
for file_info in files_to_delete {
logs.push(format!("[DRY RUN] - {}", file_info.path.display()));
count += 1;
}
} else {
logs.push("Deleting the following files:".to_string());
for file_info in files_to_delete {
match fs::remove_file(&file_info.path) {
Ok(_) => {
logs.push(format!("Deleted: {}", file_info.path.display()));
count += 1;
}
Err(e) => {
logs.push(format!(
"Error deleting {}: {}",
file_info.path.display(),
e
));
}
}
}
}
Ok((count, logs))
}
pub fn move_files(
files_to_move: &[FileInfo],
target_dir: &Path,
dry_run: bool,
) -> Result<(usize, Vec<String>)> {
let mut count = 0;
let mut logs = Vec::new();
if !target_dir.exists() {
if dry_run {
logs.push(format!(
"[DRY RUN] Target directory {} does not exist. Would create it.",
target_dir.display()
));
log::info!(
"[DRY RUN] Target directory {:?} does not exist. Would attempt to create it.",
target_dir
);
} else {
log::info!(
"Target directory {:?} does not exist. Creating it.",
target_dir
);
fs::create_dir_all(target_dir)?;
logs.push(format!(
"Created target directory: {}",
target_dir.display()
));
}
} else if !target_dir.is_dir() {
return Err(anyhow::anyhow!(
"Target move path {:?} exists but is not a directory.",
target_dir
));
}
if dry_run {
logs.push(format!(
"[DRY RUN] Would move the following files to {}:",
target_dir.display()
));
for file_info in files_to_move {
let target_path = target_dir.join(
file_info
.path
.file_name()
.unwrap_or_else(|| file_info.path.as_os_str()),
);
logs.push(format!(
"[DRY RUN] - {} -> {}",
file_info.path.display(),
target_path.display()
));
log::info!("[DRY RUN] - {:?} -> {:?}", file_info.path, target_path);
count += 1;
}
} else {
logs.push(format!(
"Moving the following files to {}:",
target_dir.display()
));
for file_info in files_to_move {
let file_name = file_info
.path
.file_name()
.unwrap_or_else(|| file_info.path.as_os_str());
let mut target_path = target_dir.join(file_name);
let mut counter = 1;
while target_path.exists() {
let stem = target_path
.file_stem()
.unwrap_or_default()
.to_string_lossy();
let ext = target_path
.extension()
.unwrap_or_default()
.to_string_lossy();
let new_name = format!(
"{}_copy({}){}{}",
stem.trim_end_matches(&format!("_copy({})", counter - 1))
.trim_end_matches("_copy"),
counter,
if ext.is_empty() { "" } else { "." },
ext
);
target_path = target_dir.join(new_name);
counter += 1;
}
match fs::rename(&file_info.path, &target_path) {
Ok(_) => {
logs.push(format!(
"Moved: {} -> {}",
file_info.path.display(),
target_path.display()
));
log::info!(" Moved: {:?} -> {:?}", file_info.path, target_path);
count += 1;
}
Err(e) => {
let error_msg = format!("Error moving {}: {}", file_info.path.display(), e);
logs.push(error_msg);
log::error!(
"Failed to move {:?} to {:?}: {}",
file_info.path,
target_path,
e
);
}
}
}
}
Ok((count, logs))
}
pub(crate) fn sort_file_infos(files: &mut [FileInfo], criterion: SortCriterion, order: SortOrder) {
files.sort_by(|a, b| {
let mut comparison = match criterion {
SortCriterion::FileName => a.path.file_name().cmp(&b.path.file_name()),
SortCriterion::FileSize => a.size.cmp(&b.size),
SortCriterion::CreatedAt => a.created_at.cmp(&b.created_at), SortCriterion::ModifiedAt => a.modified_at.cmp(&b.modified_at), SortCriterion::PathLength => a.path.as_os_str().len().cmp(&b.path.as_os_str().len()),
};
if order == SortOrder::Descending {
comparison = comparison.reverse();
}
comparison
});
}
pub struct DirectoryComparisonResult {
pub missing_in_target: Vec<FileInfo>, pub duplicates: Vec<DuplicateSet>, }
pub fn determine_target_directory(cli: &Cli) -> Result<PathBuf> {
if let Some(target) = &cli.target {
if !target.exists() {
return Err(anyhow::anyhow!(
"Specified target directory does not exist: {:?}",
target
));
}
if !target.is_dir() {
return Err(anyhow::anyhow!(
"Specified target path is not a directory: {:?}",
target
));
}
Ok(target.clone())
} else if cli.directories.len() > 1 {
let target = cli.directories.last().unwrap().clone();
if !target.exists() {
return Err(anyhow::anyhow!(
"Last specified directory (default target) does not exist: {:?}",
target
));
}
if !target.is_dir() {
return Err(anyhow::anyhow!(
"Last specified path is not a directory: {:?}",
target
));
}
Ok(target)
} else {
Err(anyhow::anyhow!(
"No target directory specified and only one directory provided"
))
}
}
pub fn get_source_directories(cli: &Cli, target: &Path) -> Vec<PathBuf> {
if let Some(t) = &cli.target {
cli.directories
.iter()
.filter(|d| d != &t)
.cloned()
.collect()
} else {
cli.directories
.iter()
.filter(|d| d.as_path() != target)
.cloned()
.collect()
}
}
pub fn compare_directories(cli: &Cli) -> Result<DirectoryComparisonResult> {
let target_dir = determine_target_directory(cli)?;
let source_dirs = get_source_directories(cli, &target_dir);
log::info!(
"Comparing directories: Sources: {:?}, Target: {:?}",
source_dirs,
target_dir
);
if source_dirs.is_empty() {
return Err(anyhow::anyhow!("No source directories specified"));
}
let mut target_cli = cli.clone();
target_cli.directories = vec![target_dir.clone()];
log::info!("Scanning target directory: {:?}", target_dir);
let target_files = scan_directory(&target_cli, &target_dir)?;
log::info!("Found {} files in target directory", target_files.len());
let target_hash_map: HashMap<String, &FileInfo> = target_files
.iter()
.filter_map(|file| file.hash.as_ref().map(|hash| (hash.clone(), file)))
.collect();
let mut missing_files = Vec::new();
let mut all_duplicate_sets = Vec::new();
for source_dir in &source_dirs {
log::info!("Scanning source directory: {:?}", source_dir);
let mut source_cli = cli.clone();
source_cli.directories = vec![source_dir.clone()];
let source_files = scan_directory(&source_cli, source_dir)?;
log::info!(
"Found {} files in source directory: {:?}",
source_files.len(),
source_dir
);
for file in &source_files {
if let Some(hash) = &file.hash {
if !target_hash_map.contains_key(hash) {
missing_files.push(file.clone());
log::debug!("File missing in target: {:?}", file.path);
}
if cli.deduplicate {
}
}
}
}
if cli.deduplicate {
let mut all_dirs_cli = cli.clone();
let mut all_dirs = source_dirs.clone();
all_dirs.push(target_dir.clone());
all_dirs_cli.directories = all_dirs;
log::info!("Finding duplicates across all directories for deduplication");
let (tx, _rx) = std::sync::mpsc::channel::<ScanMessage>();
let duplicates = find_duplicate_files_with_progress(&all_dirs_cli, tx)?;
let cross_dir_duplicates: Vec<DuplicateSet> = duplicates
.into_iter()
.filter(|set| {
let has_source_file = set.files.iter().any(|file| {
source_dirs
.iter()
.any(|source_dir| file.path.starts_with(source_dir))
});
let has_target_file = set
.files
.iter()
.any(|file| file.path.starts_with(&target_dir));
has_source_file && has_target_file
})
.collect();
all_duplicate_sets = cross_dir_duplicates;
log::info!(
"Found {} duplicate sets spanning source and target directories",
all_duplicate_sets.len()
);
}
Ok(DirectoryComparisonResult {
missing_in_target: missing_files,
duplicates: all_duplicate_sets,
})
}
fn scan_directory(cli: &Cli, directory: &Path) -> Result<Vec<FileInfo>> {
let filter_rules = FilterRules::new(cli)?;
let mut files = Vec::new();
let walker = WalkDir::new(directory).into_iter();
for entry in walker
.filter_entry(|e| {
if is_hidden(e) || is_symlink(e) {
return false;
}
if let Some(path_str) = e.path().to_str() {
filter_rules.is_match(path_str)
} else {
false
}
})
.flatten()
{
if entry.file_type().is_file() {
let path = entry.path().to_path_buf();
match fs::metadata(&path) {
Ok(metadata) => {
if metadata.len() > 0 {
let size = metadata.len();
let hash = match calculate_hash(&path, &cli.algorithm) {
Ok(h) => Some(h),
Err(e) => {
log::warn!("Failed to hash file {:?}: {}", path, e);
None
}
};
let file_info = FileInfo {
path,
size,
hash,
modified_at: metadata.modified().ok(),
created_at: metadata.created().ok(),
};
files.push(file_info);
}
}
Err(e) => log::warn!("Failed to get metadata for {:?}: {}", path, e),
}
}
}
log::info!("Found {} files in directory: {:?}", files.len(), directory);
Ok(files)
}
pub fn copy_missing_files(
missing_files: &[FileInfo],
target_dir: &Path,
dry_run: bool,
) -> Result<(usize, Vec<String>)> {
let mut count = 0;
let mut logs = Vec::new();
if !target_dir.exists() {
if dry_run {
let msg = format!(
"[DRY RUN] Target directory {} does not exist. Would create it.",
target_dir.display()
);
log::info!("{}", msg);
logs.push(msg);
} else {
log::info!(
"Target directory {:?} does not exist. Creating it.",
target_dir
);
fs::create_dir_all(target_dir)?;
logs.push(format!(
"Created target directory: {}",
target_dir.display()
));
}
} else if !target_dir.is_dir() {
return Err(anyhow::anyhow!(
"Target path {:?} exists but is not a directory.",
target_dir
));
}
if dry_run {
logs.push(format!(
"[DRY RUN] Would copy {} missing files to {}",
missing_files.len(),
target_dir.display()
));
for file in missing_files {
let relative_path = match file
.path
.strip_prefix(file.path.parent().unwrap().parent().unwrap())
{
Ok(rel) => rel.to_path_buf(),
Err(_) => {
PathBuf::from(file.path.file_name().unwrap_or_default())
}
};
let target_path = target_dir.join(relative_path);
logs.push(format!(
"[DRY RUN] Would copy {} to {}",
file.path.display(),
target_path.display()
));
log::info!("[DRY RUN] Would copy {:?} to {:?}", file.path, target_path);
count += 1;
}
} else {
logs.push(format!(
"Copying {} missing files to {}",
missing_files.len(),
target_dir.display()
));
for file in missing_files {
let relative_path = match file
.path
.strip_prefix(file.path.parent().unwrap().parent().unwrap())
{
Ok(rel) => rel.to_path_buf(),
Err(_) => {
PathBuf::from(file.path.file_name().unwrap_or_default())
}
};
let target_path = target_dir.join(relative_path);
if let Some(parent) = target_path.parent() {
if !parent.exists() {
fs::create_dir_all(parent)?;
let msg = format!("Created parent directory: {}", parent.display());
logs.push(msg.clone());
log::debug!("{}", msg);
}
}
match fs::copy(&file.path, &target_path) {
Ok(_) => {
let msg = format!(
"Copied: {} -> {}",
file.path.display(),
target_path.display()
);
logs.push(msg.clone());
log::info!("{}", msg);
count += 1;
}
Err(e) => {
let error_msg = format!(
"Failed to copy {} to {}: {}",
file.path.display(),
target_path.display(),
e
);
logs.push(error_msg.clone());
log::error!("{}", error_msg);
}
}
}
}
Ok((count, logs))
}
pub fn count_files_in_directory(directory: &Path, filter_rules: &FilterRules) -> Result<usize> {
let mut count = 0;
let walker = WalkDir::new(directory).into_iter();
for entry in walker
.filter_entry(|e| {
if is_hidden(e) || is_symlink(e) {
return false;
}
if let Some(path_str) = e.path().to_str() {
filter_rules.is_match(path_str)
} else {
false
}
})
.flatten()
{
if entry.file_type().is_file() {
count += 1;
}
}
Ok(count)
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn create_test_file(content: &[u8]) -> NamedTempFile {
let mut file = NamedTempFile::new().unwrap();
file.write_all(content).unwrap();
file
}
#[test]
fn test_md5_hash() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "md5").unwrap();
assert_eq!(hash, "9e107d9d372bb6826bd81d3542a419d6");
}
#[test]
fn test_sha1_hash() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "sha1").unwrap();
assert_eq!(hash, "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12");
}
#[test]
fn test_sha256_hash() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "sha256").unwrap();
assert_eq!(
hash,
"d7a8fbb307d7809469ca9abcb0082e4f8d5651e46d3cdb762d02d0bf37c9e592"
);
}
#[test]
fn test_blake3_hash() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "blake3").unwrap();
assert_eq!(hash.len(), 64);
assert_eq!(
hash,
"2f1514181aadccd913abd94cfa592701a5686ab23f8df1dff1b74710febc6d4a"
);
}
#[test]
fn test_xxhash() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "xxhash").unwrap();
assert_eq!(hash.len(), 16);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
}
#[cfg(feature = "linux")]
#[test]
fn test_gxhash() {
let file = create_test_file(b"test content");
let hash = calculate_hash(file.path(), "gxhash").unwrap();
assert_eq!(hash.len(), 16); }
#[test]
fn test_fnv1a() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "fnv1a").unwrap();
assert_eq!(hash.len(), 16);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
}
#[test]
fn test_crc32() {
let test_content = b"The quick brown fox jumps over the lazy dog";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "crc32").unwrap();
assert_eq!(hash.len(), 8);
assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
}
#[test]
fn test_invalid_algorithm() {
let test_content = b"test content";
let file = create_test_file(test_content);
let result = calculate_hash(file.path(), "invalid_algorithm");
assert!(result.is_err());
}
#[test]
fn test_empty_file() {
let test_content = b"";
let file = create_test_file(test_content);
let hash = calculate_hash(file.path(), "md5").unwrap();
assert_eq!(hash, "d41d8cd98f00b204e9800998ecf8427e");
let hash = calculate_hash(file.path(), "sha1").unwrap();
assert_eq!(hash, "da39a3ee5e6b4b0d3255bfef95601890afd80709");
let hash = calculate_hash(file.path(), "sha256").unwrap();
assert_eq!(
hash,
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
);
let hash = calculate_hash(file.path(), "blake3").unwrap();
let expected_empty_blake3 = hash.clone();
assert_eq!(hash, expected_empty_blake3);
}
}