use std::fs::Metadata;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::SystemTime;
use ignore::gitignore::{Gitignore, GitignoreBuilder};
use jwalk::WalkDir;
use super::hardlink::HardlinkTracker;
use super::{FileEntry, ScanError, WalkerConfig};
/// Directory walker that yields the regular files under a root directory,
/// filtered according to a [`WalkerConfig`].
#[derive(Debug)]
pub struct Walker {
/// Directory the walk starts from; also used as the base for ignore-pattern matching.
root: PathBuf,
/// Filter and traversal settings consulted while walking.
config: WalkerConfig,
/// Optional flag polled during iteration; `None` means the walk cannot be cancelled.
shutdown_flag: Option<Arc<AtomicBool>>,
}
impl Walker {
    /// Creates a walker rooted at `path` that filters entries according to `config`.
    ///
    /// No shutdown flag is attached; use [`Walker::with_shutdown_flag`] to add one.
    #[must_use]
    pub fn new(path: &Path, config: WalkerConfig) -> Self {
        Self {
            root: path.to_path_buf(),
            config,
            shutdown_flag: None,
        }
    }

    /// Attaches a shared flag that, once set to `true`, makes [`Walker::walk`] stop
    /// yielding entries.
    #[must_use]
    pub fn with_shutdown_flag(mut self, flag: Arc<AtomicBool>) -> Self {
        self.shutdown_flag = Some(flag);
        self
    }

    /// Returns `true` when a shutdown flag is attached and currently set.
    fn is_shutdown_requested(&self) -> bool {
        self.shutdown_flag
            .as_ref()
            .is_some_and(|flag| flag.load(Ordering::SeqCst))
    }

    /// Builds the ignore matcher from `<root>/.gitignore` (if present) plus the
    /// configured `ignore_patterns`.
    ///
    /// Failures to load the file or parse a pattern are logged and skipped. Returns
    /// `None` when the resulting matcher is empty or cannot be built.
    fn build_gitignore(&self) -> Option<Gitignore> {
        let mut builder = GitignoreBuilder::new(&self.root);
        let gitignore_path = self.root.join(".gitignore");
        if gitignore_path.exists() {
            match builder.add(&gitignore_path) {
                Some(e) => log::warn!(
                    "Failed to load .gitignore from {}: {}",
                    gitignore_path.display(),
                    e
                ),
                None => log::debug!("Loaded .gitignore from {}", gitignore_path.display()),
            }
        }
        for pattern in &self.config.ignore_patterns {
            if let Err(e) = builder.add_line(None, pattern) {
                log::warn!("Invalid ignore pattern '{}': {}", pattern, e);
            }
        }
        match builder.build() {
            Ok(gitignore) if gitignore.is_empty() => None,
            Ok(gitignore) => Some(gitignore),
            Err(e) => {
                log::warn!("Failed to build ignore patterns: {}", e);
                None
            }
        }
    }

    /// Returns `true` when `path` is excluded by `gitignore`.
    ///
    /// For paths under the root, the path itself and each of its parent directories
    /// are checked. `walk` does not prune ignored directories from the traversal, so
    /// the parent check is what keeps the contents of an ignored directory out of
    /// the results. Always `false` when no matcher is supplied.
    fn should_ignore(&self, path: &Path, is_dir: bool, gitignore: &Option<Gitignore>) -> bool {
        let Some(gi) = gitignore else {
            return false;
        };
        let relative = path.strip_prefix(&self.root);
        let under_root = relative.is_ok();
        let path_str = relative.unwrap_or(path).to_string_lossy();
        let normalized_path = if cfg!(windows) {
            path_str.replace('\\', "/")
        } else {
            path_str.into_owned()
        };
        if under_root {
            gi.matched_path_or_any_parents(normalized_path, is_dir)
                .is_ignore()
        } else {
            // `matched_path_or_any_parents` asserts that the path lies under the
            // matcher's root, so paths outside it keep the plain single-path check.
            gi.matched(normalized_path, is_dir).is_ignore()
        }
    }

    /// Returns `true` when `size` lies within the configured `min_size`/`max_size`
    /// bounds (both inclusive); an unset bound is not checked.
    fn passes_size_filter(&self, size: u64) -> bool {
        let below_min = self.config.min_size.is_some_and(|min| size < min);
        let above_max = self.config.max_size.is_some_and(|max| size > max);
        !(below_min || above_max)
    }

    /// Returns `true` when `modified` is not before `newer_than` and not after
    /// `older_than`; an unset bound is not checked.
    fn passes_date_filter(&self, modified: SystemTime) -> bool {
        let too_old = self
            .config
            .newer_than
            .is_some_and(|newer_than| modified < newer_than);
        let too_new = self
            .config
            .older_than
            .is_some_and(|older_than| modified > older_than);
        !(too_old || too_new)
    }

    /// Applies the include/exclude regexes to the file name of `path`.
    ///
    /// When include regexes are configured, at least one must match; any matching
    /// exclude regex rejects the file. A path without a file name is matched as the
    /// empty string.
    fn passes_regex_filter(&self, path: &Path) -> bool {
        let filename = path
            .file_name()
            .map(|n| n.to_string_lossy())
            .unwrap_or_default();
        let includes = &self.config.regex_include;
        if !includes.is_empty() && !includes.iter().any(|re| re.is_match(&filename)) {
            return false;
        }
        !self
            .config
            .regex_exclude
            .iter()
            .any(|re| re.is_match(&filename))
    }

    /// Returns `true` when no file categories are configured, or when the lowercased
    /// extension of `path` belongs to at least one configured category.
    fn passes_file_type_filter(&self, path: &Path) -> bool {
        if self.config.file_categories.is_empty() {
            return true;
        }
        let extension = path
            .extension()
            .and_then(|s| s.to_str())
            .map(|s| s.to_lowercase())
            .unwrap_or_default();
        self.config
            .file_categories
            .iter()
            .any(|category| category.extensions().contains(&extension.as_str()))
    }

    /// Walks the tree below the root and lazily yields one item per accepted file or
    /// per error encountered.
    ///
    /// Directories are never yielded. Ignored paths, symlinks (unless
    /// `follow_symlinks` is set), non-regular files, empty files, files rejected by a
    /// filter and already-seen hardlinks are skipped silently. Entries within each
    /// directory are sorted by file name, with unreadable entries last.
    ///
    /// Once the shutdown flag is observed as set, the iterator ends and the
    /// underlying walk is not polled any further.
    pub fn walk(&self) -> impl Iterator<Item = Result<FileEntry, ScanError>> + '_ {
        let gitignore = self.build_gitignore();
        let mut hardlink_tracker = HardlinkTracker::new();
        let walk_dir = WalkDir::new(&self.root)
            .follow_links(self.config.follow_symlinks)
            .skip_hidden(self.config.skip_hidden)
            .process_read_dir(move |_depth, _path, _read_dir_state, children| {
                children.sort_by(|a, b| match (a, b) {
                    (Ok(a), Ok(b)) => a.file_name().cmp(b.file_name()),
                    (Ok(_), Err(_)) => std::cmp::Ordering::Less,
                    (Err(_), Ok(_)) => std::cmp::Ordering::Greater,
                    (Err(_), Err(_)) => std::cmp::Ordering::Equal,
                });
            });
        walk_dir
            .into_iter()
            // End the iteration on shutdown instead of draining (and discarding) the
            // rest of the tree one entry at a time.
            .take_while(move |_| {
                let stop = self.is_shutdown_requested();
                if stop {
                    log::debug!("Walker: Shutdown requested, stopping iteration");
                }
                !stop
            })
            .filter_map(move |entry_result| {
                let entry = match entry_result {
                    Ok(entry) => entry,
                    Err(e) => {
                        // Fall back to the root when the error carries no path.
                        let path = e
                            .path()
                            .map_or_else(|| self.root.clone(), std::borrow::ToOwned::to_owned);
                        return Some(self.handle_jwalk_error(path, e));
                    }
                };
                let path = entry.path();
                if path == self.root {
                    return None;
                }
                let file_type = entry.file_type();
                if file_type.is_dir() {
                    if self.should_ignore(&path, true, &gitignore) {
                        log::trace!("Ignoring directory: {}", path.display());
                    }
                    return None;
                }
                if self.should_ignore(&path, false, &gitignore) {
                    log::trace!("Ignoring file: {}", path.display());
                    return None;
                }
                let is_symlink = file_type.is_symlink();
                if is_symlink && !self.config.follow_symlinks {
                    log::trace!("Skipping symlink: {}", path.display());
                    return None;
                }
                let metadata = if self.config.follow_symlinks {
                    std::fs::metadata(&path)
                } else {
                    std::fs::symlink_metadata(&path)
                };
                let metadata = match metadata {
                    Ok(m) => m,
                    Err(e) => return Some(self.handle_io_error(&path, e)),
                };
                if !metadata.is_file() {
                    return None;
                }
                self.process_file_entry(path, metadata, is_symlink, &mut hardlink_tracker)
            })
    }

    /// Applies the size, date, regex, file-type and hardlink checks to one regular
    /// file.
    ///
    /// Returns `None` when the file is empty, rejected by a filter, or reported as a
    /// hardlink by `hardlink_tracker`; otherwise returns the populated [`FileEntry`].
    fn process_file_entry(
        &self,
        path: PathBuf,
        metadata: Metadata,
        is_symlink: bool,
        hardlink_tracker: &mut HardlinkTracker,
    ) -> Option<Result<FileEntry, ScanError>> {
        let size = metadata.len();
        if size == 0 {
            log::debug!("Skipping empty file: {}", path.display());
            return None;
        }
        if !self.passes_size_filter(size) {
            log::trace!(
                "Skipping file due to size filter ({}): {}",
                size,
                path.display()
            );
            return None;
        }
        // A platform that cannot report mtime is treated as "modified at the epoch".
        let modified = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH);
        if !self.passes_date_filter(modified) {
            log::trace!("Skipping file due to date filter: {}", path.display());
            return None;
        }
        if !self.passes_regex_filter(&path) {
            log::trace!("Skipping file due to regex filter: {}", path.display());
            return None;
        }
        if !self.passes_file_type_filter(&path) {
            log::trace!("Skipping file due to file type filter: {}", path.display());
            return None;
        }
        if hardlink_tracker.is_hardlink(&metadata) {
            log::debug!("Skipping hardlink: {}", path.display());
            return None;
        }
        Some(Ok(FileEntry {
            path,
            size,
            modified,
            is_symlink,
            is_hardlink: false,
        }))
    }

    /// Logs `error` and converts it into the matching [`ScanError`] variant:
    /// `PermissionDenied`, `NotFound`, or the generic `Io` for every other kind.
    fn handle_io_error(&self, path: &Path, error: std::io::Error) -> Result<FileEntry, ScanError> {
        use std::io::ErrorKind;
        match error.kind() {
            ErrorKind::PermissionDenied => {
                log::warn!("Permission denied: {}", path.display());
                Err(ScanError::PermissionDenied(path.to_path_buf()))
            }
            ErrorKind::NotFound => {
                log::debug!("File not found (may have been deleted): {}", path.display());
                Err(ScanError::NotFound(path.to_path_buf()))
            }
            _ => {
                log::warn!("I/O error for {}: {}", path.display(), error);
                Err(ScanError::Io {
                    path: path.to_path_buf(),
                    source: error,
                })
            }
        }
    }

    /// Logs a traversal error and wraps its message in [`ScanError::Io`] for `path`.
    fn handle_jwalk_error(
        &self,
        path: PathBuf,
        error: jwalk::Error,
    ) -> Result<FileEntry, ScanError> {
        log::warn!("Walker error for {}: {}", path.display(), error);
        Err(ScanError::Io {
            path,
            source: std::io::Error::other(error.to_string()),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::{self, File};
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates (or truncates) `path` and writes `content` followed by a newline.
    /// The file handle is closed before returning.
    fn write_line(path: &Path, content: &str) {
        let mut f = File::create(path).unwrap();
        writeln!(f, "{}", content).unwrap();
    }

    /// Midnight UTC on January 1st of `year`.
    fn utc_jan_first(year: i32) -> chrono::DateTime<chrono::Utc> {
        use chrono::TimeZone;
        chrono::Utc.with_ymd_and_hms(year, 1, 1, 0, 0, 0).unwrap()
    }

    /// Writes `content` to `path`, then sets its mtime to January 1st of `year`.
    /// The mtime is set only after the file handle has been closed.
    fn write_line_with_mtime(path: &Path, content: &str, year: i32) {
        write_line(path, content);
        filetime::set_file_mtime(
            path,
            filetime::FileTime::from_system_time(utc_jan_first(year).into()),
        )
        .unwrap();
    }

    /// Runs the walker and keeps only the successfully scanned entries.
    fn scan(walker: &Walker) -> Vec<FileEntry> {
        walker.walk().filter_map(Result::ok).collect()
    }

    /// File names of `files`, in iteration order.
    fn names_of(files: &[FileEntry]) -> Vec<&str> {
        files
            .iter()
            .map(|f| f.path.file_name().unwrap().to_str().unwrap())
            .collect()
    }

    /// Temp directory holding `file1.txt`, `file2.txt` and `subdir/nested.txt`,
    /// all non-empty.
    fn create_test_dir() -> TempDir {
        let dir = TempDir::new().unwrap();
        write_line(&dir.path().join("file1.txt"), "Hello, world!");
        write_line(&dir.path().join("file2.txt"), "Another file");
        let subdir = dir.path().join("subdir");
        fs::create_dir(&subdir).unwrap();
        write_line(&subdir.join("nested.txt"), "Nested file content");
        dir
    }

    #[test]
    fn test_walker_finds_files() {
        let dir = create_test_dir();
        let walker = Walker::new(dir.path(), WalkerConfig::default());
        let files = scan(&walker);
        assert_eq!(files.len(), 3);
        for file in &files {
            assert!(file.size > 0);
            assert!(file.path.exists());
            assert!(!file.is_symlink);
        }
    }

    #[test]
    fn test_walker_min_size_filter() {
        let dir = create_test_dir();
        let tiny_file = dir.path().join("tiny.txt");
        let mut f = File::create(&tiny_file).unwrap();
        f.write_all(b"X").unwrap();
        drop(f);
        let config = WalkerConfig {
            min_size: Some(10),
            ..Default::default()
        };
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        // Guard against the size loop below passing vacuously on an empty result.
        assert!(!files.is_empty(), "Files of at least 10 bytes should remain");
        assert!(
            !names_of(&files).contains(&"tiny.txt"),
            "1-byte file should be filtered out"
        );
        for file in &files {
            assert!(
                file.size >= 10,
                "File {} has size {}",
                file.path.display(),
                file.size
            );
        }
    }

    #[test]
    fn test_walker_max_size_filter() {
        let dir = create_test_dir();
        let large_file = dir.path().join("large.txt");
        let mut f = File::create(&large_file).unwrap();
        for _ in 0..1000 {
            writeln!(f, "This is a line of text to make the file larger.").unwrap();
        }
        drop(f);
        let config = WalkerConfig {
            max_size: Some(100),
            ..Default::default()
        };
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        // Guard against the size loop below passing vacuously on an empty result.
        assert!(!files.is_empty(), "Files of at most 100 bytes should remain");
        assert!(
            !names_of(&files).contains(&"large.txt"),
            "Large file should be filtered out"
        );
        for file in &files {
            assert!(
                file.size <= 100,
                "File {} has size {}",
                file.path.display(),
                file.size
            );
        }
    }

    #[test]
    fn test_walker_skip_empty_files() {
        let dir = create_test_dir();
        File::create(dir.path().join("empty.txt")).unwrap();
        let walker = Walker::new(dir.path(), WalkerConfig::default());
        let files = scan(&walker);
        assert!(!files.is_empty(), "Non-empty files should still be found");
        assert!(
            !names_of(&files).contains(&"empty.txt"),
            "Empty file should be skipped"
        );
        for file in &files {
            assert!(file.size > 0);
        }
    }

    #[test]
    fn test_walker_skip_hidden_files() {
        let dir = create_test_dir();
        write_line(&dir.path().join(".hidden"), "Hidden content");
        let config = WalkerConfig {
            skip_hidden: true,
            ..Default::default()
        };
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        for name in names_of(&files) {
            assert!(!name.starts_with('.'));
        }
    }

    #[test]
    fn test_walker_ignore_patterns() {
        let dir = create_test_dir();
        write_line(&dir.path().join("temp.tmp"), "Temporary file");
        write_line(&dir.path().join("debug.log"), "Log content");
        let config = WalkerConfig {
            ignore_patterns: vec!["*.tmp".to_string(), "*.log".to_string()],
            ..Default::default()
        };
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        // Guard against the loop below passing vacuously on an empty result.
        assert!(!files.is_empty(), "Non-ignored files should still be found");
        for name in names_of(&files) {
            assert!(!name.ends_with(".tmp"), "Should skip .tmp files");
            assert!(!name.ends_with(".log"), "Should skip .log files");
        }
    }

    #[test]
    fn test_walker_date_filters() {
        let dir = TempDir::new().unwrap();
        write_line_with_mtime(&dir.path().join("past.txt"), "past content", 2020);
        write_line_with_mtime(&dir.path().join("recent.txt"), "recent content", 2026);

        let config = WalkerConfig::default().with_newer_than(Some(utc_jan_first(2025).into()));
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "recent.txt");

        let config = WalkerConfig::default().with_older_than(Some(utc_jan_first(2025).into()));
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "past.txt");
    }

    #[test]
    fn test_walker_combined_date_filters() {
        let dir = TempDir::new().unwrap();
        write_line_with_mtime(&dir.path().join("past.txt"), "past content", 2020);
        write_line_with_mtime(&dir.path().join("mid.txt"), "mid content", 2023);
        write_line_with_mtime(&dir.path().join("recent.txt"), "recent content", 2026);

        let config = WalkerConfig::default()
            .with_newer_than(Some(utc_jan_first(2021).into()))
            .with_older_than(Some(utc_jan_first(2025).into()));
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "mid.txt");
    }

    #[test]
    fn test_walker_multiple_regex_include() {
        use regex::Regex;
        let dir = create_test_dir();
        let config = WalkerConfig::default().with_regex_include(vec![
            Regex::new("file1").unwrap(),
            Regex::new("nested").unwrap(),
        ]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 2);
        let names = names_of(&files);
        assert!(names.contains(&"file1.txt"));
        assert!(names.contains(&"nested.txt"));
    }

    #[test]
    fn test_walker_regex_filters() {
        use regex::Regex;
        let dir = create_test_dir();

        // Include only.
        let config = WalkerConfig::default().with_regex_include(vec![Regex::new("file1").unwrap()]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "file1.txt");

        // Exclude only.
        let config = WalkerConfig::default().with_regex_exclude(vec![Regex::new("file2").unwrap()]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 2);
        let names = names_of(&files);
        assert!(names.contains(&"file1.txt"));
        assert!(names.contains(&"nested.txt"));
        assert!(!names.contains(&"file2.txt"));

        // Include and exclude combined.
        let config = WalkerConfig::default()
            .with_regex_include(vec![Regex::new("file").unwrap()])
            .with_regex_exclude(vec![Regex::new("1").unwrap()]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "file2.txt");
    }

    #[test]
    fn test_walker_file_type_filters() {
        use super::super::FileCategory;
        let dir = TempDir::new().unwrap();
        write_line(&dir.path().join("photo.jpg"), "image content");
        write_line(&dir.path().join("report.pdf"), "document content");
        write_line(&dir.path().join("song.mp3"), "audio content");

        let config = WalkerConfig::default().with_file_categories(vec![FileCategory::Images]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 1);
        assert_eq!(files[0].path.file_name().unwrap(), "photo.jpg");

        let config = WalkerConfig::default()
            .with_file_categories(vec![FileCategory::Documents, FileCategory::Audio]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 2);
        let names = names_of(&files);
        assert!(names.contains(&"report.pdf"));
        assert!(names.contains(&"song.mp3"));

        let config = WalkerConfig::default().with_file_categories(vec![FileCategory::Videos]);
        let walker = Walker::new(dir.path(), config);
        let files = scan(&walker);
        assert_eq!(files.len(), 0);
    }

    #[test]
    fn test_walker_shutdown_flag() {
        let dir = create_test_dir();
        for i in 0..10 {
            write_line(
                &dir.path().join(format!("file{}.txt", i)),
                &format!("Content {}", i),
            );
        }
        let shutdown = Arc::new(AtomicBool::new(false));
        let walker = Walker::new(dir.path(), WalkerConfig::default())
            .with_shutdown_flag(Arc::clone(&shutdown));
        // The flag is raised before iteration starts.
        shutdown.store(true, Ordering::SeqCst);
        let files = scan(&walker);
        assert!(
            files.len() < 5,
            "Expected early termination, got {} files",
            files.len()
        );
    }

    #[test]
    fn test_walker_handles_nonexistent_path() {
        let walker = Walker::new(
            Path::new("/nonexistent/path/12345"),
            WalkerConfig::default(),
        );
        let results: Vec<_> = walker.walk().collect();
        assert!(results.is_empty() || results.iter().all(|r| r.is_err()));
    }

    #[test]
    #[cfg(unix)]
    fn test_walker_detects_hardlinks() {
        use std::fs::hard_link;
        let dir = create_test_dir();
        let original = dir.path().join("original.txt");
        write_line(&original, "Original content");
        let hardlink = dir.path().join("hardlink.txt");
        hard_link(&original, &hardlink).unwrap();
        let walker = Walker::new(dir.path(), WalkerConfig::default());
        let files = scan(&walker);
        let matching: Vec<_> = files
            .iter()
            .filter(|f| {
                f.path
                    .file_name()
                    .is_some_and(|n| n == "original.txt" || n == "hardlink.txt")
            })
            .collect();
        assert_eq!(
            matching.len(),
            1,
            "Only one hardlink file should be included"
        );
    }

    #[test]
    fn test_file_entry_fields() {
        let dir = create_test_dir();
        let walker = Walker::new(dir.path(), WalkerConfig::default());
        let file = walker.walk().filter_map(Result::ok).next().unwrap();
        assert!(!file.path.as_os_str().is_empty());
        assert!(file.size > 0);
        assert!(file.modified != SystemTime::UNIX_EPOCH);
        assert!(!file.is_symlink);
    }
}