msy 0.4.6 - Docs.rs

use crate::error::{Result, SyncError};
use crossbeam_channel::{Receiver, bounded};
use ignore::{WalkBuilder, WalkState};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::SystemTime;

#[cfg(unix)]
use std::os::unix::fs::MetadataExt;

#[cfg(target_os = "macos")]
use std::os::darwin::fs::MetadataExt as DarwinMetadataExt;

#[derive(Debug, Clone)]
pub struct FileEntry {
	pub path: Arc<PathBuf>,
	pub relative_path: Arc<PathBuf>,
	pub size: u64,
	pub modified: SystemTime,
	pub is_dir: bool,
	pub is_symlink: bool,
	pub symlink_target: Option<Arc<PathBuf>>,
	#[allow(dead_code)] // Used for sparse file detection
	pub is_sparse: bool,
	#[allow(dead_code)] // Used for sparse file optimization
	pub allocated_size: u64, // Actual bytes allocated on disk
	pub xattrs: Option<HashMap<String, Vec<u8>>>, // Extended attributes (if enabled)
	pub inode: Option<u64>,                       // Inode number (Unix only)
	pub nlink: u64,                               // Number of hard links to this file
	pub acls: Option<Vec<u8>>,                    // Serialized ACLs (if enabled)
	#[cfg_attr(not(target_os = "macos"), allow(dead_code))] // Only read on macOS
	pub bsd_flags: Option<u32>, // BSD file flags (hidden, immutable, etc.) - macOS only, None on other platforms
}

/// Detect if a file is sparse and get its allocated size
/// Returns (is_sparse, allocated_size)
#[cfg(unix)]
fn detect_sparse_file(_path: &Path, metadata: &std::fs::Metadata) -> (bool, u64) {
	// Get the number of 512-byte blocks allocated
	let blocks = metadata.blocks();
	let file_size = metadata.len();

	// Calculate actual allocated bytes (blocks are always 512 bytes on Unix)
	let allocated_size = blocks * 512;

	// A file is sparse if it uses significantly fewer blocks than its size would suggest
	// We use a threshold of 4KB (8 blocks) to account for filesystem overhead
	let threshold = 4096;
	let is_sparse = file_size > threshold && allocated_size < file_size.saturating_sub(threshold);

	(is_sparse, allocated_size)
}

/// Non-Unix platforms don't support sparse file detection
#[cfg(not(unix))]
fn detect_sparse_file(_path: &Path, metadata: &std::fs::Metadata) -> (bool, u64) {
	// On non-Unix platforms, assume not sparse and allocated size equals file size
	let file_size = metadata.len();
	(false, file_size)
}

/// Detect hardlink information (inode number and link count)
/// Returns (inode, nlink)
#[cfg(unix)]
fn detect_hardlink_info(metadata: &std::fs::Metadata) -> (Option<u64>, u64) {
	let inode = metadata.ino();
	let nlink = metadata.nlink();
	(Some(inode), nlink)
}

/// Non-Unix platforms don't support inode-based hardlink detection
#[cfg(not(unix))]
fn detect_hardlink_info(_metadata: &std::fs::Metadata) -> (Option<u64>, u64) {
	(None, 1)
}

/// Read extended attributes from a file
/// Returns None if xattrs are not supported or if reading fails
#[cfg(unix)]
fn read_xattrs(path: &Path) -> Option<HashMap<String, Vec<u8>>> {
	let mut xattrs = HashMap::new();

	// List all xattr names
	let names = match xattr::list(path) {
		Ok(names) => names,
		Err(_) => return None, // No xattrs or not supported
	};

	for name in names {
		if let Ok(Some(value)) = xattr::get(path, &name)
			&& let Some(name_str) = name.to_str()
		{
			xattrs.insert(name_str.to_string(), value);
		}
	}

	if xattrs.is_empty() { None } else { Some(xattrs) }
}

/// Non-Unix platforms don't support extended attributes
#[cfg(not(unix))]
fn read_xattrs(_path: &Path) -> Option<HashMap<String, Vec<u8>>> {
	None
}

/// Read ACLs from a file
/// Returns None if ACLs are not supported or if reading fails
/// The ACLs are stored as text representation (Display format) for portability
#[cfg(all(unix, feature = "acl"))]
fn read_acls(path: &Path) -> Option<Vec<u8>> {
	use exacl::getfacl;

	// Read ACLs from file
	match getfacl(path, None) {
		Ok(acls) => {
			let acl_vec: Vec<_> = acls.into_iter().collect();
			if acl_vec.is_empty() {
				return None;
			}

			// Convert ACLs to standard text format using Display trait
			// This produces parseable text like "user::rwx", "group::r-x", etc.
			let acl_text: Vec<String> = acl_vec.iter().map(|e| format!("{}", e)).collect();
			let joined = acl_text.join("\n");

			if joined.is_empty() { None } else { Some(joined.into_bytes()) }
		}
		Err(_) => None, // No ACLs or not supported
	}
}

/// ACLs not available
/// - Feature 'acl' is disabled, or
/// - Platform doesn't support ACLs (non-Unix)
#[cfg(not(all(unix, feature = "acl")))]
fn read_acls(_path: &Path) -> Option<Vec<u8>> {
	None
}

/// Read BSD file flags (macOS only)
/// Returns None if not supported or if reading fails
#[cfg(target_os = "macos")]
fn read_bsd_flags(metadata: &std::fs::Metadata) -> Option<u32> {
	Some(metadata.st_flags())
}

/// Non-macOS platforms don't support BSD file flags
#[cfg(not(target_os = "macos"))]
fn read_bsd_flags(_metadata: &std::fs::Metadata) -> Option<u32> {
	None
}

#[derive(Debug, Clone, Copy)]
pub struct ScanOptions {
	pub respect_gitignore: bool,
	pub include_git_dir: bool,
}

impl Default for ScanOptions {
	fn default() -> Self {
		Self { respect_gitignore: false, include_git_dir: true }
	}
}

/// Optimal thread count for parallel scanning
/// Benchmarks show 4 threads is the sweet spot - more threads add overhead
/// without proportional benefit due to I/O bottlenecks
fn optimal_thread_count() -> usize {
	std::cmp::min(4, num_cpus::get())
}

/// Threshold for parallel scanning (subdirectory count)
/// Parallel scanning helps when there are many subdirectories to explore
/// Based on benchmarks: parallel wins at 50+ subdirs, loses below 25
const PARALLEL_SUBDIR_THRESHOLD: usize = 30;

/// Quick check if directory structure benefits from parallel scanning
/// Counts immediate subdirectories (parallel helps with dir traversal, not flat files)
fn should_use_parallel(root: &Path) -> bool {
	match std::fs::read_dir(root) {
		Ok(entries) => {
			let mut subdir_count = 0;
			for e in entries.flatten() {
				// Only count directories, not files
				if e.file_type().map(|t| t.is_dir()).unwrap_or(false) {
					subdir_count += 1;
					if subdir_count > PARALLEL_SUBDIR_THRESHOLD {
						return true;
					}
				}
			}
			false
		}
		Err(_) => false, // Fall back to sequential on error
	}
}

/// Process a directory entry into a FileEntry
/// Extracted to share between sequential and parallel scanners
fn process_dir_entry(root: &Path, entry: ignore::DirEntry) -> Result<FileEntry> {
	let path = entry.path().to_path_buf();

	// Use symlink_metadata to properly detect symlinks
	// entry.metadata() follows symlinks by default, making is_symlink() always false
	let metadata = std::fs::symlink_metadata(&path).map_err(|e| SyncError::ReadDirError { path: path.clone(), source: e })?;

	let relative_path = path.strip_prefix(root).map(|p| p.to_path_buf()).map_err(|_| SyncError::InvalidPath { path: path.clone() })?;

	// Check if this is a symlink
	let is_symlink = metadata.is_symlink();
	let symlink_target = if is_symlink { std::fs::read_link(&path).ok() } else { None };

	// Detect sparse files (only for regular files, not directories or symlinks)
	let (is_sparse, allocated_size) = if !metadata.is_dir() && !is_symlink { detect_sparse_file(&path, &metadata) } else { (false, 0) };

	// Detect hardlink information (inode and link count)
	let (inode, nlink) = detect_hardlink_info(&metadata);

	// Read extended attributes (always scan them, writing is conditional)
	let xattrs = read_xattrs(&path);

	// Read ACLs (always scan them, writing is conditional)
	let acls = read_acls(&path);

	// Read BSD file flags (macOS only, None on other platforms)
	let bsd_flags = read_bsd_flags(&metadata);

	let modified = metadata.modified().map_err(|e| SyncError::ReadDirError { path: path.clone(), source: e })?;

	Ok(FileEntry {
		path: Arc::new(path),
		relative_path: Arc::new(relative_path),
		size: metadata.len(),
		modified,
		is_dir: metadata.is_dir(),
		is_symlink,
		symlink_target: symlink_target.map(Arc::new),
		is_sparse,
		allocated_size,
		xattrs,
		inode,
		nlink,
		acls,
		bsd_flags,
	})
}

pub struct Scanner {
	root: PathBuf,
	threads: usize,
	follow_links: bool,
	options: ScanOptions,
	/// When true, dynamically choose parallel vs sequential based on directory size
	/// When false (explicit thread count), use parallel if threads > 1
	auto_select: bool,
}

impl Scanner {
	/// Create a new scanner with automatic optimization
	///
	/// Uses optimal thread count (capped at 4) and dynamically chooses
	/// parallel vs sequential based on directory size.
	pub fn new(root: impl Into<PathBuf>) -> Self {
		Self {
			root: root.into(),
			threads: optimal_thread_count(),
			follow_links: false,
			options: ScanOptions::default(),
			auto_select: true,
		}
	}

	/// Create a scanner with a specific number of threads for parallel scanning
	///
	/// Use 0 or 1 for single-threaded operation, or specify thread count.
	/// Bypasses automatic directory size detection - always uses parallel if threads > 1.
	#[allow(dead_code)] // Public API for custom thread control
	pub fn with_threads(root: impl Into<PathBuf>, threads: usize) -> Self {
		Self { root: root.into(), threads, follow_links: false, options: ScanOptions::default(), auto_select: false }
	}

	/// Enable following symbolic links during directory traversal
	///
	/// When enabled, symbolic links to directories will be followed and their
	/// contents will be scanned. Loop detection is automatic (via walkdir crate)
	/// and will report an error if a symlink loop is detected.
	///
	/// Default: false (symlinks are recorded but not followed)
	#[allow(dead_code)] // Public API for symlink following control
	pub fn follow_links(mut self, follow: bool) -> Self {
		self.follow_links = follow;
		self
	}

	/// Set scanning options
	pub fn with_options(mut self, options: ScanOptions) -> Self {
		self.options = options;
		self
	}

	/// Set whether to respect .gitignore files
	#[allow(dead_code)] // Public API
	pub fn respect_gitignore(mut self, respect: bool) -> Self {
		self.options.respect_gitignore = respect;
		self
	}

	/// Set whether to include .git directories
	#[allow(dead_code)] // Public API
	pub fn include_git_dir(mut self, include: bool) -> Self {
		self.options.include_git_dir = include;
		self
	}

	/// Scan and return all entries at once (legacy API, kept for compatibility)
	///
	/// For large directories (>100k files), consider using `scan_streaming()` instead
	pub fn scan(&self) -> Result<Vec<FileEntry>> {
		// For backward compatibility, collect streaming results into Vec
		self.scan_streaming()?.collect()
	}

	/// Streaming scan that yields FileEntry one at a time
	///
	/// This is memory-efficient for large directories as it doesn't load
	/// all entries into memory at once. Memory usage is O(1) regardless
	/// of directory size.
	///
	/// Uses parallel directory walking if threads > 1, which can provide
	/// 2-4x speedup on directories with many subdirectories.
	///
	/// # Example
	/// ```ignore
	/// let scanner = Scanner::new("/large/directory");
	/// for entry in scanner.scan_streaming()? {
	///     let entry = entry?;
	///     println!("{}", entry.path.display());
	/// }
	/// ```
	pub fn scan_streaming(&self) -> Result<Box<dyn Iterator<Item = Result<FileEntry>> + Send>> {
		let mut walker = WalkBuilder::new(&self.root);
		walker
			.hidden(false) // Don't skip hidden files by default
			.git_ignore(self.options.respect_gitignore) // Respect .gitignore (in git repos)
			.git_global(self.options.respect_gitignore) // Respect global gitignore
			.git_exclude(self.options.respect_gitignore) // Respect .git/info/exclude
			.threads(self.threads) // Parallel walking if threads > 1
			.follow_links(self.follow_links); // Follow symlinks with automatic loop detection

		if !self.options.include_git_dir {
			walker.filter_entry(|entry| {
				// Skip .git directories
				entry.file_name() != ".git"
			});
		}

		// Also respect .gitignore files even outside git repos
		// This allows .gitignore to work in non-git directories
		if self.options.respect_gitignore {
			let gitignore_path = self.root.join(".gitignore");
			if gitignore_path.exists() {
				walker.add_ignore(&gitignore_path);
			}
		}

		// Determine whether to use parallel scanning
		// - If auto_select: check directory size (parallel has overhead for small dirs)
		// - If explicit threads: respect user's choice
		let use_parallel = if self.auto_select {
			// Dynamic: only use parallel if directory is large enough
			// Parallel has ~0.7ms overhead but saves 6-22ms on large directories
			self.threads > 1 && should_use_parallel(&self.root)
		} else {
			// Explicit: user knows what they want
			self.threads > 1
		};

		if use_parallel {
			Ok(Box::new(ParallelStreamingScanner::new(self.root.clone(), walker.build_parallel())))
		} else {
			Ok(Box::new(StreamingScanner { root: self.root.clone(), walker: walker.build() }))
		}
	}
}

/// Sequential streaming iterator over FileEntry items
///
/// This iterator processes files one at a time, making it suitable for
/// very large directories (millions of files) without consuming excessive memory.
pub struct StreamingScanner {
	root: PathBuf,
	walker: ignore::Walk,
}

impl Iterator for StreamingScanner {
	type Item = Result<FileEntry>;

	fn next(&mut self) -> Option<Self::Item> {
		loop {
			let result = self.walker.next()?;

			let entry = match result {
				Ok(entry) => entry,
				Err(e) => return Some(Err(SyncError::Io(std::io::Error::other(e.to_string())))),
			};

			// Skip the root directory itself
			if entry.path() == self.root {
				continue;
			}

			return Some(process_dir_entry(&self.root, entry));
		}
	}
}

// StreamingScanner is Send because it only contains Send types
unsafe impl Send for StreamingScanner {}

/// Parallel streaming iterator over FileEntry items
///
/// Uses multiple threads to scan directories in parallel, providing 2-4x speedup
/// on directories with many subdirectories. Results are delivered through a channel.
pub struct ParallelStreamingScanner {
	receiver: Receiver<Result<FileEntry>>,
	// Handle kept to ensure walker thread completes
	_handle: Option<std::thread::JoinHandle<()>>,
}

impl ParallelStreamingScanner {
	fn new(root: PathBuf, walker: ignore::WalkParallel) -> Self {
		// Bounded channel prevents memory blowup if consumer is slow
		let (sender, receiver) = bounded(1024);

		let handle = std::thread::spawn(move || {
			walker.run(|| {
				let sender = sender.clone();
				let root = root.clone();
				Box::new(move |result| {
					match result {
						Ok(entry) => {
							// Skip the root directory itself
							if entry.path() == root {
								return WalkState::Continue;
							}

							let file_entry = process_dir_entry(&root, entry);
							// If send fails, receiver dropped - stop walking
							if sender.send(file_entry).is_err() {
								return WalkState::Quit;
							}
						}
						Err(e) => {
							let err = SyncError::Io(std::io::Error::other(e.to_string()));
							if sender.send(Err(err)).is_err() {
								return WalkState::Quit;
							}
						}
					}
					WalkState::Continue
				})
			});
			// sender drops here when walker completes, closing channel
		});

		Self { receiver, _handle: Some(handle) }
	}
}

impl Iterator for ParallelStreamingScanner {
	type Item = Result<FileEntry>;

	fn next(&mut self) -> Option<Self::Item> {
		self.receiver.recv().ok()
	}
}

#[cfg(test)]
mod tests {
	use super::*;
	use std::fs;
	use tempfile::TempDir;

	#[test]
	fn test_scanner_basic() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create test structure
		fs::create_dir(root.join("dir1")).unwrap();
		fs::write(root.join("file1.txt"), "content").unwrap();
		fs::write(root.join("dir1/file2.txt"), "content").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert!(entries.len() >= 3); // dir1, file1.txt, dir1/file2.txt
		assert!(entries.iter().any(|e| e.relative_path.as_path() == Path::new("file1.txt")));
	}

	#[test]
	fn test_scanner_gitignore() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Initialize git repo (required for .gitignore to work)
		std::process::Command::new("git").args(["init"]).current_dir(root).output().unwrap();

		// Create .gitignore
		fs::write(root.join(".gitignore"), "ignored.txt\n").unwrap();
		fs::write(root.join("ignored.txt"), "should be ignored").unwrap();
		fs::write(root.join("included.txt"), "should be included").unwrap();

		// Use respect_gitignore: true to enable .gitignore filtering
		let scanner = Scanner::new(root).with_options(ScanOptions { respect_gitignore: true, include_git_dir: false });
		let entries = scanner.scan().unwrap();

		// ignored.txt should not appear
		assert!(!entries.iter().any(|e| e.relative_path.to_str() == Some("ignored.txt")));
		// included.txt should appear
		assert!(entries.iter().any(|e| e.relative_path.to_str() == Some("included.txt")));
	}

	#[test]
	fn test_scanner_gitignore_without_git_repo() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// NO git init - testing .gitignore without git repo

		// Create .gitignore with multiple patterns
		fs::write(root.join(".gitignore"), "*.tmp\n*.log\nnode_modules/\n.DS_Store\n").unwrap();

		// Create files matching patterns (should be ignored with respect_gitignore: true)
		fs::write(root.join("test.tmp"), "should be ignored").unwrap();
		fs::write(root.join("debug.log"), "should be ignored").unwrap();
		fs::create_dir(root.join("node_modules")).unwrap();
		fs::write(root.join("node_modules").join("package.txt"), "should be ignored").unwrap();

		// Create files NOT matching patterns (should be included)
		fs::write(root.join("normal.txt"), "should be included").unwrap();
		fs::write(root.join("important.rs"), "should be included").unwrap();

		// Use respect_gitignore: true to enable .gitignore filtering
		let scanner = Scanner::new(root).with_options(ScanOptions { respect_gitignore: true, include_git_dir: true });
		let entries = scanner.scan().unwrap();

		// Ignored files should NOT appear
		assert!(!entries.iter().any(|e| e.relative_path.to_str() == Some("test.tmp")), "test.tmp should be ignored");
		assert!(!entries.iter().any(|e| e.relative_path.to_str() == Some("debug.log")), "debug.log should be ignored");
		assert!(!entries.iter().any(|e| e.relative_path.to_str() == Some("node_modules")), "node_modules/ should be ignored");

		// Normal files SHOULD appear
		assert!(entries.iter().any(|e| e.relative_path.to_str() == Some("normal.txt")), "normal.txt should be included");
		assert!(entries.iter().any(|e| e.relative_path.to_str() == Some("important.rs")), "important.rs should be included");
	}

	#[test]
	#[cfg(unix)] // Symlinks work differently on Windows
	fn test_scanner_symlinks() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a regular file
		fs::write(root.join("target.txt"), "target content").unwrap();

		// Create a symlink to the file
		std::os::unix::fs::symlink(root.join("target.txt"), root.join("link.txt")).unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// Find the symlink entry
		let link_entry = entries.iter().find(|e| e.relative_path.as_path() == Path::new("link.txt")).expect("Symlink should be in scan results");

		assert!(link_entry.is_symlink, "Entry should be marked as symlink");
		assert!(link_entry.symlink_target.is_some(), "Symlink should have a target");

		// The target should be the absolute path to target.txt
		let target = link_entry.symlink_target.as_ref().unwrap();
		assert_eq!(&**target, &root.join("target.txt"));

		// Find the regular file entry
		let file_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("target.txt"))
			.expect("Target file should be in scan results");

		assert!(!file_entry.is_symlink, "Regular file should not be marked as symlink");
		assert!(file_entry.symlink_target.is_none(), "Regular file should have no target");
	}

	#[test]
	#[cfg(unix)] // Symlinks work differently on Windows
	fn test_scanner_symlink_loop_detection() {
		use std::os::unix::fs as unix_fs;
		use tempfile::TempDir;

		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a simple self-referencing symlink loop: a/link -> a
		let dir_a = root.join("a");
		std::fs::create_dir(&dir_a).unwrap();
		let link = dir_a.join("link");
		unix_fs::symlink(&dir_a, &link).unwrap();

		// Create a regular file in dir_a to verify we still scan it
		std::fs::write(dir_a.join("file.txt"), "test").unwrap();

		// Without follow_links, symlink is just recorded (no loop issue)
		let scanner = Scanner::new(&dir_a);
		let entries = scanner.scan().unwrap();

		// Should have 2 entries: file.txt and the symlink
		assert_eq!(entries.len(), 2);
		let symlink_entry = entries.iter().find(|e| e.is_symlink).unwrap();
		assert_eq!(*symlink_entry.relative_path, PathBuf::from("link"));

		// With follow_links enabled, walkdir detects the loop and returns an error
		let scanner = Scanner::new(&dir_a).follow_links(true);
		let result = scanner.scan();

		// The scan should either:
		// 1. Return Ok but skip the looping directory
		// 2. Return an error about the loop
		// walkdir's behavior is to skip the loop with a warning in the iterator
		match result {
			Ok(entries) => {
				// Loop was skipped, we should still have file.txt
				assert!(entries.iter().any(|e| e.path.ends_with("file.txt")));
			}
			Err(_) => {
				// Loop caused an error - also acceptable
			}
		}
	}

	#[test]
	#[cfg(unix)] // Symlinks work differently on Windows
	fn test_scanner_symlink_chain_loop() {
		use std::os::unix::fs as unix_fs;
		use tempfile::TempDir;

		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a more complex loop: a/link1 -> b, b/link2 -> a
		let dir_a = root.join("a");
		let dir_b = root.join("b");
		std::fs::create_dir(&dir_a).unwrap();
		std::fs::create_dir(&dir_b).unwrap();

		let link1 = dir_a.join("link1");
		let link2 = dir_b.join("link2");
		unix_fs::symlink(&dir_b, &link1).unwrap();
		unix_fs::symlink(&dir_a, &link2).unwrap();

		// Add files to verify scanning still works
		std::fs::write(dir_a.join("file_a.txt"), "a").unwrap();
		std::fs::write(dir_b.join("file_b.txt"), "b").unwrap();

		// Without follow_links, both symlinks are just recorded
		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// Should have: a/, b/, a/file_a.txt, b/file_b.txt, a/link1, b/link2
		assert!(entries.len() >= 4);
		assert_eq!(entries.iter().filter(|e| e.is_symlink).count(), 2);

		// With follow_links, walkdir should detect the cycle
		let scanner = Scanner::new(root).follow_links(true);
		let result = scanner.scan();

		// Should handle gracefully (either skip loop or return error)
		match result {
			Ok(entries) => {
				// Should still have both regular files
				assert!(entries.iter().any(|e| e.path.ends_with("file_a.txt")));
				assert!(entries.iter().any(|e| e.path.ends_with("file_b.txt")));
			}
			Err(_) => {
				// Loop detection error is acceptable
			}
		}
	}

	#[test]
	#[cfg(unix)] // Sparse files work differently on Windows
	fn test_scanner_sparse_files() {
		use std::io::Write;
		use std::process::Command;

		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a sparse file using dd (Unix command)
		// This ensures we get a real sparse file
		let sparse_path = root.join("sparse.dat");

		// Use dd to create a 10MB sparse file
		let output = Command::new("dd")
			.args([
				"if=/dev/zero",
				&format!("of={}", sparse_path.display()),
				"bs=1024",
				"count=0",
				"seek=10240", // Seek to 10MB
			])
			.output()
			.expect("Failed to create sparse file with dd");

		if !output.status.success() {
			panic!("dd command failed: {:?}", String::from_utf8_lossy(&output.stderr));
		}

		// Write 4KB of actual data at the beginning
		let mut file = std::fs::OpenOptions::new().write(true).open(&sparse_path).unwrap();
		let data = vec![0x42; 4096];
		file.write_all(&data).unwrap();
		file.flush().unwrap();
		drop(file);

		// The file size should be 10MB, but allocated size should be much smaller (only 4KB data written)
		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		let sparse_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("sparse.dat"))
			.expect("Sparse file should be in scan results");

		assert_eq!(sparse_entry.size, 10 * 1024 * 1024, "File size should be 10MB");

		// Note: Some filesystems (like APFS on macOS) may not create truly sparse files
		// in all situations. If the filesystem doesn't support sparse files, skip assertions.
		if sparse_entry.allocated_size < sparse_entry.size {
			// Filesystem supports sparse files - verify detection works
			assert!(
				sparse_entry.is_sparse,
				"File should be detected as sparse (size: {}, allocated: {})",
				sparse_entry.size, sparse_entry.allocated_size
			);
			assert!(
				sparse_entry.allocated_size < sparse_entry.size / 2,
				"Allocated size ({}) should be much smaller than file size ({})",
				sparse_entry.allocated_size,
				sparse_entry.size
			);
		} else {
			// Filesystem doesn't support sparse files - just verify no crash and correct detection
			assert!(!sparse_entry.is_sparse, "Non-sparse file should not be detected as sparse");
		}
	}

	#[test]
	fn test_scanner_regular_file_not_sparse() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a regular file with actual data
		let file_path = root.join("regular.txt");
		let data = vec![0x42; 10 * 1024]; // 10KB of actual data
		fs::write(&file_path, &data).unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		let regular_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("regular.txt"))
			.expect("Regular file should be in scan results");

		// Regular file should not be marked as sparse
		assert!(!regular_entry.is_sparse, "Regular file should not be detected as sparse");
		assert_eq!(regular_entry.size, 10 * 1024, "File size should be 10KB");
	}

	#[test]
	#[cfg(unix)] // Hardlinks work differently on Windows
	fn test_scanner_hardlinks() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a regular file
		let file_path = root.join("original.txt");
		fs::write(&file_path, "content").unwrap();

		// Create hardlink to the file
		let link1_path = root.join("link1.txt");
		fs::hard_link(&file_path, &link1_path).unwrap();

		// Create another hardlink
		let link2_path = root.join("link2.txt");
		fs::hard_link(&file_path, &link2_path).unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// Find all three entries
		let original_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("original.txt"))
			.expect("Original file should be in scan results");

		let link1_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("link1.txt"))
			.expect("Hardlink 1 should be in scan results");

		let link2_entry = entries
			.iter()
			.find(|e| e.relative_path.as_path() == Path::new("link2.txt"))
			.expect("Hardlink 2 should be in scan results");

		// All three should have nlink = 3
		assert_eq!(original_entry.nlink, 3, "Original should have 3 links");
		assert_eq!(link1_entry.nlink, 3, "Link1 should have 3 links");
		assert_eq!(link2_entry.nlink, 3, "Link2 should have 3 links");

		// All three should have the same inode
		assert!(original_entry.inode.is_some(), "Original should have inode");
		assert!(link1_entry.inode.is_some(), "Link1 should have inode");
		assert!(link2_entry.inode.is_some(), "Link2 should have inode");

		assert_eq!(original_entry.inode, link1_entry.inode, "Original and link1 should have same inode");
		assert_eq!(original_entry.inode, link2_entry.inode, "Original and link2 should have same inode");
	}

	#[test]
	fn test_scanner_regular_file_no_hardlinks() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a regular file with no hardlinks
		let file_path = root.join("single.txt");
		fs::write(&file_path, "content").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		let entry = entries.iter().find(|e| e.relative_path.as_path() == Path::new("single.txt")).expect("File should be in scan results");

		// Should have nlink = 1 (only itself)
		assert_eq!(entry.nlink, 1, "Single file should have nlink = 1");
	}

	// === Error Handling and Edge Case Tests ===

	#[test]
	fn test_scanner_empty_directory() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 0, "Empty directory should return no entries");
	}

	#[test]
	fn test_scanner_nested_empty_directories() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create nested empty directories
		fs::create_dir_all(root.join("a/b/c/d/e")).unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// Should find only directories, no files
		assert!(entries.iter().all(|e| e.is_dir), "All entries should be directories");
	}

	#[test]
	fn test_scanner_very_long_filename() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create file with very long name (close to 255 byte limit)
		let long_name = "a".repeat(250) + ".txt";
		let file_path = root.join(&long_name);
		fs::write(&file_path, "content").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1);
		assert_eq!(*entries[0].relative_path, PathBuf::from(&long_name));
	}

	#[test]
	fn test_scanner_unicode_filenames() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create files with various Unicode characters
		let unicode_names = vec![
			"测试.txt",   // Chinese
			"テスト.txt", // Japanese
			"тест.txt",   // Russian
			"🦀.txt",     // Emoji
			"café.txt",   // Accented Latin
		];

		for name in &unicode_names {
			fs::write(root.join(name), "content").unwrap();
		}

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), unicode_names.len());
		for name in unicode_names {
			assert!(entries.iter().any(|e| e.relative_path.as_path() == Path::new(name)), "Should find file: {}", name);
		}
	}

	#[test]
	fn test_scanner_special_characters() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create files with special characters (that are valid in filenames)
		let special_names = vec![
			"file with spaces.txt",
			"file-with-dashes.txt",
			"file_with_underscores.txt",
			"file.multiple.dots.txt",
			"file(with)parens.txt",
			"file[with]brackets.txt",
		];

		for name in &special_names {
			fs::write(root.join(name), "content").unwrap();
		}

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), special_names.len());
		for name in special_names {
			assert!(entries.iter().any(|e| e.relative_path.as_path() == Path::new(name)), "Should find file: {}", name);
		}
	}

	#[test]
	fn test_scanner_deep_nesting() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create deeply nested structure (50 levels)
		let mut path = root.to_path_buf();
		for i in 0..50 {
			path.push(format!("level{}", i));
		}
		fs::create_dir_all(&path).unwrap();
		fs::write(path.join("deep.txt"), "content").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// Should find all directories + the file
		assert!(entries.len() >= 51, "Should find deeply nested file and directories");

		// Find the deeply nested file
		let deep_file = entries.iter().find(|e| e.relative_path.ends_with("deep.txt"));
		assert!(deep_file.is_some(), "Should find deeply nested file");
	}

	#[test]
	#[cfg(unix)]
	fn test_scanner_permission_denied_directory() {
		use std::os::unix::fs::PermissionsExt;

		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create a directory and a file inside
		let protected_dir = root.join("protected");
		fs::create_dir(&protected_dir).unwrap();
		fs::write(protected_dir.join("secret.txt"), "secret").unwrap();

		// Make directory unreadable
		let mut perms = fs::metadata(&protected_dir).unwrap().permissions();
		perms.set_mode(0o000);
		fs::set_permissions(&protected_dir, perms.clone()).unwrap();

		let scanner = Scanner::new(root);
		let result = scanner.scan();

		// Restore permissions for cleanup
		perms.set_mode(0o755);
		fs::set_permissions(&protected_dir, perms).unwrap();

		// Scanner should either error or skip the unreadable directory
		// Both behaviors are acceptable
		match result {
			Ok(entries) => {
				// If it succeeds, it should have skipped the protected directory
				assert!(!entries.iter().any(|e| e.path.starts_with(&protected_dir)), "Should not include files from unreadable directory");
			}
			Err(_) => {
				// Error is also acceptable
			}
		}
	}

	#[test]
	fn test_scanner_zero_byte_file() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		let file_path = root.join("empty.txt");
		fs::write(&file_path, "").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1);
		assert_eq!(entries[0].size, 0);
		assert_eq!(*entries[0].relative_path, PathBuf::from("empty.txt"));
	}

	#[test]
	fn test_scanner_large_directory() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create 1000 files
		for i in 0..1000 {
			fs::write(root.join(format!("file{:04}.txt", i)), format!("content{}", i)).unwrap();
		}

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1000, "Should find all 1000 files");
	}

	#[test]
	fn test_scanner_mixed_file_types() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create mix of files and directories
		fs::write(root.join("file1.txt"), "content1").unwrap();
		fs::create_dir(root.join("dir1")).unwrap();
		fs::write(root.join("dir1/file2.txt"), "content2").unwrap();
		fs::create_dir(root.join("dir2")).unwrap();
		fs::write(root.join("file3.txt"), "content3").unwrap();

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		let files: Vec<_> = entries.iter().filter(|e| !e.is_dir).collect();
		let dirs: Vec<_> = entries.iter().filter(|e| e.is_dir).collect();

		assert_eq!(files.len(), 3, "Should find 3 files");
		assert_eq!(dirs.len(), 2, "Should find 2 directories");
	}

	// === Parallel Scanner Tests ===

	#[test]
	fn test_parallel_scanner_basic() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create test structure with multiple subdirs (benefits from parallelism)
		for i in 0..5 {
			let dir = root.join(format!("dir{}", i));
			fs::create_dir(&dir).unwrap();
			for j in 0..10 {
				fs::write(dir.join(format!("file{}.txt", j)), format!("content{}{}", i, j)).unwrap();
			}
		}

		// Test with explicit parallel scanning (4 threads)
		let scanner = Scanner::with_threads(root, 4);
		let entries = scanner.scan().unwrap();

		// Should find 5 directories + 50 files
		assert_eq!(entries.len(), 55, "Should find all 55 entries");

		let files: Vec<_> = entries.iter().filter(|e| !e.is_dir).collect();
		let dirs: Vec<_> = entries.iter().filter(|e| e.is_dir).collect();

		assert_eq!(files.len(), 50, "Should find 50 files");
		assert_eq!(dirs.len(), 5, "Should find 5 directories");
	}

	#[test]
	fn test_parallel_scanner_large() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create 10 subdirectories with 100 files each
		for i in 0..10 {
			let dir = root.join(format!("subdir{:02}", i));
			fs::create_dir(&dir).unwrap();
			for j in 0..100 {
				fs::write(dir.join(format!("file{:03}.txt", j)), format!("content{}", j)).unwrap();
			}
		}

		// Compare sequential vs parallel results
		let sequential = Scanner::with_threads(root, 0).scan().unwrap();
		let parallel = Scanner::with_threads(root, 4).scan().unwrap();

		assert_eq!(sequential.len(), parallel.len(), "Sequential and parallel should find same count");

		// Both should find 10 dirs + 1000 files
		assert_eq!(sequential.len(), 1010);
	}

	#[test]
	fn test_parallel_scanner_preserves_metadata() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create file with specific content
		let file_path = root.join("test.txt");
		let content = "test content for metadata";
		fs::write(&file_path, content).unwrap();

		let scanner = Scanner::with_threads(root, 4);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1);
		let entry = &entries[0];

		assert_eq!(*entry.relative_path, PathBuf::from("test.txt"));
		assert_eq!(entry.size, content.len() as u64);
		assert!(!entry.is_dir);
		assert!(!entry.is_symlink);
	}

	#[test]
	fn test_sequential_fallback_single_thread() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		fs::write(root.join("file.txt"), "content").unwrap();

		// threads=1 should use sequential scanner
		let scanner = Scanner::with_threads(root, 1);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1);
	}

	#[test]
	fn test_sequential_fallback_zero_threads() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		fs::write(root.join("file.txt"), "content").unwrap();

		// threads=0 should use sequential scanner
		let scanner = Scanner::with_threads(root, 0);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 1);
	}

	#[test]
	fn test_auto_select_small_directory() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create small directory (below threshold)
		for i in 0..10 {
			fs::write(root.join(format!("file{}.txt", i)), "content").unwrap();
		}

		// Scanner::new() uses auto_select, should use sequential for small dir
		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		assert_eq!(entries.len(), 10);
	}

	#[test]
	fn test_auto_select_many_subdirs() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create directory with many subdirs (above threshold of 30)
		for i in 0..50 {
			let subdir = root.join(format!("dir{:02}", i));
			fs::create_dir(&subdir).unwrap();
			fs::write(subdir.join("file.txt"), "content").unwrap();
		}

		// Scanner::new() uses auto_select, should use parallel for many subdirs
		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// 50 dirs + 50 files
		assert_eq!(entries.len(), 100);
	}

	#[test]
	fn test_threshold_boundary() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create exactly 30 subdirs (at threshold boundary)
		for i in 0..30 {
			fs::create_dir(root.join(format!("dir{:02}", i))).unwrap();
		}

		// At threshold (30), should NOT trigger parallel (need > 30)
		assert!(!super::should_use_parallel(root));

		// Add one more to cross threshold
		fs::create_dir(root.join("dir30")).unwrap();
		assert!(super::should_use_parallel(root));
	}

	#[test]
	fn test_parallel_sequential_identical_results() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create structure that triggers parallel
		for i in 0..50 {
			let subdir = root.join(format!("dir{:02}", i));
			fs::create_dir(&subdir).unwrap();
			fs::write(subdir.join("file.txt"), format!("content{}", i)).unwrap();
		}

		// Scan with sequential
		let seq_entries = Scanner::with_threads(root, 1).scan().unwrap();
		// Scan with parallel
		let par_entries = Scanner::with_threads(root, 4).scan().unwrap();

		// Same count
		assert_eq!(seq_entries.len(), par_entries.len());

		// Same paths (sorted for comparison)
		let mut seq_paths: Vec<_> = seq_entries.iter().map(|e| e.relative_path.clone()).collect();
		let mut par_paths: Vec<_> = par_entries.iter().map(|e| e.relative_path.clone()).collect();
		seq_paths.sort();
		par_paths.sort();
		assert_eq!(seq_paths, par_paths);

		// Same sizes
		let mut seq_sizes: Vec<_> = seq_entries.iter().map(|e| (e.relative_path.clone(), e.size)).collect();
		let mut par_sizes: Vec<_> = par_entries.iter().map(|e| (e.relative_path.clone(), e.size)).collect();
		seq_sizes.sort_by_key(|(p, _)| p.clone());
		par_sizes.sort_by_key(|(p, _)| p.clone());
		assert_eq!(seq_sizes, par_sizes);
	}

	#[test]
	fn test_mixed_files_and_subdirs() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create mixed structure: files and subdirs at root
		for i in 0..20 {
			fs::write(root.join(format!("rootfile{}.txt", i)), "content").unwrap();
		}
		for i in 0..40 {
			let subdir = root.join(format!("subdir{:02}", i));
			fs::create_dir(&subdir).unwrap();
			fs::write(subdir.join("nested.txt"), "nested").unwrap();
		}

		// 40 subdirs > 30 threshold, should use parallel
		assert!(super::should_use_parallel(root));

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();

		// 20 root files + 40 subdirs + 40 nested files = 100
		assert_eq!(entries.len(), 100);
	}

	#[test]
	fn test_flat_dir_no_parallel() {
		let temp = TempDir::new().unwrap();
		let root = temp.path();

		// Create flat directory with many files but no subdirs
		for i in 0..1000 {
			fs::write(root.join(format!("file{:04}.txt", i)), "content").unwrap();
		}

		// No subdirs, should not trigger parallel
		assert!(!super::should_use_parallel(root));

		let scanner = Scanner::new(root);
		let entries = scanner.scan().unwrap();
		assert_eq!(entries.len(), 1000);
	}
}