msy 0.4.5

Modern musl rsync alternative - Fast, parallel file synchronization
Documentation
/// Scale-related utilities for handling millions of files efficiently
///
/// This module provides optimizations for large-scale syncs:
/// - Bloom filters for O(1) existence checks with minimal memory
/// - Batch processing to avoid loading all files into memory
/// - State caching for incremental syncs
use fastbloom::BloomFilter;
use std::path::{Path, PathBuf};

/// Memory-efficient file set using Bloom filter
///
/// Uses ~10 bits per item with 1% false positive rate.
/// For 1M files: ~1.2MB vs ~100MB for `HashSet<PathBuf>`
///
/// False positives mean we might check a file that doesn't exist,
/// but we'll never miss a file that does exist (no false negatives).
#[derive(Clone)]
pub struct FileSetBloom {
	bloom: BloomFilter,
	#[allow(dead_code)] // Tracking for capacity planning
	expected_items: usize,
}

impl FileSetBloom {
	/// Create a new Bloom filter for approximately `expected_items` files
	///
	/// Uses a 1% false positive rate, which provides good memory efficiency
	/// while keeping false positive checks minimal.
	///
	/// Memory usage: ~1.2 bytes per expected item (10 bits per item)
	pub fn new(expected_items: usize) -> Self {
		// False positive rate: 1% (0.01)
		// This gives us ~10 bits per item
		let bloom = BloomFilter::with_false_pos(0.01).expected_items(expected_items);

		Self { bloom, expected_items }
	}

	/// Add a file path to the set
	pub fn insert(&mut self, path: &Path) {
		// Convert path to bytes for hashing
		let path_bytes = path.as_os_str().as_encoded_bytes();
		self.bloom.insert(path_bytes);
	}

	/// Check if a file path might be in the set
	///
	/// Returns:
	/// - `true`: Path is probably in the set (may be false positive)
	/// - `false`: Path is definitely NOT in the set (no false negatives)
	///
	/// For deletion checks: If this returns false, we know for certain the
	/// file doesn't exist in source, so it's safe to delete from destination.
	pub fn contains(&self, path: &Path) -> bool {
		let path_bytes = path.as_os_str().as_encoded_bytes();
		self.bloom.contains(path_bytes)
	}

	/// Get the expected number of items this filter was sized for
	#[allow(dead_code)] // Public API for monitoring
	pub fn expected_items(&self) -> usize {
		self.expected_items
	}

	/// Estimate memory usage in bytes
	#[allow(dead_code)] // Public API for memory monitoring
	pub fn memory_usage(&self) -> usize {
		// Bloom filter uses approximately 10 bits per item at 1% FPR
		// That's 1.25 bytes per item
		(self.expected_items * 10) / 8
	}
}

/// Batch processor for streaming file operations
///
/// Processes files in chunks to balance memory usage and performance.
/// Default batch size is 10,000 files (~1.5MB of metadata).
#[allow(dead_code)] // Infrastructure for future batch processing
pub struct BatchProcessor {
	batch_size: usize,
}

#[allow(dead_code)] // Infrastructure methods for future use
impl BatchProcessor {
	/// Create a new batch processor with default batch size (10,000 files)
	pub fn new() -> Self {
		Self { batch_size: 10_000 }
	}

	/// Create a batch processor with custom batch size
	pub fn with_batch_size(batch_size: usize) -> Self {
		Self { batch_size }
	}

	/// Get the configured batch size
	pub fn batch_size(&self) -> usize {
		self.batch_size
	}
}

impl Default for BatchProcessor {
	fn default() -> Self {
		Self::new()
	}
}

/// State cache for incremental syncs
///
/// Stores the last sync state to enable fast incremental operations.
/// For future implementation.
pub struct StateCache {
	cache_path: PathBuf,
}

impl StateCache {
	/// Create a new state cache at the given path
	#[allow(dead_code)]
	pub fn new(cache_path: PathBuf) -> Self {
		Self { cache_path }
	}

	/// Get the cache file path
	#[allow(dead_code)]
	pub fn cache_path(&self) -> &Path {
		&self.cache_path
	}
}

#[cfg(test)]
mod tests {
	use super::*;
	use std::path::PathBuf;

	#[test]
	fn test_bloom_filter_basic() {
		let mut bloom = FileSetBloom::new(1000);

		let path1 = PathBuf::from("/test/file1.txt");
		let path2 = PathBuf::from("/test/file2.txt");
		let path3 = PathBuf::from("/test/file3.txt");

		// Insert path1 and path2
		bloom.insert(&path1);
		bloom.insert(&path2);

		// Should find inserted paths
		assert!(bloom.contains(&path1));
		assert!(bloom.contains(&path2));

		// Should not find path3 (very unlikely false positive with small set)
		// Note: This could theoretically fail due to false positive, but
		// with 1% FPR and only 2 items, probability is negligible
		assert!(!bloom.contains(&path3));
	}

	#[test]
	fn test_bloom_filter_many_items() {
		let mut bloom = FileSetBloom::new(10_000);

		// Insert 1000 paths
		let paths: Vec<PathBuf> = (0..1000).map(|i| PathBuf::from(format!("/test/file{}.txt", i))).collect();

		for path in &paths {
			bloom.insert(path);
		}

		// All inserted paths should be found
		for path in &paths {
			assert!(bloom.contains(path), "Should find inserted path: {:?}", path);
		}

		// Test a path that wasn't inserted
		let non_existent = PathBuf::from("/test/nonexistent.txt");

		// This should usually return false, but could be a false positive
		// We can't assert false here because of the probabilistic nature
		let _result = bloom.contains(&non_existent);
	}

	#[test]
	fn test_bloom_filter_memory_usage() {
		let bloom = FileSetBloom::new(1_000_000);

		// For 1M items with 1% FPR, we expect ~1.25 bytes per item
		// That's approximately 1.25 MB
		let memory = bloom.memory_usage();
		assert!(memory > 1_000_000, "Memory usage should be > 1MB");
		assert!(memory < 2_000_000, "Memory usage should be < 2MB");

		println!("Memory usage for 1M items: {} bytes ({:.2} MB)", memory, memory as f64 / 1_000_000.0);
	}

	#[test]
	fn test_batch_processor_default() {
		let processor = BatchProcessor::new();
		assert_eq!(processor.batch_size(), 10_000);
	}

	#[test]
	fn test_batch_processor_custom() {
		let processor = BatchProcessor::with_batch_size(5_000);
		assert_eq!(processor.batch_size(), 5_000);
	}

	#[test]
	fn test_state_cache_creation() {
		let cache = StateCache::new(PathBuf::from("/tmp/test.cache"));
		assert_eq!(cache.cache_path(), Path::new("/tmp/test.cache"));
	}
}