msy 0.4.2

Modern musl rsync alternative - Fast, parallel file synchronization
Documentation
/// Scale-related utilities for handling millions of files efficiently
///
/// This module provides optimizations for large-scale syncs:
/// - Bloom filters for O(1) existence checks with minimal memory
/// - Batch processing to avoid loading all files into memory
/// - State caching for incremental syncs
use fastbloom::BloomFilter;
use std::path::{Path, PathBuf};

/// Memory-efficient file set using Bloom filter
///
/// Uses ~10 bits per item with 1% false positive rate.
/// For 1M files: ~1.2MB vs ~100MB for `HashSet<PathBuf>`
///
/// False positives mean we might check a file that doesn't exist,
/// but we'll never miss a file that does exist (no false negatives).
#[derive(Clone)]
pub struct FileSetBloom {
    bloom: BloomFilter,
    #[allow(dead_code)] // Tracking for capacity planning
    expected_items: usize,
}

impl FileSetBloom {
    /// Create a new Bloom filter for approximately `expected_items` files
    ///
    /// Uses a 1% false positive rate, which provides good memory efficiency
    /// while keeping false positive checks minimal.
    ///
    /// Memory usage: ~1.2 bytes per expected item (10 bits per item)
    pub fn new(expected_items: usize) -> Self {
        // False positive rate: 1% (0.01)
        // This gives us ~10 bits per item
        let bloom = BloomFilter::with_false_pos(0.01).expected_items(expected_items);

        Self {
            bloom,
            expected_items,
        }
    }

    /// Add a file path to the set
    pub fn insert(&mut self, path: &Path) {
        // Convert path to bytes for hashing
        let path_bytes = path.as_os_str().as_encoded_bytes();
        self.bloom.insert(path_bytes);
    }

    /// Check if a file path might be in the set
    ///
    /// Returns:
    /// - `true`: Path is probably in the set (may be false positive)
    /// - `false`: Path is definitely NOT in the set (no false negatives)
    ///
    /// For deletion checks: If this returns false, we know for certain the
    /// file doesn't exist in source, so it's safe to delete from destination.
    pub fn contains(&self, path: &Path) -> bool {
        let path_bytes = path.as_os_str().as_encoded_bytes();
        self.bloom.contains(path_bytes)
    }

    /// Get the expected number of items this filter was sized for
    #[allow(dead_code)] // Public API for monitoring
    pub fn expected_items(&self) -> usize {
        self.expected_items
    }

    /// Estimate memory usage in bytes
    #[allow(dead_code)] // Public API for memory monitoring
    pub fn memory_usage(&self) -> usize {
        // Bloom filter uses approximately 10 bits per item at 1% FPR
        // That's 1.25 bytes per item
        (self.expected_items * 10) / 8
    }
}

/// Batch processor for streaming file operations
///
/// Processes files in chunks to balance memory usage and performance.
/// Default batch size is 10,000 files (~1.5MB of metadata).
#[allow(dead_code)] // Infrastructure for future batch processing
pub struct BatchProcessor {
    batch_size: usize,
}

#[allow(dead_code)] // Infrastructure methods for future use
impl BatchProcessor {
    /// Create a new batch processor with default batch size (10,000 files)
    pub fn new() -> Self {
        Self { batch_size: 10_000 }
    }

    /// Create a batch processor with custom batch size
    pub fn with_batch_size(batch_size: usize) -> Self {
        Self { batch_size }
    }

    /// Get the configured batch size
    pub fn batch_size(&self) -> usize {
        self.batch_size
    }
}

impl Default for BatchProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// State cache for incremental syncs
///
/// Stores the last sync state to enable fast incremental operations.
/// For future implementation.
pub struct StateCache {
    cache_path: PathBuf,
}

impl StateCache {
    /// Create a new state cache at the given path
    #[allow(dead_code)]
    pub fn new(cache_path: PathBuf) -> Self {
        Self { cache_path }
    }

    /// Get the cache file path
    #[allow(dead_code)]
    pub fn cache_path(&self) -> &Path {
        &self.cache_path
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_bloom_filter_basic() {
        let mut bloom = FileSetBloom::new(1000);

        let path1 = PathBuf::from("/test/file1.txt");
        let path2 = PathBuf::from("/test/file2.txt");
        let path3 = PathBuf::from("/test/file3.txt");

        // Insert path1 and path2
        bloom.insert(&path1);
        bloom.insert(&path2);

        // Should find inserted paths
        assert!(bloom.contains(&path1));
        assert!(bloom.contains(&path2));

        // Should not find path3 (very unlikely false positive with small set)
        // Note: This could theoretically fail due to false positive, but
        // with 1% FPR and only 2 items, probability is negligible
        assert!(!bloom.contains(&path3));
    }

    #[test]
    fn test_bloom_filter_many_items() {
        let mut bloom = FileSetBloom::new(10_000);

        // Insert 1000 paths
        let paths: Vec<PathBuf> = (0..1000)
            .map(|i| PathBuf::from(format!("/test/file{}.txt", i)))
            .collect();

        for path in &paths {
            bloom.insert(path);
        }

        // All inserted paths should be found
        for path in &paths {
            assert!(
                bloom.contains(path),
                "Should find inserted path: {:?}",
                path
            );
        }

        // Test a path that wasn't inserted
        let non_existent = PathBuf::from("/test/nonexistent.txt");

        // This should usually return false, but could be a false positive
        // We can't assert false here because of the probabilistic nature
        let _result = bloom.contains(&non_existent);
    }

    #[test]
    fn test_bloom_filter_memory_usage() {
        let bloom = FileSetBloom::new(1_000_000);

        // For 1M items with 1% FPR, we expect ~1.25 bytes per item
        // That's approximately 1.25 MB
        let memory = bloom.memory_usage();
        assert!(memory > 1_000_000, "Memory usage should be > 1MB");
        assert!(memory < 2_000_000, "Memory usage should be < 2MB");

        println!(
            "Memory usage for 1M items: {} bytes ({:.2} MB)",
            memory,
            memory as f64 / 1_000_000.0
        );
    }

    #[test]
    fn test_batch_processor_default() {
        let processor = BatchProcessor::new();
        assert_eq!(processor.batch_size(), 10_000);
    }

    #[test]
    fn test_batch_processor_custom() {
        let processor = BatchProcessor::with_batch_size(5_000);
        assert_eq!(processor.batch_size(), 5_000);
    }

    #[test]
    fn test_state_cache_creation() {
        let cache = StateCache::new(PathBuf::from("/tmp/test.cache"));
        assert_eq!(cache.cache_path(), Path::new("/tmp/test.cache"));
    }
}