msy 0.4.2

Modern musl rsync alternative - Fast, parallel file synchronization
Documentation
//! Change ratio detection for delta sync optimization.
//!
//! Samples blocks from the source and destination files to estimate how much
//! has changed. If the change ratio is above a threshold (e.g., >75%),
//! delta sync would be inefficient and we should fall back to a full copy.
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;

/// Outcome of a change-ratio sampling pass, including the delta/full-copy decision
#[derive(Debug, Clone)]
pub struct ChangeRatioResult {
    /// Share of the sampled blocks that differed, expressed as a fraction
    /// (nominally 0.0 - 1.0; the constructor does not clamp it)
    pub change_ratio: f64,

    /// How many blocks were sampled
    pub blocks_sampled: usize,

    /// How many of the sampled blocks differed
    pub blocks_changed: usize,

    /// `true` when delta sync is recommended, `false` when a full copy is
    pub use_delta: bool,

    /// The threshold the decision in `use_delta` was made against
    pub threshold: f64,
}

impl ChangeRatioResult {
    /// Build a result and derive the delta-sync decision from it.
    ///
    /// `use_delta` is set when `change_ratio <= threshold`; the other
    /// arguments are stored unchanged.
    pub fn new(
        change_ratio: f64,
        blocks_sampled: usize,
        blocks_changed: usize,
        threshold: f64,
    ) -> Self {
        Self {
            // A ratio exactly at the threshold still counts as "use delta".
            use_delta: change_ratio <= threshold,
            change_ratio,
            blocks_sampled,
            blocks_changed,
            threshold,
        }
    }

    /// Render the change ratio as a percentage with one decimal place (e.g. `"25.0%"`)
    pub fn change_ratio_percent(&self) -> String {
        let percent = self.change_ratio * 100.0;
        format!("{:.1}%", percent)
    }
}

/// Sample blocks from source and destination to estimate change ratio
///
/// # Arguments
/// * `source` - Path to source file
/// * `dest` - Path to destination file
/// * `block_size` - Size of each block in bytes (must be non-zero)
/// * `sample_count` - Number of blocks to sample (default: 20)
/// * `threshold` - Change ratio threshold (default: 0.75 = 75%)
///
/// # Returns
/// * `Ok(ChangeRatioResult)` - Sampling result with decision
///
/// # Errors
/// * `ErrorKind::InvalidInput` if `block_size` is zero
/// * Any I/O error raised while opening, inspecting, seeking or reading
///   either file
///
/// # Algorithm
/// 1. Compare file sizes. If they differ by more than 50% of the destination
///    size (or the destination is empty), return immediately without
///    sampling: `change_ratio` is the size difference ratio capped at 1.0 and
///    `blocks_sampled` is 0. The delta decision is still made against
///    `threshold`.
/// 2. Split the destination into `block_size` blocks and clamp the sample
///    count to the number of blocks.
/// 3. Pick sample positions spread evenly from the first to the last block.
/// 4. Read the block at each position from both files and compare the bytes.
/// 5. Calculate change ratio = changed_blocks / sampled_blocks
/// 6. Recommend delta sync if ratio <= threshold
///
/// # Notes
/// - Block positions are derived from the destination size, so source data
///   beyond the destination's length is never sampled.
/// - This function applies no minimum-file-size cutoff; any such policy has
///   to be enforced by the caller.
pub fn estimate_change_ratio(
    source: &Path,
    dest: &Path,
    block_size: usize,
    sample_count: Option<usize>,
    threshold: Option<f64>,
) -> std::io::Result<ChangeRatioResult> {
    // A zero block size would otherwise panic with a division by zero below.
    if block_size == 0 {
        return Err(std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "block_size must be greater than zero",
        ));
    }
    let block_len = u64::try_from(block_size).map_err(|_| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            "block_size does not fit in a 64-bit file offset",
        )
    })?;

    let sample_count = sample_count.unwrap_or(20);
    let threshold = threshold.unwrap_or(0.75);

    // Open files
    let mut source_file = BufReader::with_capacity(256 * 1024, File::open(source)?);
    let mut dest_file = BufReader::with_capacity(256 * 1024, File::open(dest)?);

    // Get file sizes
    let source_size = source_file.get_ref().metadata()?.len();
    let dest_size = dest_file.get_ref().metadata()?.len();

    // If files are very different in size, change ratio is high
    let size_diff_ratio = if dest_size > 0 {
        (source_size as f64 - dest_size as f64).abs() / dest_size as f64
    } else {
        1.0
    };

    // If size differs by >50%, likely high change ratio
    if size_diff_ratio > 0.5 {
        tracing::debug!(
            "Size differs by {:.1}%, assuming high change ratio",
            size_diff_ratio * 100.0
        );
        return Ok(ChangeRatioResult::new(
            size_diff_ratio.min(1.0),
            0,
            0,
            threshold,
        ));
    }

    // Block arithmetic is done in u64 so files larger than `usize::MAX` bytes
    // (possible on 32-bit targets) are neither truncated nor overflowed.
    let total_blocks = dest_size.div_ceil(block_len);

    // Clamp sample count to total blocks
    let sample_count = sample_count.min(usize::try_from(total_blocks).unwrap_or(usize::MAX));

    // `dest_size > 0` past the early return, so there is at least one block;
    // `saturating_sub` just keeps this obviously panic-free.
    let last_block = total_blocks.saturating_sub(1);

    // Sample blocks and compare
    let mut blocks_changed = 0;
    let mut source_block = vec![0u8; block_size];
    let mut dest_block = vec![0u8; block_size];

    for i in 0..sample_count {
        // Spread samples evenly over [0, last_block]: the first sample hits
        // block 0 and the final sample hits the last block. Because
        // `sample_count <= total_blocks`, consecutive positions are distinct.
        // The multiplication is widened to u128 so it cannot overflow.
        let block_idx = if sample_count > 1 {
            let scaled = i as u128 * u128::from(last_block) / (sample_count - 1) as u128;
            u64::try_from(scaled).unwrap_or(last_block)
        } else {
            0
        };

        // `block_idx <= last_block`, so this offset is below `dest_size`.
        let offset = block_idx * block_len;

        // Seek to block position in both files
        source_file.seek(SeekFrom::Start(offset))?;
        dest_file.seek(SeekFrom::Start(offset))?;

        // Read blocks (looping over short reads; only EOF shortens a block)
        let source_read = read_full_block(&mut source_file, &mut source_block)?;
        let dest_read = read_full_block(&mut dest_file, &mut dest_block)?;

        // Both buffers are already in memory, so compare them directly.
        // Slices of different length compare unequal, which also covers the
        // case where one file ends earlier than the other.
        if source_block[..source_read] != dest_block[..dest_read] {
            blocks_changed += 1;
        }
    }

    // Calculate change ratio
    let change_ratio = if sample_count > 0 {
        blocks_changed as f64 / sample_count as f64
    } else {
        0.0
    };

    tracing::debug!(
        "Sampled {} blocks: {} changed ({:.1}%), threshold {:.1}%",
        sample_count,
        blocks_changed,
        change_ratio * 100.0,
        threshold * 100.0
    );

    Ok(ChangeRatioResult::new(
        change_ratio,
        sample_count,
        blocks_changed,
        threshold,
    ))
}

/// Fill `buf` from `reader`, stopping early only at end of input.
///
/// A single `Read::read` call may return fewer bytes than requested even when
/// more data is available, so this keeps reading until the buffer is full or
/// the reader reports EOF. Returns the number of bytes placed in `buf`.
fn read_full_block<R: Read>(reader: &mut R, buf: &mut [u8]) -> std::io::Result<usize> {
    let mut filled = 0;
    while filled < buf.len() {
        match reader.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            // An interrupted read is not a failure; simply retry.
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(filled)
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Size of the fixture files used by most tests (1MB).
    const ONE_MB: usize = 1024 * 1024;

    /// Block size handed to `estimate_change_ratio` in every test (64KB).
    const BLOCK_SIZE: usize = 64 * 1024;

    /// Write the two byte slices to `source.bin` / `dest.bin` in a fresh
    /// temporary directory and run the estimator on them.
    fn run_estimate(
        source_bytes: &[u8],
        dest_bytes: &[u8],
        sample_count: Option<usize>,
        threshold: Option<f64>,
    ) -> ChangeRatioResult {
        let dir = TempDir::new().unwrap();
        let source = dir.path().join("source.bin");
        let dest = dir.path().join("dest.bin");

        std::fs::write(&source, source_bytes).unwrap();
        std::fs::write(&dest, dest_bytes).unwrap();

        estimate_change_ratio(&source, &dest, BLOCK_SIZE, sample_count, threshold).unwrap()
    }

    /// A 1MB buffer of 42s whose first `changed_len` bytes are replaced by 99.
    fn with_changed_prefix(changed_len: usize) -> Vec<u8> {
        let mut bytes = vec![42u8; ONE_MB];
        bytes[..changed_len].fill(99);
        bytes
    }

    #[test]
    fn test_no_changes() {
        // Identical 1MB files
        let data = vec![42u8; ONE_MB];

        let result = run_estimate(&data, &data, None, None);

        assert_eq!(result.blocks_changed, 0);
        assert_eq!(result.change_ratio, 0.0);
        assert!(result.use_delta);
    }

    #[test]
    fn test_all_changed() {
        // Completely different 1MB files
        let source_data = vec![42u8; ONE_MB];
        let dest_data = vec![99u8; ONE_MB];

        let result = run_estimate(&source_data, &dest_data, None, None);

        assert_eq!(result.blocks_changed, result.blocks_sampled);
        assert_eq!(result.change_ratio, 1.0);
        assert!(!result.use_delta); // Should fallback to full copy
    }

    #[test]
    fn test_partial_change() {
        // 1MB files where the first 256KB (25%) of the source differs
        let source_data = with_changed_prefix(256 * 1024);
        let dest_data = vec![42u8; ONE_MB];

        let result = run_estimate(&source_data, &dest_data, None, None);

        // Should detect some changes, but not all
        assert!(result.blocks_changed > 0);
        assert!(result.blocks_changed < result.blocks_sampled);
        assert!(result.change_ratio > 0.0);
        assert!(result.change_ratio < 1.0);
        assert!(result.use_delta); // <75% changed, use delta
    }

    #[test]
    fn test_threshold_decision() {
        // 1MB files where the first 800KB (80%) of the source differs
        let source_data = with_changed_prefix(800 * 1024);
        let dest_data = vec![42u8; ONE_MB];

        // With default threshold (75%), should recommend full copy
        let default_threshold = run_estimate(&source_data, &dest_data, None, None);
        assert!(!default_threshold.use_delta);

        // With higher threshold (90%), should use delta
        let raised_threshold = run_estimate(&source_data, &dest_data, None, Some(0.90));
        assert!(raised_threshold.use_delta);
    }

    #[test]
    fn test_size_difference() {
        // Files with very different sizes
        let source_data = vec![42u8; 2 * ONE_MB]; // 2MB
        let dest_data = vec![42u8; ONE_MB]; // 1MB

        let result = run_estimate(&source_data, &dest_data, None, None);

        // >50% size difference should trigger high change ratio
        assert!(!result.use_delta);
    }

    #[test]
    fn test_small_sample_count() {
        let data = vec![42u8; ONE_MB];

        // Sample only 5 blocks
        let result = run_estimate(&data, &data, Some(5), None);

        assert_eq!(result.blocks_sampled, 5);
        assert_eq!(result.blocks_changed, 0);
        assert!(result.use_delta);
    }
}