msy 0.4.5

Modern musl rsync alternative - Fast, parallel file synchronization
Documentation
/// Change ratio detection for delta sync optimization
///
/// Samples blocks from the source and destination files to estimate how much
/// has changed. If the change ratio is above a threshold (e.g., >75%),
/// delta sync would be inefficient and we should fall back to a full copy.
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;

/// Outcome of a change-ratio estimate, including the delta-vs-full-copy decision.
#[derive(Debug, Clone)]
pub struct ChangeRatioResult {
	/// Estimated fraction of the file that differs, from 0.0 (nothing) to 1.0 (everything)
	pub change_ratio: f64,

	/// How many blocks were actually compared
	pub blocks_sampled: usize,

	/// How many of the compared blocks did not match
	pub blocks_changed: usize,

	/// `true` when delta sync is recommended, `false` when a full copy is
	pub use_delta: bool,

	/// The threshold `change_ratio` was compared against to set `use_delta`
	pub threshold: f64,
}

impl ChangeRatioResult {
	/// Build a result and derive the sync decision from it.
	///
	/// `use_delta` is set when `change_ratio` does not exceed `threshold`
	/// (a ratio exactly equal to the threshold still selects delta sync).
	pub fn new(change_ratio: f64, blocks_sampled: usize, blocks_changed: usize, threshold: f64) -> Self {
		Self {
			use_delta: change_ratio <= threshold,
			change_ratio,
			blocks_sampled,
			blocks_changed,
			threshold,
		}
	}

	/// Render the change ratio as a percentage with one decimal place, e.g. `"25.0%"`.
	pub fn change_ratio_percent(&self) -> String {
		let percent = self.change_ratio * 100.0;
		format!("{:.1}%", percent)
	}
}

/// Sample blocks from source and destination to estimate change ratio
///
/// # Arguments
/// * `source` - Path to source file
/// * `dest` - Path to destination file
/// * `block_size` - Size of each block in bytes (must be non-zero)
/// * `sample_count` - Number of blocks to sample (default: 20)
/// * `threshold` - Change ratio threshold (default: 0.75 = 75%)
///
/// # Returns
/// * `Ok(ChangeRatioResult)` - Sampling result with decision
/// * `Err` - I/O error during sampling, or invalid `block_size`
///
/// # Errors
/// * `ErrorKind::InvalidInput` if `block_size` is zero
/// * Any error from opening, stat-ing, seeking or reading either file
///
/// # Algorithm
/// 1. Read both file sizes. If they differ by more than 50% of the
///    destination size (or the destination is empty), return immediately
///    with the size difference ratio (capped at 1.0) and zero blocks sampled.
/// 2. Count blocks over the larger of the two files, so a tail that exists
///    in only one file can be sampled, and clamp the sample count to it.
/// 3. Pick block indices evenly spaced from the first block to the last.
/// 4. Read each sampled block in full from both files and compare the bytes
///    directly; a block that is shorter or missing on one side is changed.
/// 5. Change ratio = changed_blocks / sampled_blocks (0.0 if nothing was sampled)
/// 6. Recommend delta sync if ratio <= threshold
///
/// # Performance
/// - Samples ~20 blocks by default (configurable), so I/O is bounded by
///   `sample_count * block_size` per file regardless of file size
/// - This function applies no minimum-file-size cutoff itself; skipping
///   small files is left to the caller
pub fn estimate_change_ratio(
	source: &Path, dest: &Path, block_size: usize, sample_count: Option<usize>, threshold: Option<f64>,
) -> std::io::Result<ChangeRatioResult> {
	// A zero block size would divide by zero when counting blocks below.
	if block_size == 0 {
		return Err(std::io::Error::new(
			std::io::ErrorKind::InvalidInput,
			"block_size must be greater than zero",
		));
	}

	let sample_count = sample_count.unwrap_or(20);
	let threshold = threshold.unwrap_or(0.75);

	// Open files
	let mut source_file = BufReader::with_capacity(256 * 1024, File::open(source)?);
	let mut dest_file = BufReader::with_capacity(256 * 1024, File::open(dest)?);

	// Get file sizes
	let source_size = source_file.get_ref().metadata()?.len();
	let dest_size = dest_file.get_ref().metadata()?.len();

	// If files are very different in size, change ratio is high
	let size_diff_ratio = if dest_size > 0 { (source_size as f64 - dest_size as f64).abs() / dest_size as f64 } else { 1.0 };

	// If size differs by >50%, likely high change ratio
	if size_diff_ratio > 0.5 {
		tracing::debug!("Size differs by {:.1}%, assuming high change ratio", size_diff_ratio * 100.0);
		return Ok(ChangeRatioResult::new(size_diff_ratio.min(1.0), 0, 0, threshold));
	}

	// All offset arithmetic is done in u64 so files larger than 4 GiB are
	// handled correctly on 32-bit targets as well.
	let block_len = block_size as u64;

	// Cover the larger file: blocks past the end of the shorter file exist on
	// one side only and must be eligible for sampling.
	let total_blocks = source_size.max(dest_size).div_ceil(block_len);

	// Clamp sample count to total blocks. If the block count does not even
	// fit in usize, the requested sample count is necessarily smaller.
	let sample_count = usize::try_from(total_blocks).map_or(sample_count, |total| sample_count.min(total));

	// Sample blocks and compare
	let mut blocks_changed = 0usize;
	let mut source_block = vec![0u8; block_size];
	let mut dest_block = vec![0u8; block_size];

	for i in 0..sample_count {
		let offset = sample_block_index(i, sample_count, total_blocks) * block_len;

		// Seek to block position in both files
		source_file.seek(SeekFrom::Start(offset))?;
		dest_file.seek(SeekFrom::Start(offset))?;

		// Read whole blocks; a single `read` may return early before EOF,
		// which would make identical blocks look different.
		let source_read = read_block(&mut source_file, &mut source_block)?;
		let dest_read = read_block(&mut dest_file, &mut dest_block)?;

		// Slice comparison covers both the length and the content check, and
		// is exact (no hash collisions) while touching each byte only once.
		if source_block[..source_read] != dest_block[..dest_read] {
			blocks_changed += 1;
		}
	}

	// Calculate change ratio
	let change_ratio = if sample_count > 0 { blocks_changed as f64 / sample_count as f64 } else { 0.0 };

	tracing::debug!(
		"Sampled {} blocks: {} changed ({:.1}%), threshold {:.1}%",
		sample_count,
		blocks_changed,
		change_ratio * 100.0,
		threshold * 100.0
	);

	Ok(ChangeRatioResult::new(change_ratio, sample_count, blocks_changed, threshold))
}

/// Index of the `i`-th of `sample_count` samples, spread evenly over `0..total_blocks`.
///
/// The first sample is block 0 and the last is the final block. The
/// multiplication happens before the division so the spacing does not
/// collapse when `total_blocks` is not a multiple of `sample_count - 1`.
fn sample_block_index(i: usize, sample_count: usize, total_blocks: u64) -> u64 {
	if sample_count <= 1 || total_blocks <= 1 {
		return 0;
	}
	let last_block = total_blocks - 1;
	// u128 keeps `i * last_block` from overflowing for very large inputs.
	let scaled = i as u128 * u128::from(last_block) / (sample_count as u128 - 1);
	u64::try_from(scaled).map_or(last_block, |idx| idx.min(last_block))
}

/// Fill `buf` from `reader`, stopping early only at end of file.
///
/// Returns the number of bytes placed in `buf`. Interrupted reads are retried.
fn read_block<R: Read>(reader: &mut R, buf: &mut [u8]) -> std::io::Result<usize> {
	let mut filled = 0;
	while filled < buf.len() {
		match reader.read(&mut buf[filled..]) {
			Ok(0) => break,
			Ok(n) => filled += n,
			Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
			Err(err) => return Err(err),
		}
	}
	Ok(filled)
}

#[cfg(test)]
mod tests {
	use super::*;
	use std::path::PathBuf;
	use tempfile::TempDir;

	/// Size of every fixture file unless a test says otherwise.
	const MIB: usize = 1024 * 1024;
	/// Block size passed to `estimate_change_ratio` in all tests.
	const BLOCK: usize = 64 * 1024;

	/// Write `source_bytes` / `dest_bytes` to `source.bin` / `dest.bin` in a fresh temp dir.
	///
	/// The `TempDir` guard is returned so the files stay on disk until the
	/// calling test drops it.
	fn write_pair(source_bytes: &[u8], dest_bytes: &[u8]) -> (TempDir, PathBuf, PathBuf) {
		let dir = TempDir::new().unwrap();
		let source = dir.path().join("source.bin");
		let dest = dir.path().join("dest.bin");
		std::fs::write(&source, source_bytes).unwrap();
		std::fs::write(&dest, dest_bytes).unwrap();
		(dir, source, dest)
	}

	/// A 1MB buffer of 42s whose first `changed_len` bytes are overwritten with 99.
	fn with_changed_prefix(changed_len: usize) -> Vec<u8> {
		let mut bytes = vec![42u8; MIB];
		bytes[..changed_len].fill(99);
		bytes
	}

	#[test]
	fn test_no_changes() {
		// Identical 1MB files
		let payload = vec![42u8; MIB];
		let (_guard, source, dest) = write_pair(&payload, &payload);

		let result = estimate_change_ratio(&source, &dest, BLOCK, None, None).unwrap();

		assert_eq!(result.blocks_changed, 0);
		assert_eq!(result.change_ratio, 0.0);
		assert!(result.use_delta);
	}

	#[test]
	fn test_all_changed() {
		// Same size, every byte different
		let (_guard, source, dest) = write_pair(&vec![42u8; MIB], &vec![99u8; MIB]);

		let result = estimate_change_ratio(&source, &dest, BLOCK, None, None).unwrap();

		assert_eq!(result.blocks_changed, result.blocks_sampled);
		assert_eq!(result.change_ratio, 1.0);
		assert!(!result.use_delta); // Should fallback to full copy
	}

	#[test]
	fn test_partial_change() {
		// First 256KB (25%) of the source differs from the destination
		let (_guard, source, dest) = write_pair(&with_changed_prefix(256 * 1024), &vec![42u8; MIB]);

		let result = estimate_change_ratio(&source, &dest, BLOCK, None, None).unwrap();

		// Should detect some changes, but not all
		assert!(result.blocks_changed > 0);
		assert!(result.blocks_changed < result.blocks_sampled);
		assert!(result.change_ratio > 0.0);
		assert!(result.change_ratio < 1.0);
		assert!(result.use_delta); // <75% changed, use delta
	}

	#[test]
	fn test_threshold_decision() {
		// First 800KB (80%) of the source differs from the destination
		let (_guard, source, dest) = write_pair(&with_changed_prefix(800 * 1024), &vec![42u8; MIB]);

		// With default threshold (75%), should recommend full copy
		let strict = estimate_change_ratio(&source, &dest, BLOCK, None, None).unwrap();
		assert!(!strict.use_delta);

		// With higher threshold (90%), should use delta
		let lenient = estimate_change_ratio(&source, &dest, BLOCK, None, Some(0.90)).unwrap();
		assert!(lenient.use_delta);
	}

	#[test]
	fn test_size_difference() {
		// 2MB source against a 1MB destination
		let (_guard, source, dest) = write_pair(&vec![42u8; 2 * MIB], &vec![42u8; MIB]);

		let result = estimate_change_ratio(&source, &dest, BLOCK, None, None).unwrap();

		// >50% size difference should trigger high change ratio
		assert!(!result.use_delta);
	}

	#[test]
	fn test_small_sample_count() {
		let payload = vec![42u8; MIB];
		let (_guard, source, dest) = write_pair(&payload, &payload);

		// Sample only 5 blocks
		let result = estimate_change_ratio(&source, &dest, BLOCK, Some(5), None).unwrap();

		assert_eq!(result.blocks_sampled, 5);
		assert_eq!(result.blocks_changed, 0);
		assert!(result.use_delta);
	}
}