msy 0.4.6 - Docs.rs

/// Adler-32 rolling hash implementation
///
/// This is the weak checksum used by rsync for fast matching.
/// It's designed to be updated incrementally as a window slides
/// through data.
///
/// Adler-32 computes two 16-bit sums:
/// - A: sum of all bytes
/// - B: sum of (n-i+1) * byte\[i\] for each byte
///
/// The final checksum is (B << 16) | A
///
/// Note: We don't maintain the window in memory since the hash state
/// (a, b) is sufficient for O(1) rolling updates.
#[derive(Debug, Clone)]
pub struct Adler32 {
	a: u32,
	b: u32,
	#[allow(dead_code)]
	block_size: usize,
	// Precomputed table for (block_size * byte) % MOD_ADLER
	// Index is the byte value [0..255]
	n_mod_table: [u32; 256],
}

const MOD_ADLER: u32 = 65521; // Largest prime < 2^16

impl Adler32 {
	/// Create a new Adler-32 hasher
	pub fn new(block_size: usize) -> Self {
		// Precompute lookup table for (block_size * i) % MOD_ADLER
		let mut n_mod_table = [0u32; 256];
		let n = block_size as u32;
		for (i, entry) in n_mod_table.iter_mut().enumerate() {
			*entry = (n * (i as u32)) % MOD_ADLER;
		}

		Self { a: 1, b: 0, block_size, n_mod_table }
	}

	/// Hash a block of data (non-rolling)
	pub fn hash(data: &[u8]) -> u32 {
		let mut a: u32 = 1;
		let mut b: u32 = 0;

		// Process in chunks to defer modulo
		// N=5552: largest n such that 255 * n * (n+1) / 2 + (n+1) * (MOD_ADLER-1) <= 2^32 - 1
		// Safe value is 3800 to be conservative with B accumulation starting from non-zero
		const CHUNK_SIZE: usize = 3800;

		for chunk in data.chunks(CHUNK_SIZE) {
			for &byte in chunk {
				a += byte as u32;
				b += a;
			}
			a %= MOD_ADLER;
			b %= MOD_ADLER;
		}

		(b << 16) | a
	}

	/// Initialize with a full block
	pub fn update_block(&mut self, block: &[u8]) {
		self.a = 1;
		self.b = 0;

		const CHUNK_SIZE: usize = 3800;

		for chunk in block.chunks(CHUNK_SIZE) {
			for &byte in chunk {
				self.a += byte as u32;
				self.b += self.a;
			}
			self.a %= MOD_ADLER;
			self.b %= MOD_ADLER;
		}
	}

	/// Roll the hash: remove old byte, add new byte
	/// This is the key operation for rsync algorithm
	///
	/// Optimized implementation using lookup table and conditional subtraction
	/// to avoid expensive division/modulo operations.
	#[inline]
	pub fn roll(&mut self, old_byte: u8, new_byte: u8) {
		let old = old_byte as u32;
		let new = new_byte as u32;

		// Update A: remove old byte, add new byte
		// Formula: A = (A - old + new) % M
		// Logic:
		// 1. A + new can overflow 65521 slightly (max 65520 + 255 = 65775)
		// 2. Subtract old.
		// 3. If negative (wrapped), add MOD_ADLER.

		let mut a = self.a + new;
		if old > a {
			// Handle wrap-around conceptually (a - old < 0)
			a += MOD_ADLER;
		}
		a -= old;
		if a >= MOD_ADLER {
			a -= MOD_ADLER;
		}
		self.a = a;

		// Update B: remove contribution of old byte across all positions
		// Formula: B = (B - n*old + A - 1) % M
		//
		// 1. B + A - 1. Max ~131000.
		// 2. Subtract (n*old)%M from lookup table.
		// 3. Normalize.

		let n_old_mod = self.n_mod_table[old_byte as usize];

		let mut b = self.b + self.a;
		// b + a can be up to ~131040.
		// We want to subtract (n_old_mod + 1).
		// n_old_mod is < 65521. 1 is 1.
		// So we subtract up to 65522.

		let sub = n_old_mod + 1;
		if sub > b {
			b += MOD_ADLER * 2; // Add enough multiples to ensure positivity
		}
		b -= sub;

		// Now b is positive. Normalize.
		// Since we added at most 2*M, and original sum was ~2*M, result is ~4*M max.
		// Fast modulo by subtraction loop (at most 3-4 iterations)
		while b >= MOD_ADLER {
			b -= MOD_ADLER;
		}
		self.b = b;
	}

	/// Get the current hash value
	pub fn digest(&self) -> u32 {
		(self.b << 16) | self.a
	}

	/// Reset the hasher
	#[allow(dead_code)]
	pub fn reset(&mut self) {
		self.a = 1;
		self.b = 0;
	}
}

#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn test_adler32_basic() {
		let data = b"hello world";
		let hash = Adler32::hash(data);
		assert_ne!(hash, 0);
		assert_ne!(hash, 1);
	}

	#[test]
	fn test_adler32_deterministic() {
		let data = b"test data 123";
		assert_eq!(Adler32::hash(data), Adler32::hash(data));
	}

	#[test]
	fn test_adler32_rolling() {
		let data = b"abcdefghijklmnop";
		let block_size = 4;

		// Hash first block statically
		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..4]); // "abcd"
		let hash1 = hasher.digest();

		// Roll to next block
		hasher.roll(data[0], data[4]); // Remove 'a', add 'e'
		let hash2 = hasher.digest();

		// Verify rolling matches static hash
		let expected = Adler32::hash(&data[1..5]); // "bcde"
		assert_eq!(hash2, expected);

		// Hashes should be different
		assert_ne!(hash1, hash2);
	}

	#[test]
	fn test_adler32_rolling_correctness() {
		// Test that rolling hash matches static hash for entire sequence
		let data = b"The quick brown fox jumps over the lazy dog";
		let block_size = 8;

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		for i in 1..=(data.len() - block_size) {
			// Roll to next position
			hasher.roll(data[i - 1], data[i + block_size - 1]);

			// Verify against static hash
			let expected = Adler32::hash(&data[i..i + block_size]);
			assert_eq!(hasher.digest(), expected, "Rolling hash mismatch at position {}", i);
		}
	}

	#[test]
	fn test_adler32_different_data() {
		assert_ne!(Adler32::hash(b"abc"), Adler32::hash(b"def"));
		assert_ne!(Adler32::hash(b"test"), Adler32::hash(b"TEST"));
	}

	#[test]
	fn test_adler32_empty() {
		let hash = Adler32::hash(b"");
		assert_eq!(hash, 1); // Adler-32 of empty data is 1
	}

	#[test]
	fn test_adler32_rolling_large_block() {
		// Test with large block size (128KB)
		let block_size = 128 * 1024;
		let mut data = vec![0u8; block_size * 2];
		for (i, byte) in data.iter_mut().enumerate() {
			*byte = (i % 256) as u8;
		}

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		// Roll forward
		hasher.roll(data[0], data[block_size]);
		let expected = Adler32::hash(&data[1..block_size + 1]);
		assert_eq!(hasher.digest(), expected);
	}

	#[test]
	fn test_adler32_rolling_all_zeros() {
		// Edge case: all zeros
		let data = [0u8; 100];
		let block_size = 10;

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		for i in 1..=(data.len() - block_size) {
			hasher.roll(data[i - 1], data[i + block_size - 1]);
			let expected = Adler32::hash(&data[i..i + block_size]);
			assert_eq!(hasher.digest(), expected);
		}
	}

	#[test]
	fn test_adler32_rolling_all_ones() {
		// Edge case: all 0xFF
		let data = [0xFF; 100];
		let block_size = 16;

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		for i in 1..=(data.len() - block_size) {
			hasher.roll(data[i - 1], data[i + block_size - 1]);
			let expected = Adler32::hash(&data[i..i + block_size]);
			assert_eq!(hasher.digest(), expected);
		}
	}

	#[test]
	fn test_adler32_rolling_repeating_pattern() {
		// Test with repeating pattern
		let pattern = b"ABCD";
		let mut data = Vec::with_capacity(100 * pattern.len());
		for _ in 0..100 {
			data.extend_from_slice(pattern);
		}
		let block_size = 32;

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		for i in 1..=(data.len() - block_size) {
			hasher.roll(data[i - 1], data[i + block_size - 1]);
			let expected = Adler32::hash(&data[i..i + block_size]);
			assert_eq!(hasher.digest(), expected, "Mismatch at position {} with repeating pattern", i);
		}
	}

	#[test]
	fn test_adler32_rolling_modulo_boundary() {
		// Test near MOD_ADLER boundary
		// Create data that will push checksums close to MOD_ADLER
		let block_size = 256;
		let data = vec![0xFF; block_size * 3];

		let mut hasher = Adler32::new(block_size);
		hasher.update_block(&data[0..block_size]);

		for i in 1..block_size {
			hasher.roll(data[i - 1], data[i + block_size - 1]);
			let expected = Adler32::hash(&data[i..i + block_size]);
			assert_eq!(hasher.digest(), expected, "Modulo boundary test failed at position {}", i);
		}
	}
}