Skip to main content

delta/
hash.rs

1//! Karp-Rabin rolling hash (Karp & Rabin 1987; Section 2.1.3).
2//!
3//! Polynomial fingerprints over the Mersenne prime 2^61-1.
4//! Full 61-bit fingerprints are used for collision-free seed comparison;
5//! `fp_to_index` maps them into the bounded hash table via F mod q.
6
7use crate::types::{HASH_BASE, HASH_MOD};
8
9/// Reduce a u128 value modulo the Mersenne prime 2^61-1.
10///
11/// Uses the Mersenne identity: for M = 2^61-1, x mod M = (x >> 61) + (x & M),
12/// with a final correction if the result >= M. No division needed.
13#[inline]
14pub fn mod_mersenne(x: u128) -> u64 {
15    let m = HASH_MOD as u128;
16    let mut r = (x >> 61) + (x & m);
17    if r >= m {
18        r -= m;
19    }
20    // One more reduction in case the first wasn't enough (x >> 61 can be large)
21    let mut r2 = (r >> 61) + (r & m);
22    if r2 >= m {
23        r2 -= m;
24    }
25    r2 as u64
26}
27
28/// Compute the Karp-Rabin fingerprint of data[offset..offset+p] (Eq. 1, Section 2.1.3).
29///
30/// F(X) = (x_0 * b^{p-1} + x_1 * b^{p-2} + ... + x_{p-1}) mod (2^61-1)
31pub fn fingerprint(data: &[u8], offset: usize, p: usize) -> u64 {
32    let mut h: u64 = 0;
33    for i in 0..p {
34        h = mod_mersenne(h as u128 * HASH_BASE as u128 + data[offset + i] as u128);
35    }
36    h
37}
38
39/// Precompute HASH_BASE^{p-1} mod HASH_MOD.
40pub fn precompute_bp(p: usize) -> u64 {
41    if p == 0 {
42        return 1;
43    }
44    let mut result: u64 = 1;
45    let mut base = HASH_BASE;
46    let mut exp = p - 1;
47    // Modular exponentiation by squaring
48    while exp > 0 {
49        if exp & 1 == 1 {
50            result = mod_mersenne(result as u128 * base as u128);
51        }
52        base = mod_mersenne(base as u128 * base as u128);
53        exp >>= 1;
54    }
55    result
56}
57
58/// Map a full fingerprint to a hash table index (F mod q, Section 2.1.3).
59#[inline]
60pub fn fp_to_index(fp: u64, table_size: usize) -> usize {
61    (fp % table_size as u64) as usize
62}
63
64/// Rolling hash for O(1) incremental fingerprint updates.
65///
66/// After `new()`, call `roll()` to slide the window one byte to the right.
67pub struct RollingHash {
68    value: u64,
69    bp: u64, // HASH_BASE^{p-1} mod HASH_MOD
70}
71
72impl RollingHash {
73    /// Create a new RollingHash from data[offset..offset+p].
74    pub fn new(data: &[u8], offset: usize, p: usize) -> Self {
75        let bp = precompute_bp(p);
76        let value = fingerprint(data, offset, p);
77        RollingHash { value, bp }
78    }
79
80    /// Current fingerprint value.
81    #[inline]
82    pub fn value(&self) -> u64 {
83        self.value
84    }
85
86    /// Slide the window: remove `old_byte` from the left, add `new_byte` to the right (Eq. 2).
87    ///
88    /// F(X_{r+1}) = ((F(X_r) - old_byte * b^{p-1}) * b + new_byte) mod (2^61-1)
89    #[inline]
90    pub fn roll(&mut self, old_byte: u8, new_byte: u8) {
91        // Subtract old_byte * bp, using HASH_MOD to keep positive
92        let sub = mod_mersenne(old_byte as u128 * self.bp as u128);
93        let v = if self.value >= sub {
94            self.value - sub
95        } else {
96            HASH_MOD - (sub - self.value)
97        };
98        // Multiply by base and add new_byte
99        self.value = mod_mersenne(v as u128 * HASH_BASE as u128 + new_byte as u128);
100    }
101
102}
103
104// ── CRC-64/XZ checksum (ECMA-182 reflected) ──────────────────────────────
105
106use crate::types::DELTA_CRC_SIZE;
107
108/// CRC-64/XZ lookup table.  Reflected poly: 0xC96C5795D7870F42.
109/// Generated at compile time via const fn.
110const fn make_crc64_table() -> [u64; 256] {
111    let poly: u64 = 0xC96C5795D7870F42;
112    let mut table = [0u64; 256];
113    let mut i = 0usize;
114    while i < 256 {
115        let mut crc = i as u64;
116        let mut j = 0;
117        while j < 8 {
118            if crc & 1 != 0 {
119                crc = (crc >> 1) ^ poly;
120            } else {
121                crc >>= 1;
122            }
123            j += 1;
124        }
125        table[i] = crc;
126        i += 1;
127    }
128    table
129}
130
131static CRC64_TABLE: [u64; 256] = make_crc64_table();
132
133/// Compute CRC-64/XZ of `data`; returns 8 bytes big-endian.
134///
135/// Standard check value: crc64_xz(b"123456789") = 0x995DC9BBDF1939FA.
136/// Empty input: crc64_xz(b"") = 0x0000000000000000.
137pub fn crc64_xz(data: &[u8]) -> [u8; DELTA_CRC_SIZE] {
138    let mut crc: u64 = 0xFFFFFFFFFFFFFFFF;
139    for &byte in data {
140        crc = CRC64_TABLE[((crc ^ byte as u64) & 0xFF) as usize] ^ (crc >> 8);
141    }
142    (crc ^ 0xFFFFFFFFFFFFFFFF).to_be_bytes()
143}
144
145// ── Primality testing (for hash table auto-sizing) ───────────────────────
146
147/// Modular exponentiation: base^exp mod modulus (uses u128 to avoid overflow).
148fn power_mod(base: u64, mut exp: u64, modulus: u64) -> u64 {
149    if modulus == 1 {
150        return 0;
151    }
152    let m = modulus as u128;
153    let mut result: u128 = 1;
154    let mut b: u128 = base as u128 % m;
155    while exp > 0 {
156        if exp & 1 == 1 {
157            result = result * b % m;
158        }
159        exp >>= 1;
160        b = b * b % m;
161    }
162    result as u64
163}
164
165/// Factor n into d * 2^r, returning (d, r).
166fn get_d_r(mut n: u64) -> (u64, u32) {
167    let mut r: u32 = 0;
168    while n % 2 == 0 {
169        n /= 2;
170        r += 1;
171    }
172    (n, r)
173}
174
175/// The witness loop of the Miller-Rabin probabilistic primality test.
176///
177/// Returns `true` if `a` is a witness to the compositeness of `n`
178/// (i.e., n is definitely composite).  Returns `false` if `a` is a
179/// "liar" — n may be prime.
180fn witness(a: u64, n: u64) -> bool {
181    let (d, r) = get_d_r(n - 1);
182    let mut x = power_mod(a, d, n);
183    for _ in 0..r {
184        let y = power_mod(x, 2, n);
185        if y == 1 && x != 1 && x != n - 1 {
186            return true;
187        }
188        x = y;
189    }
190    x != 1
191}
192
193/// Fixed witnesses for deterministic Miller-Rabin.
194///
195/// Sufficient for all n < 3,317,044,064,679,887,385,961,981 (> 2^81).
196/// Jaeschke, Math. Comp. 61(204), 1993.
197const WITNESSES: &[u64] = &[2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37];
198
199/// Deterministic Miller-Rabin primality test.
200///
201/// Writes n-1 = 2^r * d, then for each witness a checks whether
202/// a^d ≡ 1 (mod n) or a^{2^j d} ≡ -1 (mod n) for some j < r.
203/// If neither holds, n is composite.  With the fixed witnesses above,
204/// the test is deterministic for all n relevant to hash table sizing.
205pub fn is_prime(n: usize) -> bool {
206    let n = n as u64;
207    if n < 2 || (n != 2 && n % 2 == 0) {
208        return false;
209    }
210    if n == 2 || n == 3 {
211        return true;
212    }
213    for &a in WITNESSES {
214        if a >= n { break; }
215        if witness(a, n) { return false; }
216    }
217    true
218}
219
220/// Smallest prime >= n.
221///
222/// Searches odd candidates upward from n.  By the prime number theorem,
223/// the expected gap is O(log n), so this terminates quickly.
224pub fn next_prime(n: usize) -> usize {
225    if n <= 2 {
226        return 2;
227    }
228    let mut candidate = if n % 2 == 0 { n + 1 } else { n };
229    while !is_prime(candidate) {
230        candidate += 2;
231    }
232    candidate
233}
234
235#[cfg(test)]
236mod tests {
237    use super::*;
238
239    #[test]
240    fn test_mod_mersenne_basic() {
241        assert_eq!(mod_mersenne(0), 0);
242        assert_eq!(mod_mersenne(HASH_MOD as u128), 0);
243        assert_eq!(mod_mersenne(HASH_MOD as u128 + 1), 1);
244        assert_eq!(mod_mersenne(42), 42);
245    }
246
247    #[test]
248    fn test_fingerprint_deterministic() {
249        let data = b"ABCDEFGHIJKLMNOP";
250        let fp = fingerprint(data, 0, 16);
251        assert_ne!(fp, 0);
252        assert_eq!(fp, fingerprint(data, 0, 16));
253    }
254
255    #[test]
256    fn test_rolling_hash_full_scan() {
257        let data = b"The quick brown fox jumps over the lazy dog.";
258        let p = 8;
259        let mut rh = RollingHash::new(data, 0, p);
260
261        for i in 1..=(data.len() - p) {
262            rh.roll(data[i - 1], data[i + p - 1]);
263            assert_eq!(
264                rh.value(),
265                fingerprint(data, i, p),
266                "mismatch at offset {}",
267                i
268            );
269        }
270    }
271
272    // ── Primality testing ────────────────────────────────────────────────
273
274    #[test]
275    fn test_get_d_r() {
276        assert_eq!(get_d_r(8), (1, 3));
277        assert_eq!(get_d_r(15), (15, 0));
278        let (d, r) = get_d_r(12);
279        assert_eq!(d, 3);
280        assert_eq!(r, 2);
281        assert_eq!(d * (1u64 << r), 12);
282    }
283
284    #[test]
285    fn test_witness_composite() {
286        assert!(witness(2, 9));    // 9 = 3^2 is composite
287        assert!(witness(2, 15));
288    }
289
290    #[test]
291    fn test_witness_prime() {
292        // No a in [2, 13) should be a witness for the prime 13
293        for a in 2..12 {
294            assert!(!witness(a, 13), "a={} should not be a witness for 13", a);
295        }
296    }
297
298    #[test]
299    fn test_known_primes() {
300        let primes = [
301            2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
302            53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113,
303            127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191,
304            193, 197, 199, 211, 223, 227, 229,
305        ];
306        for &p in &primes {
307            assert!(is_prime(p), "{} should be prime", p);
308        }
309    }
310
311    #[test]
312    fn test_known_composites() {
313        let composites = [
314            0, 1, 4, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20,
315            21, 25, 27, 33, 35, 49, 51, 55, 63, 65, 77, 91,
316            100, 121, 143, 169, 221, 1000, 1000000,
317        ];
318        for &c in &composites {
319            assert!(!is_prime(c), "{} should be composite", c);
320        }
321    }
322
323    #[test]
324    fn test_large_primes() {
325        assert!(is_prime(1048573));   // largest prime < 2^20
326        assert!(is_prime(2097143));   // largest prime < 2^21
327        assert!(is_prime(104729));    // 10000th prime
328    }
329
330    #[test]
331    fn test_carmichael_numbers() {
332        // Carmichael numbers pass the Fermat test for all bases
333        // but Miller-Rabin with random witnesses catches them.
334        let carmichaels = [561, 1105, 1729, 2465, 2821, 6601, 8911];
335        for &c in &carmichaels {
336            assert!(!is_prime(c), "Carmichael number {} should be composite", c);
337        }
338    }
339
340    #[test]
341    fn test_next_prime_composite() {
342        assert_eq!(next_prime(8), 11);
343        assert_eq!(next_prime(14), 17);
344        assert_eq!(next_prime(100), 101);
345        assert_eq!(next_prime(1000), 1009);
346    }
347
348    #[test]
349    fn test_next_prime_small() {
350        assert_eq!(next_prime(0), 2);
351        assert_eq!(next_prime(1), 2);
352        assert_eq!(next_prime(2), 2);
353        assert_eq!(next_prime(3), 3);
354    }
355
356    #[test]
357    fn test_next_prime_consecutive() {
358        // Verify next_prime produces valid primes for a range of inputs
359        for n in 2..500 {
360            let np = next_prime(n);
361            assert!(np >= n);
362            assert!(is_prime(np), "next_prime({}) = {} should be prime", n, np);
363        }
364    }
365
366    // ── CRC-64/XZ check values ────────────────────────────────────────────
367
368    #[test]
369    fn test_crc64_empty() {
370        // CRC-64/XZ of empty input is all-zeros.
371        assert_eq!(crc64_xz(b""), [0u8; 8]);
372    }
373
374    #[test]
375    fn test_crc64_check_value() {
376        // Standard check value: CRC-64/XZ of b"123456789" = 0x995DC9BBDF1939FA.
377        let expected: [u8; 8] = [0x99, 0x5D, 0xC9, 0xBB, 0xDF, 0x19, 0x39, 0xFA];
378        assert_eq!(crc64_xz(b"123456789"), expected);
379    }
380}