Skip to main content

hdf5_reader/
checksum.rs

1/// Jenkins lookup3 `hashlittle2` — used by HDF5 for superblock v2/v3 checksums,
2/// object header v2 checksums, and B-tree v2 checksums.
3///
4/// This is a direct translation of Bob Jenkins' lookup3.c `hashlittle2`.
5/// Reference: <http://burtleburtle.net/bob/c/lookup3.c>
6pub fn jenkins_lookup3(data: &[u8]) -> u32 {
7    let len = data.len();
8
9    // Internal state
10    let mut a: u32 = 0xdeadbeef_u32.wrapping_add(len as u32);
11    let mut b: u32 = a;
12    let mut c: u32 = a;
13
14    let mut offset = 0;
15
16    // Process 12-byte chunks.
17    // Note: the C code uses `while (length > 12)`, leaving the last 0–12
18    // bytes for the switch/final path.  Using `<` (not `<=`) matches that
19    // behaviour — otherwise an input whose length is an exact multiple of
20    // 12 would incorrectly run the last chunk through mix() instead of
21    // the add-then-final path.
22    while offset + 12 < len {
23        a = a.wrapping_add(u32::from_le_bytes([
24            data[offset],
25            data[offset + 1],
26            data[offset + 2],
27            data[offset + 3],
28        ]));
29        b = b.wrapping_add(u32::from_le_bytes([
30            data[offset + 4],
31            data[offset + 5],
32            data[offset + 6],
33            data[offset + 7],
34        ]));
35        c = c.wrapping_add(u32::from_le_bytes([
36            data[offset + 8],
37            data[offset + 9],
38            data[offset + 10],
39            data[offset + 11],
40        ]));
41        mix(&mut a, &mut b, &mut c);
42        offset += 12;
43    }
44
45    // Handle the last few bytes
46    let remaining = len - offset;
47
48    // Fill a, b, c with remaining bytes (little-endian)
49    if remaining > 0 {
50        // Process remaining bytes into a, b, c
51        let mut tail_a: u32 = 0;
52        let mut tail_b: u32 = 0;
53        let mut tail_c: u32 = 0;
54
55        // Bytes 0-3 go into a
56        for i in 0..std::cmp::min(remaining, 4) {
57            tail_a |= (data[offset + i] as u32) << (i * 8);
58        }
59        // Bytes 4-7 go into b
60        if remaining > 4 {
61            for i in 4..std::cmp::min(remaining, 8) {
62                tail_b |= (data[offset + i] as u32) << ((i - 4) * 8);
63            }
64        }
65        // Bytes 8-11 go into c
66        if remaining > 8 {
67            for i in 8..remaining {
68                tail_c |= (data[offset + i] as u32) << ((i - 8) * 8);
69            }
70        }
71
72        a = a.wrapping_add(tail_a);
73        b = b.wrapping_add(tail_b);
74        c = c.wrapping_add(tail_c);
75
76        final_mix(&mut a, &mut b, &mut c);
77    }
78
79    c
80}
81
82#[inline]
83fn mix(a: &mut u32, b: &mut u32, c: &mut u32) {
84    *a = a.wrapping_sub(*c);
85    *a ^= c.rotate_left(4);
86    *c = c.wrapping_add(*b);
87    *b = b.wrapping_sub(*a);
88    *b ^= a.rotate_left(6);
89    *a = a.wrapping_add(*c);
90    *c = c.wrapping_sub(*b);
91    *c ^= b.rotate_left(8);
92    *b = b.wrapping_add(*a);
93    *a = a.wrapping_sub(*c);
94    *a ^= c.rotate_left(16);
95    *c = c.wrapping_add(*b);
96    *b = b.wrapping_sub(*a);
97    *b ^= a.rotate_left(19);
98    *a = a.wrapping_add(*c);
99    *c = c.wrapping_sub(*b);
100    *c ^= b.rotate_left(4);
101    *b = b.wrapping_add(*a);
102}
103
104#[inline]
105fn final_mix(a: &mut u32, b: &mut u32, c: &mut u32) {
106    *c ^= *b;
107    *c = c.wrapping_sub(b.rotate_left(14));
108    *a ^= *c;
109    *a = a.wrapping_sub(c.rotate_left(11));
110    *b ^= *a;
111    *b = b.wrapping_sub(a.rotate_left(25));
112    *c ^= *b;
113    *c = c.wrapping_sub(b.rotate_left(16));
114    *a ^= *c;
115    *a = a.wrapping_sub(c.rotate_left(4));
116    *b ^= *a;
117    *b = b.wrapping_sub(a.rotate_left(14));
118    *c ^= *b;
119    *c = c.wrapping_sub(b.rotate_left(24));
120}
121
122/// Fletcher-32 checksum used by the HDF5 filter pipeline.
123///
124/// Matches the HDF5 library's `H5_checksum_fletcher32`: reads 16-bit
125/// big-endian words, accumulates in batches of 360, and reduces with
126/// `(x & 0xffff) + (x >> 16)` (not `% 65535`).
127pub fn fletcher32(data: &[u8]) -> u32 {
128    let mut sum1: u32 = 0;
129    let mut sum2: u32 = 0;
130    let total_words = data.len() / 2;
131
132    let mut offset = 0usize;
133    let mut remaining = total_words;
134
135    while remaining > 0 {
136        let batch = remaining.min(360);
137        remaining -= batch;
138
139        for _ in 0..batch {
140            let word = ((data[offset] as u32) << 8) | (data[offset + 1] as u32);
141            sum1 += word;
142            sum2 += sum1;
143            offset += 2;
144        }
145
146        sum1 = (sum1 & 0xffff) + (sum1 >> 16);
147        sum2 = (sum2 & 0xffff) + (sum2 >> 16);
148    }
149
150    // Final reduction
151    sum1 = (sum1 & 0xffff) + (sum1 >> 16);
152    sum2 = (sum2 & 0xffff) + (sum2 >> 16);
153
154    (sum2 << 16) | sum1
155}
156
157#[cfg(test)]
158mod tests {
159    use super::*;
160
161    #[test]
162    fn test_jenkins_empty() {
163        let hash = jenkins_lookup3(b"");
164        // Empty input produces a deterministic hash from the initial state
165        assert_ne!(hash, 0);
166    }
167
168    #[test]
169    fn test_jenkins_known_value() {
170        // Test with known input — the hash is deterministic
171        let h1 = jenkins_lookup3(b"hello");
172        let h2 = jenkins_lookup3(b"hello");
173        assert_eq!(h1, h2);
174
175        // Different input -> different hash (with overwhelming probability)
176        let h3 = jenkins_lookup3(b"world");
177        assert_ne!(h1, h3);
178    }
179
180    #[test]
181    fn test_fletcher32_simple() {
182        let data = [0x01, 0x02, 0x03, 0x04];
183        let checksum = fletcher32(&data);
184        // Verify it's deterministic
185        assert_eq!(checksum, fletcher32(&data));
186    }
187
188    #[test]
189    fn test_fletcher32_known_reference() {
190        // Known reference: 4 bytes [0x00, 0x01, 0x00, 0x02] interpreted as
191        // big-endian u16 words: 0x0001 and 0x0002.
192        // sum1 = (0 + 1) % 65535 = 1; sum2 = (0 + 1) % 65535 = 1
193        // sum1 = (1 + 2) % 65535 = 3; sum2 = (1 + 3) % 65535 = 4
194        // result = (4 << 16) | 3 = 0x0004_0003
195        let data = [0x00, 0x01, 0x00, 0x02];
196        assert_eq!(fletcher32(&data), 0x0004_0003);
197    }
198
199    #[test]
200    fn test_fletcher32_roundtrip_with_filter() {
201        // Build a payload + checksum and verify the filter can strip it
202        let payload = vec![0x00u8, 0x80, 0x3F, 0x80, 0x00, 0x00, 0x40, 0x00];
203        let ck = fletcher32(&payload);
204        let mut data = payload.clone();
205        data.extend_from_slice(&ck.to_le_bytes());
206        // The filter stores checksum in big-endian
207        let stripped = crate::filters::fletcher32::verify_and_strip(&data).unwrap();
208        assert_eq!(stripped, payload);
209    }
210}