Skip to main content

frozen_core/
crc32.rs

1//! Implementation of CRC32C (Castagnoli polynomial) to compute a 32-bit cyclic redundancy check (CRC) using
2//! Castagnoli polynomial, intended for data integrity verification for torn writes and curruption detection
3//!
4//! > [!WARNING]
5//! > We assume little-endian target architecture, while big-endian architectures are not supported
6//!
7//! > [!IMPORTANT]
8//! > The generated 32-bit CRC is not cryptographically secure, it's intended use only is for data integrity in IO ops
9//!
10//! ## What is CRC?
11//!
12//! Cyclic Redundancy Check (CRC) is an err detecting code, e.g. 32 bits, used for data integrity in networking
13//! protocols and storage systems, to check whether data has been altered, partially written/send or corrupted
14//!
15//! For a given buffer of data, e.g. `[u8; 0x100]`, we first compute a 32-bit crc, send/store it w/ the data, and
16//! when received/read, again compute crc on the buffer and match it w/ the original, to check for integrity
17//!
18//! ## CRC32C
19//!
20//! CRC32C is a specific version of crc algo, which generates 32-bit crc using the Castagnoli polynomial
21//!
22//! It uses polynomial arithmetic in GF(2), the generator polynomial for CRC32C is `0x1EDC6F41`, and we use the
23//! reflected form `0x82F63B78`, which is required by little-endian streaming algorithms
24//!
25//! ## Benchmarks
26//!
27//! Look at following benches for the throughout and latency measurements,
28//!
29//! ```md
30//! | Mode | Size    | Time (ns / µs)        | Throughput (GiB/s) |
31//! |:----:|:-------:|:---------------------:|:------------------:|
32//! | 1x   | 4 KiB   | 318 ns                | 11.97              |
33//! | 2x   | 4 KiB   | 340 ns                | 11.24              |
34//! | 4x   | 4 KiB   | 451 ns                | 8.46               |
35//! | 1x   | 64 KiB  | 5.44 µs               | 11.23              |
36//! | 2x   | 64 KiB  | 5.26 µs               | 11.60              |
37//! | 4x   | 64 KiB  | 7.50 µs               | 8.14               |
38//! | 1x   | 1 MiB   | 89.80 µs              | 10.88              |
39//! | 2x   | 1 MiB   | 90.56 µs              | 10.78              |
40//! | 4x   | 1 MiB   | 120.04 µs             | 8.13               |
41//! ```
42//!
43//! Environment used for benching,
44//!
45//! - OS: NixOS (WSL2)
46//! - Architecture: x86_64
47//! - Memory: 8 GiB RAM (DDR4)
48//! - Backend: Hardware (SSE4.2)
49//! - Rust: rustc 1.86.0 w/ cargo 1.86.0
50//! - Kernel: Linux 6.6.87.2-microsoft-standard-WSL2
51//! - CPU: Intel® Core™ i5-10300H @ 2.50GHz (4C / 8T)
52//!
53//! ## Example
54//!
55//! ```
56//! let crc = frozen_core::crc32::Crc32C::default();
57//!
58//! let b0: [u8; 8] = *b"12345678";
59//! let b1: [u8; 8] = *b"ABCDEFGH";
60//! let b2: [u8; 8] = *b"abcdefgh";
61//! let b3: [u8; 8] = *b"zyxwvuts";
62//!
63//! assert_eq!(crc.crc(&b0), crc.crc(&b0));
64//! assert_eq!(crc.crc_2x([&b0, &b1]),crc.crc_2x([&b0, &b1]));
65//! assert_eq!(crc.crc_4x([&b0, &b1, &b2, &b3]),crc.crc_4x([&b0, &b1, &b2, &b3]));
66//! ```
67
68#[derive(Debug, PartialEq, Clone, Copy)]
69enum Backend {
70    Hardware,
71    Software,
72}
73
74type TCrc = u32;
75type TBuf = [u8];
76
77/// Implementation of CRC32C (Castagnoli polynomial) to compute a 32-bit cyclic redundancy check (CRC) using
78/// Castagnoli polynomial
79///
80/// ## Example
81///
82/// ```
83/// let crc = frozen_core::crc32::Crc32C::default();
84///
85/// let b0: [u8; 8] = *b"12345678";
86/// let b1: [u8; 8] = *b"ABCDEFGH";
87/// let b2: [u8; 8] = *b"abcdefgh";
88/// let b3: [u8; 8] = *b"zyxwvuts";
89///
90/// assert_eq!(crc.crc(&b0), crc.crc(&b0));
91/// assert_eq!(crc.crc_2x([&b0, &b1]),crc.crc_2x([&b0, &b1]));
92/// assert_eq!(crc.crc_4x([&b0, &b1, &b2, &b3]),crc.crc_4x([&b0, &b1, &b2, &b3]));
93/// ```
94#[derive(Debug, PartialEq, Clone, Copy)]
95pub struct Crc32C(Backend);
96
97impl Crc32C {
98    /// Create a new instance of [`Crc32C`]
99    ///
100    /// ## CRC backend (Hardware vs Software)
101    ///
102    /// When the new instance is created, we select the fastest available backend,
103    ///
104    /// - On x86_64 we use `sse4.2` SIMD instructions when available
105    /// - On aarch64 we use ARMv8 `crc` instructions when available
106    /// - Otherwise, fallback to portable software implementation
107    ///
108    /// Hardware acceleration is significantly faster than the software fallback, while both backend producing
109    /// exact same CRC values
110    ///
111    /// ## Example
112    ///
113    /// ```
114    /// let crc = frozen_core::crc32::Crc32C::default();
115    ///
116    /// let b0: [u8; 8] = *b"12345678";
117    /// let b1: [u8; 8] = *b"ABCDEFGH";
118    /// let b2: [u8; 8] = *b"abcdefgh";
119    /// let b3: [u8; 8] = *b"zyxwvuts";
120    ///
121    /// assert_eq!(crc.crc(&b0), crc.crc(&b0));
122    /// assert_eq!(crc.crc_2x([&b0, &b1]),crc.crc_2x([&b0, &b1]));
123    /// assert_eq!(crc.crc_4x([&b0, &b1, &b2, &b3]),crc.crc_4x([&b0, &b1, &b2, &b3]));
124    /// ```
125    pub fn new() -> Self {
126        #[cfg(target_arch = "x86_64")]
127        if std::is_x86_feature_detected!("sse4.2") {
128            return Self(Backend::Hardware);
129        }
130
131        #[cfg(target_arch = "aarch64")]
132        if std::arch::is_aarch64_feature_detected!("crc") {
133            return Self(Backend::Hardware);
134        }
135
136        Self(Backend::Software)
137    }
138
139    /// Checks whether hardware acceleration is available at runtime
140    /// ```
141    /// let crc = frozen_core::crc32::Crc32C::default();
142    ///
143    /// #[cfg(target_arch = "x86_64")]
144    /// let expected = std::is_x86_feature_detected!("sse4.2");
145    ///
146    /// #[cfg(target_arch = "aarch64")]
147    /// let expected = std::arch::is_aarch64_feature_detected!("crc");
148    ///
149    /// assert_eq!(crc.is_hardware_acceleration_available(), expected);    
150    /// ```
151    pub fn is_hardware_acceleration_available(&self) -> bool {
152        self.0 == Backend::Hardware
153    }
154
155    /// Compute CRC32C for given `buf` to generate 32-bit crc
156    ///
157    /// **NOTE:** Length of `buf` must be a multiple of 8
158    ///
159    /// At runtime we choose the fastest available backend,
160    ///
161    /// - x86_64: `sse4.2` when available
162    /// - aarch64: ArmV8 `crc` instruction when available
163    /// - fallback: software castagnoli impl
164    ///
165    /// All backends produce identical CRC32C values
166    ///
167    /// ## Example
168    ///
169    /// ```
170    /// let crc = frozen_core::crc32::Crc32C::default();
171    ///
172    /// let b0: [u8; 8] = *b"12345678";
173    /// assert_eq!(crc.crc(&b0), crc.crc(&b0));
174    /// ```
175    pub fn crc(&self, buf: &TBuf) -> TCrc {
176        match self.0 {
177            Backend::Software => crc32c_slice8(buf),
178            Backend::Hardware => unsafe { crc32c_hardware(buf) },
179        }
180    }
181
182    /// Compute CRC32C for two bufs in parallel using `Instruction Level Parallelism` to generate 32-bit crc
183    /// for each buf (mapped one-to-one)
184    ///
185    /// **NOTE:** Length of each `buf` must be equal and a multiple of 8
186    ///
187    /// At runtime we choose the fastest available backend,
188    ///
189    /// - x86_64: `sse4.2` when available
190    /// - aarch64: ArmV8 `crc` instruction when available
191    /// - fallback: software castagnoli impl
192    ///
193    /// All backends produce identical CRC32C values
194    ///
195    /// ## Example
196    ///
197    /// ```
198    /// let crc = frozen_core::crc32::Crc32C::default();
199    ///
200    /// let b0: [u8; 8] = *b"12345678";
201    /// let b1: [u8; 8] = *b"ABCDEFGH";
202    ///
203    /// assert_eq!(crc.crc_2x([&b0, &b1]),crc.crc_2x([&b0, &b1]));
204    /// ```
205    pub fn crc_2x(&self, buffers: [&TBuf; 2]) -> [TCrc; 2] {
206        match self.0 {
207            Backend::Software => crc32c_slice8_2x(buffers),
208            Backend::Hardware => unsafe { crc32c_hardware_2x(buffers) },
209        }
210    }
211
212    /// Compute CRC32C for four bufs in parallel using `Instruction Level Parallelism` to generate 32-bit crc
213    /// for each buf (mapped one-to-one)
214    ///
215    /// **NOTE:** Length of each `buf` must be equal and a multiple of 8
216    ///
217    /// At runtime we choose the fastest available backend,
218    ///
219    /// - x86_64: `sse4.2` when available
220    /// - aarch64: ArmV8 `crc` instruction when available
221    /// - fallback: software castagnoli impl
222    ///
223    /// All backends produce identical CRC32C values
224    /// ## Example
225    ///
226    /// ```
227    /// let crc = frozen_core::crc32::Crc32C::default();
228    ///
229    /// let b0: [u8; 8] = *b"12345678";
230    /// let b1: [u8; 8] = *b"ABCDEFGH";
231    /// let b2: [u8; 8] = *b"abcdefgh";
232    /// let b3: [u8; 8] = *b"zyxwvuts";
233    ///
234    /// assert_eq!(crc.crc_4x([&b0, &b1, &b2, &b3]),crc.crc_4x([&b0, &b1, &b2, &b3]));
235    /// ```
236    pub fn crc_4x(&self, buffers: [&TBuf; 4]) -> [TCrc; 4] {
237        match self.0 {
238            Backend::Software => crc32c_slice8_4x(buffers),
239            Backend::Hardware => unsafe { crc32c_hardware_4x(buffers) },
240        }
241    }
242}
243
244impl Default for Crc32C {
245    fn default() -> Self {
246        Self::new()
247    }
248}
249
250/// Polynomial in refelected form (little endian) over `0x1EDC6F41`, used by CRC32C algorithm as init value
251const CASTAGNOLI_POLYNOMIAL: TCrc = 0x82F63B78;
252
253/// Precomputed table of CRC32C values, for runtime perf by reducing computations at runtime,
254///
255/// ## Memory Footprint
256///
257/// Memory used is `0x100 * 4 * 8` i.e. `8 KiB`, which is stored in `.rodata` section
258const CRC_TABLE: [[TCrc; 0x100]; 8] = {
259    let mut table = [[0; 0x100]; 8];
260
261    // table[0]
262    let mut i = 0;
263    while i < 0x100 {
264        let mut crc = i as TCrc;
265        let mut j = 0;
266
267        while j < 8 {
268            if (crc & 1) != 0 {
269                crc = (crc >> 1) ^ CASTAGNOLI_POLYNOMIAL;
270            } else {
271                crc >>= 1;
272            }
273
274            j += 1;
275        }
276
277        table[0][i] = crc;
278        i += 1;
279    }
280
281    // derive table[1..7] from table[0]
282    let mut t = 1;
283    while t < 8 {
284        let mut n = 0;
285        while n < 0x100 {
286            let crc = table[t - 1][n];
287            table[t][n] = (crc >> 8) ^ table[0][(crc & 0xFF) as usize];
288            n += 1;
289        }
290
291        t += 1;
292    }
293
294    table
295};
296
297/// Compute CRC32C for `buf` w/ Software Castagnoli impl, to generate 32-bit crc
298///
299/// **NOTE:** Length of buffer must be a multiple of 8
300#[inline(always)]
301fn crc32c_slice8(buf: &TBuf) -> TCrc {
302    // sanity check
303    debug_assert!(buf.len() & 7 == 0);
304
305    let table = &CRC_TABLE[..];
306
307    let mut crc: TCrc = !0;
308    let mut len = buf.len();
309    let mut ptr = buf.as_ptr();
310
311    while len > 0 {
312        let word: u64 = unsafe { core::ptr::read_unaligned(ptr as *const u64) };
313        let hi = (word >> 0x20) as TCrc;
314        let w = word as TCrc;
315        let x = crc ^ w;
316
317        crc = table[7][(x & 0xFF) as usize]
318            ^ table[6][((x >> 8) & 0xFF) as usize]
319            ^ table[5][((x >> 0x10) & 0xFF) as usize]
320            ^ table[4][((x >> 0x18) & 0xFF) as usize]
321            ^ table[3][(hi & 0xFF) as usize]
322            ^ table[2][((hi >> 8) & 0xFF) as usize]
323            ^ table[1][((hi >> 0x10) & 0xFF) as usize]
324            ^ table[0][((hi >> 0x18) & 0xFF) as usize];
325
326        ptr = unsafe { ptr.add(8) };
327        len -= 8;
328    }
329
330    !crc
331}
332
333/// Compute CRC32C for two bufs in parallel using `Instruction Level Parallelism` w/ Software Castagnoli impl,
334/// to generate 32-bit crc for each buf (mapped one-to-one)
335///
336/// **NOTE:** Length of each buf must be equal and a multiple of 8
337#[inline(always)]
338fn crc32c_slice8_2x(buffers: [&TBuf; 2]) -> [TCrc; 2] {
339    let mut len = buffers[0].len();
340
341    // sanity checks
342    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
343    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
344
345    let table = &CRC_TABLE[..];
346
347    let mut crc0: TCrc = !0;
348    let mut crc1: TCrc = !0;
349
350    let mut p0 = buffers[0].as_ptr();
351    let mut p1 = buffers[1].as_ptr();
352
353    while len > 0 {
354        unsafe {
355            let w0: u64 = core::ptr::read_unaligned(p0 as *const u64);
356            let w1: u64 = core::ptr::read_unaligned(p1 as *const u64);
357
358            let lo0 = w0 as TCrc;
359            let lo1 = w1 as TCrc;
360
361            let hi0 = (w0 >> 0x20) as TCrc;
362            let hi1 = (w1 >> 0x20) as TCrc;
363
364            let x0 = crc0 ^ lo0;
365            let x1 = crc1 ^ lo1;
366
367            crc0 = table[7][(x0 & 0xFF) as usize]
368                ^ table[6][((x0 >> 8) & 0xFF) as usize]
369                ^ table[5][((x0 >> 0x10) & 0xFF) as usize]
370                ^ table[4][((x0 >> 0x18) & 0xFF) as usize]
371                ^ table[3][(hi0 & 0xFF) as usize]
372                ^ table[2][((hi0 >> 8) & 0xFF) as usize]
373                ^ table[1][((hi0 >> 0x10) & 0xFF) as usize]
374                ^ table[0][((hi0 >> 0x18) & 0xFF) as usize];
375
376            crc1 = table[7][(x1 & 0xFF) as usize]
377                ^ table[6][((x1 >> 8) & 0xFF) as usize]
378                ^ table[5][((x1 >> 0x10) & 0xFF) as usize]
379                ^ table[4][((x1 >> 0x18) & 0xFF) as usize]
380                ^ table[3][(hi1 & 0xFF) as usize]
381                ^ table[2][((hi1 >> 8) & 0xFF) as usize]
382                ^ table[1][((hi1 >> 0x10) & 0xFF) as usize]
383                ^ table[0][((hi1 >> 0x18) & 0xFF) as usize];
384
385            p0 = p0.add(8);
386            p1 = p1.add(8);
387        }
388
389        len -= 8;
390    }
391
392    [!crc0, !crc1]
393}
394
395/// Compute CRC32C for four bufs in parallel using `Instruction Level Parallelism` w/ Software Castagnoli impl,
396/// to generate 32-bit crc for each buf (mapped one-to-one)
397///
398/// **NOTE:** Length of each buf must be equal and a multiple of 8
399#[inline(always)]
400fn crc32c_slice8_4x(buffers: [&TBuf; 4]) -> [TCrc; 4] {
401    let mut len = buffers[0].len();
402
403    // sanity checks
404    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
405    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
406
407    let table = &CRC_TABLE[..];
408
409    let mut crc0: TCrc = !0;
410    let mut crc1: TCrc = !0;
411    let mut crc2: TCrc = !0;
412    let mut crc3: TCrc = !0;
413
414    let mut p0 = buffers[0].as_ptr();
415    let mut p1 = buffers[1].as_ptr();
416    let mut p2 = buffers[2].as_ptr();
417    let mut p3 = buffers[3].as_ptr();
418
419    while len > 0 {
420        unsafe {
421            let w0: u64 = core::ptr::read_unaligned(p0 as *const u64);
422            let w1: u64 = core::ptr::read_unaligned(p1 as *const u64);
423            let w2: u64 = core::ptr::read_unaligned(p2 as *const u64);
424            let w3: u64 = core::ptr::read_unaligned(p3 as *const u64);
425
426            let lo0 = w0 as TCrc;
427            let lo1 = w1 as TCrc;
428            let lo2 = w2 as TCrc;
429            let lo3 = w3 as TCrc;
430
431            let hi0 = (w0 >> 0x20) as TCrc;
432            let hi1 = (w1 >> 0x20) as TCrc;
433            let hi2 = (w2 >> 0x20) as TCrc;
434            let hi3 = (w3 >> 0x20) as TCrc;
435
436            let x0 = crc0 ^ lo0;
437            let x1 = crc1 ^ lo1;
438            let x2 = crc2 ^ lo2;
439            let x3 = crc3 ^ lo3;
440
441            crc0 = table[7][(x0 & 0xFF) as usize]
442                ^ table[6][((x0 >> 8) & 0xFF) as usize]
443                ^ table[5][((x0 >> 0x10) & 0xFF) as usize]
444                ^ table[4][((x0 >> 0x18) & 0xFF) as usize]
445                ^ table[3][(hi0 & 0xFF) as usize]
446                ^ table[2][((hi0 >> 8) & 0xFF) as usize]
447                ^ table[1][((hi0 >> 0x10) & 0xFF) as usize]
448                ^ table[0][((hi0 >> 0x18) & 0xFF) as usize];
449
450            crc1 = table[7][(x1 & 0xFF) as usize]
451                ^ table[6][((x1 >> 8) & 0xFF) as usize]
452                ^ table[5][((x1 >> 0x10) & 0xFF) as usize]
453                ^ table[4][((x1 >> 0x18) & 0xFF) as usize]
454                ^ table[3][(hi1 & 0xFF) as usize]
455                ^ table[2][((hi1 >> 8) & 0xFF) as usize]
456                ^ table[1][((hi1 >> 0x10) & 0xFF) as usize]
457                ^ table[0][((hi1 >> 0x18) & 0xFF) as usize];
458
459            crc2 = table[7][(x2 & 0xFF) as usize]
460                ^ table[6][((x2 >> 8) & 0xFF) as usize]
461                ^ table[5][((x2 >> 0x10) & 0xFF) as usize]
462                ^ table[4][((x2 >> 0x18) & 0xFF) as usize]
463                ^ table[3][(hi2 & 0xFF) as usize]
464                ^ table[2][((hi2 >> 8) & 0xFF) as usize]
465                ^ table[1][((hi2 >> 0x10) & 0xFF) as usize]
466                ^ table[0][((hi2 >> 0x18) & 0xFF) as usize];
467
468            crc3 = table[7][(x3 & 0xFF) as usize]
469                ^ table[6][((x3 >> 8) & 0xFF) as usize]
470                ^ table[5][((x3 >> 0x10) & 0xFF) as usize]
471                ^ table[4][((x3 >> 0x18) & 0xFF) as usize]
472                ^ table[3][(hi3 & 0xFF) as usize]
473                ^ table[2][((hi3 >> 8) & 0xFF) as usize]
474                ^ table[1][((hi3 >> 0x10) & 0xFF) as usize]
475                ^ table[0][((hi3 >> 0x18) & 0xFF) as usize];
476
477            p0 = p0.add(8);
478            p1 = p1.add(8);
479            p2 = p2.add(8);
480            p3 = p3.add(8);
481        }
482
483        len -= 8;
484    }
485
486    [!crc0, !crc1, !crc2, !crc3]
487}
488
489/// Compute CRC32C w/ `sse4.2` SIMD ISA, to generate 32-bit crc
490///
491/// **NOTE:** Length of `buf` must be a multiple of 8
492#[cfg(target_arch = "x86_64")]
493#[target_feature(enable = "sse4.2")]
494unsafe fn crc32c_hardware(buf: &TBuf) -> TCrc {
495    // sanity check
496    debug_assert!(buf.len() & 7 == 0, "bytes_buf must be 8 bytes aligned");
497
498    let mut crc: u64 = (!0u32) as u64;
499    let mut len = buf.len();
500    let mut ptr = buf.as_ptr();
501
502    while len > 0 {
503        let word = core::ptr::read_unaligned(ptr as *const u64);
504        crc = core::arch::x86_64::_mm_crc32_u64(crc, word);
505        ptr = ptr.add(8);
506
507        len -= 8;
508    }
509
510    (!crc) as TCrc
511}
512
513/// Compute CRC32C for two bufs in parallel using `Instruction Level Parallelism` w/ `sse4.2` SIMD ISA,
514/// to generate 32-bit crc for each buf (mapped one-to-one)
515///
516/// **NOTE:** Length of each buf must be equal and a multiple of 8
517#[cfg(target_arch = "x86_64")]
518#[target_feature(enable = "sse4.2")]
519unsafe fn crc32c_hardware_2x(buffers: [&TBuf; 2]) -> [TCrc; 2] {
520    let mut len = buffers[0].len();
521
522    // sanity checks
523    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
524    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
525
526    let mut c0: u64 = (!0u32) as u64;
527    let mut c1: u64 = (!0u32) as u64;
528
529    let mut p0 = buffers[0].as_ptr();
530    let mut p1 = buffers[1].as_ptr();
531
532    while len > 0 {
533        let w0 = core::ptr::read_unaligned(p0 as *const u64);
534        let w1 = core::ptr::read_unaligned(p1 as *const u64);
535
536        c0 = core::arch::x86_64::_mm_crc32_u64(c0, w0);
537        c1 = core::arch::x86_64::_mm_crc32_u64(c1, w1);
538
539        p0 = p0.add(8);
540        p1 = p1.add(8);
541        len -= 8;
542    }
543
544    [!(c0 as TCrc), !(c1 as TCrc)]
545}
546
547/// Compute CRC32C for four bufs in parallel using `Instruction Level Parallelism` w/ `sse4.2` SIMD ISA,
548/// to generate 32-bit crc for each buf (mapped one-to-one)
549///
550/// **NOTE:** Length of each buffer must be equal and a multiple of 8
551#[cfg(target_arch = "x86_64")]
552#[target_feature(enable = "sse4.2")]
553unsafe fn crc32c_hardware_4x(buffers: [&TBuf; 4]) -> [TCrc; 4] {
554    let mut len = buffers[0].len();
555
556    // sanity checks
557    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
558    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
559
560    let mut c0: u64 = (!0u32) as u64;
561    let mut c1: u64 = (!0u32) as u64;
562    let mut c2: u64 = (!0u32) as u64;
563    let mut c3: u64 = (!0u32) as u64;
564
565    let mut p0 = buffers[0].as_ptr();
566    let mut p1 = buffers[1].as_ptr();
567    let mut p2 = buffers[2].as_ptr();
568    let mut p3 = buffers[3].as_ptr();
569
570    while len > 0 {
571        let w0 = core::ptr::read_unaligned(p0 as *const u64);
572        let w1 = core::ptr::read_unaligned(p1 as *const u64);
573        let w2 = core::ptr::read_unaligned(p2 as *const u64);
574        let w3 = core::ptr::read_unaligned(p3 as *const u64);
575
576        c0 = core::arch::x86_64::_mm_crc32_u64(c0, w0);
577        c1 = core::arch::x86_64::_mm_crc32_u64(c1, w1);
578        c2 = core::arch::x86_64::_mm_crc32_u64(c2, w2);
579        c3 = core::arch::x86_64::_mm_crc32_u64(c3, w3);
580
581        p0 = p0.add(8);
582        p1 = p1.add(8);
583        p2 = p2.add(8);
584        p3 = p3.add(8);
585
586        len -= 8;
587    }
588
589    [!(c0 as TCrc), !(c1 as TCrc), !(c2 as TCrc), !(c3 as TCrc)]
590}
591
592/// Compute CRC32C w/ ArmV8 `crc` SIMD instruction, to generate 32-bit crc
593///
594/// **NOTE:** Length of `buf` must be a multiple of 8
595#[cfg(target_arch = "aarch64")]
596#[target_feature(enable = "crc")]
597unsafe fn crc32c_hardware(buf: &TBuf) -> TCrc {
598    // sanity check
599    debug_assert!(buf.len() & 7 == 0, "bytes_buf must be 8 bytes aligned");
600
601    let mut crc: TCrc = !0;
602    let mut len = buf.len();
603    let mut ptr = buf.as_ptr();
604
605    while len > 0 {
606        let word: u64 = core::ptr::read_unaligned(ptr as *const u64);
607        crc = core::arch::aarch64::__crc32cd(crc, word);
608        ptr = ptr.add(8);
609
610        len -= 8;
611    }
612
613    !crc
614}
615
616/// Compute CRC32C for two bufs in parallel using `Instruction Level Parallelism` w/ ArmV8 `crc` SIMD instruction,
617/// to generate 32-bit crc for each buf (mapped one-to-one)
618///
619/// **NOTE:** Length of each buf must be equal and a multiple of 8
620#[cfg(target_arch = "aarch64")]
621#[target_feature(enable = "crc")]
622unsafe fn crc32c_hardware_2x(buffers: [&TBuf; 2]) -> [TCrc; 2] {
623    let mut len = buffers[0].len();
624
625    // sanity checks
626    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
627    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
628
629    let mut c0: TCrc = !0;
630    let mut c1: TCrc = !0;
631
632    let mut p0 = buffers[0].as_ptr();
633    let mut p1 = buffers[1].as_ptr();
634
635    while len > 0 {
636        let w0: u64 = core::ptr::read_unaligned(p0 as *const u64);
637        let w1: u64 = core::ptr::read_unaligned(p1 as *const u64);
638
639        c0 = core::arch::aarch64::__crc32cd(c0, w0);
640        c1 = core::arch::aarch64::__crc32cd(c1, w1);
641
642        p0 = p0.add(8);
643        p1 = p1.add(8);
644
645        len -= 8;
646    }
647
648    [!c0, !c1]
649}
650
651/// Compute CRC32C for four bufs in parallel using `Instruction Level Parallelism` w/ ArmV8 `crc` SIMD instruction,
652/// to generate 32-bit crc for each buf (mapped one-to-one)
653///
654/// **NOTE:** Length of each buf must be equal and a multiple of 8
655#[cfg(target_arch = "aarch64")]
656#[target_feature(enable = "crc")]
657unsafe fn crc32c_hardware_4x(buffers: [&TBuf; 4]) -> [TCrc; 4] {
658    let mut len = buffers[0].len();
659
660    // sanity checks
661    debug_assert!(len & 7 == 0, "bytes_buf must be 8 bytes aligned");
662    debug_assert!(buffers.iter().all(|b| b.len() == len), "each buf in bytes_bufs must be of same length");
663
664    let mut c0: TCrc = !0;
665    let mut c1: TCrc = !0;
666    let mut c2: TCrc = !0;
667    let mut c3: TCrc = !0;
668
669    let mut p0 = buffers[0].as_ptr();
670    let mut p1 = buffers[1].as_ptr();
671    let mut p2 = buffers[2].as_ptr();
672    let mut p3 = buffers[3].as_ptr();
673
674    while len > 0 {
675        let w0: u64 = core::ptr::read_unaligned(p0 as *const u64);
676        let w1: u64 = core::ptr::read_unaligned(p1 as *const u64);
677        let w2: u64 = core::ptr::read_unaligned(p2 as *const u64);
678        let w3: u64 = core::ptr::read_unaligned(p3 as *const u64);
679
680        c0 = core::arch::aarch64::__crc32cd(c0, w0);
681        c1 = core::arch::aarch64::__crc32cd(c1, w1);
682        c2 = core::arch::aarch64::__crc32cd(c2, w2);
683        c3 = core::arch::aarch64::__crc32cd(c3, w3);
684
685        p0 = p0.add(8);
686        p1 = p1.add(8);
687        p2 = p2.add(8);
688        p3 = p3.add(8);
689
690        len -= 8;
691    }
692
693    [!c0, !c1, !c2, !c3]
694}
695
696#[cfg(test)]
697mod tests {
698    use super::*;
699
700    fn make_buf(len: usize, seed: u8) -> Vec<u8> {
701        (0..len).map(|i| seed.wrapping_add(i as u8)).collect()
702    }
703
704    #[inline]
705    fn is_simd_available() -> bool {
706        #[cfg(target_arch = "x86_64")]
707        return std::is_x86_feature_detected!("sse4.2");
708
709        #[cfg(target_arch = "aarch64")]
710        return std::arch::is_aarch64_feature_detected!("crc");
711    }
712
713    mod public_api {
714        use super::*;
715
716        #[test]
717        fn ok_known_crc_vectors() {
718            let crc = Crc32C::default();
719            assert_eq!(crc.crc(b"12345678"), 0x6087809A);
720        }
721
722        #[test]
723        fn ok_crc_is_deterministic() {
724            let crc = Crc32C::default();
725
726            let buf = make_buf(0x1000, 0x77);
727
728            let a = crc.crc(&buf);
729            let b = crc.crc(&buf);
730            let c = crc.crc(&buf);
731
732            assert_eq!(a, b);
733            assert_eq!(b, c);
734        }
735
736        #[test]
737        fn ok_different_buffers_have_different_crc() {
738            let crc = Crc32C::default();
739
740            let a = make_buf(0x1000, 0x10);
741            let b = make_buf(0x1000, 0x11);
742
743            assert_ne!(crc.crc(&a), crc.crc(&b));
744        }
745
746        #[test]
747        fn ok_parallel_crc_matches_single() {
748            let crc = Crc32C::default();
749
750            let b0 = make_buf(0x1000, 1);
751            let b1 = make_buf(0x1000, 2);
752            let b2 = make_buf(0x1000, 3);
753            let b3 = make_buf(0x1000, 4);
754
755            let s0 = crc.crc(&b0);
756            let s1 = crc.crc(&b1);
757            let s2 = crc.crc(&b2);
758            let s3 = crc.crc(&b3);
759
760            let p2 = crc.crc_2x([&b0, &b1]);
761            let p4 = crc.crc_4x([&b0, &b1, &b2, &b3]);
762
763            assert_eq!([s0, s1], p2);
764            assert_eq!([s0, s1, s2, s3], p4);
765        }
766
767        #[test]
768        fn ok_zero_buffer_crc() {
769            let crc = Crc32C::default();
770
771            let buf = vec![0u8; 0x1000];
772            let a = crc.crc(&buf);
773            let b = crc.crc(&buf);
774
775            assert_eq!(a, b);
776        }
777    }
778
779    mod hw_sw_consistency {
780        use super::*;
781
782        #[test]
783        fn ok_crc_single_buf() {
784            if !is_simd_available() {
785                return;
786            }
787
788            let buf = make_buf(0x1000, 0x0A);
789
790            let sw = crc32c_slice8(&buf);
791            let hw = unsafe { crc32c_hardware(&buf) };
792            assert_eq!(sw, hw);
793        }
794
795        #[test]
796        fn ok_crc_random_bufs() {
797            for seed in 0..0x20u8 {
798                let buf = make_buf(0x2000, seed);
799
800                let sw = crc32c_slice8(&buf);
801                let hw = unsafe { crc32c_hardware(&buf) };
802                assert_eq!(sw, hw);
803            }
804        }
805
806        #[test]
807        fn ok_crc_buf_2x() {
808            if !is_simd_available() {
809                return;
810            }
811
812            let b0 = make_buf(0x1000, 0x0A);
813            let b1 = make_buf(0x1000, 0x0B);
814
815            let sw = crc32c_slice8_2x([&b0, &b1]);
816            let hw = unsafe { crc32c_hardware_2x([&b0, &b1]) };
817            assert_eq!(sw, hw);
818        }
819
820        #[test]
821        fn ok_crc_buf_4x() {
822            if !is_simd_available() {
823                return;
824            }
825
826            let b0 = make_buf(0x1000, 0x0A);
827            let b1 = make_buf(0x1000, 0x0B);
828            let b2 = make_buf(0x1000, 0x0C);
829            let b3 = make_buf(0x1000, 0x0D);
830
831            let sw = crc32c_slice8_4x([&b0, &b1, &b2, &b3]);
832            let hw = unsafe { crc32c_hardware_4x([&b0, &b1, &b2, &b3]) };
833            assert_eq!(sw, hw);
834        }
835    }
836
837    mod corruption_detection {
838        use super::*;
839
840        #[test]
841        fn ok_single_bit_flip_changes_crc() {
842            let crc = Crc32C::default();
843
844            let mut buf = make_buf(0x1000, 0xAA);
845            let original = crc.crc(&buf);
846            buf[0x1A] ^= 1; // manually corrupt a single bit
847
848            let corrupted = crc.crc(&buf);
849            assert_ne!(original, corrupted);
850        }
851
852        #[test]
853        fn ok_multiple_bit_flips_change_crc() {
854            let crc = Crc32C::default();
855
856            let mut buf = make_buf(0x2000, 0x11);
857            let original = crc.crc(&buf);
858
859            buf[0] ^= 0b0000_0001;
860            buf[0x12A] ^= 0b1000_0000;
861            buf[0x23B] ^= 0b0001_0000;
862            buf[0x100B] ^= 0b0100_0000;
863
864            let corrupted = crc.crc(&buf);
865            assert_ne!(original, corrupted);
866        }
867
868        #[test]
869        fn ok_detects_torn_write_simulation() {
870            let crc = Crc32C::default();
871
872            let mut buf = make_buf(0x1000, 0x55);
873            let original = crc.crc(&buf);
874
875            // simulating partial overwrites
876            for b in &mut buf[0x80..0x100] {
877                *b = 0;
878            }
879
880            let corrupted = crc.crc(&buf);
881            assert_ne!(original, corrupted);
882        }
883
884        #[test]
885        fn ok_detects_random_corruption() {
886            let crc = Crc32C::default();
887
888            let mut buf = make_buf(0x1000, 0x42);
889            let original = crc.crc(&buf);
890
891            for i in (0..buf.len()).step_by(0x5F) {
892                buf[i] ^= 0xFF;
893            }
894
895            let corrupted = crc.crc(&buf);
896            assert_ne!(original, corrupted);
897        }
898
899        #[test]
900        fn ok_every_bit_flip_changes_crc() {
901            let crc = Crc32C::default();
902
903            let mut buf = make_buf(0x40, 0xAB);
904            let base = crc.crc(&buf);
905
906            for i in 0..buf.len() {
907                for bit in 0..8 {
908                    buf[i] ^= 1 << bit;
909                    assert_ne!(base, crc.crc(&buf));
910                    buf[i] ^= 1 << bit;
911                }
912            }
913        }
914    }
915}