crc_fast/crc32/algorithm.rs

1// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
2
3//! This module provides the CRC-32 algorithm implementations for areas where it differs from
4//! CRC-64.
5
6#![cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
7
8use crate::algorithm;
9use crate::consts::CRC_CHUNK_SIZE;
10use crate::crc32::consts::{PSHUFB_SHF_TABLE_FORWARD, PSHUFB_SHF_TABLE_REVERSE, SIMD_CONSTANTS};
11use crate::enums::Reflector;
12use crate::structs::CrcState;
13use crate::traits::{ArchOps, EnhancedCrcWidth};
14
impl EnhancedCrcWidth for crate::structs::Width32 {
    /// Loads the SIMD constants used by the CRC-32 kernels.
    ///
    /// Layout is `[smask, mask1, mask2, unused]`; only `mask2` differs between
    /// the reflected and forward (non-reflected) variants.
    #[inline(always)]
    fn load_constants(reflected: bool) -> [[u64; 2]; 4] {
        if reflected {
            // Constants for reflected CRC-32
            [
                [0x08090a0b0c0d0e0f, 0x0001020304050607], // smask: byte-order shuffle control
                [0x8080808080808080, 0x8080808080808080], // mask1: high bit set in every byte
                [0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF], // mask2 reverse: clears the low 32 bits
                [0x0000000000000000, 0x0000000000000000], // unused in CRC32
            ]
        } else {
            // Constants for non-reflected CRC-32
            [
                [0x08090a0b0c0d0e0f, 0x0001020304050607], // smask
                [0x8080808080808080, 0x8080808080808080], // mask1
                [0xffffffffffffffff, 0x00000000ffffffff], // mask2 forward: clears the high 32 bits
                [0x0000000000000000, 0x0000000000000000], // unused in CRC32
            ]
        }
    }

    /// Seeds a 128-bit SIMD state vector with the initial 32-bit CRC value.
    ///
    /// The lane the value lands in depends on bit order: low 32 bits for
    /// reflected mode, high 32 bits of the vector for forward mode (the
    /// second argument to `create_vector_from_u32` selects the placement).
    #[inline(always)]
    unsafe fn create_state<T: ArchOps>(
        value: Self::Value,
        reflected: bool,
        ops: &T,
    ) -> CrcState<T::Vector>
    where
        T::Vector: Copy,
    {
        let vector = if reflected {
            // For reflected mode, state goes in the low 32 bits
            ops.create_vector_from_u32(value, false)
        } else {
            // For non-reflected mode, state goes in high 32 bits of the
            // high 64-bit part of the 128-bit register (need to shift 12 bytes)
            ops.create_vector_from_u32(value, true)
        };

        CrcState {
            value: vector,
            reflected,
        }
    }

    /// Pulls the final 32-bit CRC back out of a 128-bit state vector.
    ///
    /// Inverse of the placement performed by [`Self::create_state`].
    #[inline(always)]
    unsafe fn extract_result<T: ArchOps>(vector: T::Vector, reflected: bool, ops: &T) -> Self::Value
    where
        T::Vector: Copy,
    {
        // Extract both 64-bit lanes from the vector: u64s[0] is the low lane,
        // u64s[1] the high lane.
        let u64s = ops.extract_u64s(vector);

        if reflected {
            // In reflected mode, the result is in the low 32 bits of the low 64-bit lane
            u64s[0] as u32
        } else {
            // In non-reflected mode, the result is in the high 32 bits of the
            // high 64-bit lane (u64s[1] >> 32)
            (u64s[1] >> 32) as u32
        }
    }

    /// Folds one 16-byte block into the running state:
    /// `state = clmul(state_lo) XOR clmul(state_hi) XOR data`.
    ///
    /// `coeff` holds the two precomputed folding constants, one per 64-bit
    /// lane; which lane pairing is multiplied depends on bit order.
    #[inline(always)]
    unsafe fn fold_16<T: ArchOps>(
        state: &mut CrcState<T::Vector>,
        coeff: T::Vector,
        data_to_xor: T::Vector,
        ops: &T,
    ) where
        T::Vector: Copy,
    {
        // For CRC-32, we need to handle the 32-bit sections of each 64-bit value
        let (h, l) = if state.reflected {
            // In reflected mode, multiply using lower and upper 32 bits
            (
                ops.carryless_mul_10(state.value, coeff),
                ops.carryless_mul_01(state.value, coeff),
            )
        } else {
            // For non-reflected mode, we multiply using upper and lower 32 bits
            (
                ops.carryless_mul_00(state.value, coeff), // 000h in assembly
                ops.carryless_mul_11(state.value, coeff), // 011h in assembly
            )
        };

        // Three-way XOR combines both products with the incoming data block.
        state.value = ops.xor3_vectors(h, l, data_to_xor);
    }

    /// CRC-32 specific reduction of the 128-bit state: folds 16 bytes down to
    /// 8, then 8 down to 4, using the `low` and `high` folding constants.
    /// (The original header said "8 bytes to 4 bytes"; the body performs both
    /// steps — see the call site in `perform_final_reduction`.)
    #[inline(always)]
    unsafe fn fold_width<T: ArchOps>(state: &mut CrcState<T::Vector>, high: u64, low: u64, ops: &T)
    where
        T::Vector: Copy,
    {
        // Each constant is isolated in its own vector lane for the clmuls below.
        let coeff_vector_low = ops.create_vector_from_u64_pair_non_reflected(0, low);
        let coeff_vector_high = ops.create_vector_from_u64_pair_non_reflected(high, 0);

        // Step 1: fold 16 bytes -> 8 bytes. The shift direction mirrors the
        // bit order (reflected folds toward the low end, forward toward the high end).
        state.value = if state.reflected {
            ops.xor_vectors(
                ops.carryless_mul_00(state.value, coeff_vector_low),
                ops.shift_right_8(state.value),
            )
        } else {
            ops.xor_vectors(
                ops.carryless_mul_01(state.value, coeff_vector_low),
                ops.shift_left_8(state.value),
            )
        };

        // Step 2: fold 8 bytes -> 4 bytes. mask2 keeps the part of the state
        // that is passed through unchanged; the remaining 32 bits are clmul'd.
        let (clmul, masked) = if state.reflected {
            let mask2 = ops.load_aligned(&[0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF]);
            let masked = ops.and_vectors(state.value, mask2);
            let shifted = ops.shift_left_12(state.value);
            let clmul = ops.carryless_mul_11(shifted, coeff_vector_high);

            (clmul, masked)
        } else {
            let mask2 = ops.load_aligned(&[0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF]);
            let masked = ops.and_vectors(state.value, mask2);
            let shifted = ops.shift_right_12(state.value);
            let clmul = ops.carryless_mul_10(shifted, coeff_vector_high);

            (clmul, masked)
        };

        state.value = ops.xor_vectors(clmul, masked);
    }

    /// Barrett reduction: reduces the folded state modulo the CRC polynomial
    /// to produce the final 32-bit value.
    ///
    /// `mu` is the precomputed reciprocal constant (floor(x^64 / P)); `poly`
    /// is the polynomial itself. Both are packed into one vector, then two
    /// carryless multiplies and an XOR recover the remainder.
    #[inline(always)]
    unsafe fn barrett_reduction<T: ArchOps>(
        state: &CrcState<T::Vector>,
        poly: u64,
        mu: u64,
        ops: &T,
    ) -> Self::Value
    where
        T::Vector: Copy,
    {
        let x = state.value;
        // poly in the high 64-bit lane, mu in the low lane.
        let mu_poly = ops.create_vector_from_u64_pair_non_reflected(poly, mu);

        if state.reflected {
            let clmul1 = ops.carryless_mul_00(x, mu_poly);
            let clmul2 = ops.carryless_mul_10(clmul1, mu_poly);
            let xorred = ops.xor_vectors(x, clmul2);

            // Remainder lands in the low 32 bits of the high 64-bit lane.
            ops.extract_u64s(xorred)[1] as u32
        } else {
            // Forward mode interleaves 4-byte shifts to keep the operands aligned.
            let clmul1 = ops.shift_left_4(ops.carryless_mul_01(x, mu_poly));
            let clmul2_shifted = ops.shift_left_4(ops.carryless_mul_11(clmul1, mu_poly));
            let final_xor = ops.xor_vectors(clmul2_shifted, x);

            // Remainder lands in the high 32 bits of the low 64-bit lane.
            (ops.extract_u64s(final_xor)[0] >> 32) as u32
        }
    }

    /// Packs a pair of folding constants into a vector.
    ///
    /// CRC-32 always uses the non-reflected pair layout, regardless of the
    /// algorithm's bit order (hence `_reflected` is ignored).
    #[inline(always)]
    unsafe fn create_coefficient<T: ArchOps>(
        high: u64,
        low: u64,
        _reflected: bool,
        ops: &T,
    ) -> T::Vector
    where
        T::Vector: Copy,
    {
        // CRC-32 uses non-reflected coefficient creation
        ops.create_vector_from_u64_pair_non_reflected(high, low)
    }

    /// Final reduction of a single 128-bit state vector to the 32-bit CRC:
    /// fold down with keys[6]/keys[5], then Barrett-reduce with
    /// poly = keys[8], mu = keys[7].
    #[inline(always)]
    unsafe fn perform_final_reduction<T: ArchOps>(
        state: T::Vector,
        reflected: bool,
        keys: [u64; 23],
        ops: &T,
    ) -> Self::Value
    where
        T::Vector: Copy,
    {
        let mut state = CrcState {
            value: state,
            reflected,
        };

        // Fold 16 bytes into 8 bytes, then 8 bytes into 4 bytes
        Self::fold_width(&mut state, keys[6], keys[5], ops);

        // Perform Barrett reduction to finalize
        Self::barrett_reduction(&state, keys[8], keys[7], ops)
    }

    /// Returns a pointer into the PSHUFB shift table plus the byte offset to
    /// use for handling a partial (`remaining_len < 16`) trailing block.
    #[inline(always)]
    fn get_last_bytes_table_ptr(reflected: bool, remaining_len: usize) -> (*const u8, usize) {
        use crate::crc32::consts::{PSHUFB_SHF_TABLE_FORWARD, PSHUFB_SHF_TABLE_REVERSE};

        if reflected {
            // For reflected mode, the offset grows with the remaining length
            let base_ptr = &PSHUFB_SHF_TABLE_REVERSE as *const _ as *const u8;
            let offset = remaining_len;

            (base_ptr, offset)
        } else {
            // For non-reflected mode, the offset counts down from a full block
            let base_ptr = &PSHUFB_SHF_TABLE_FORWARD as *const _ as *const u8;
            let offset = 16 - remaining_len;

            (base_ptr, offset)
        }
    }
}
228
229/// Process inputs smaller than 16 bytes
230#[inline]
231#[cfg_attr(
232    any(target_arch = "x86", target_arch = "x86_64"),
233    target_feature(enable = "ssse3,sse4.1,pclmulqdq")
234)]
235#[cfg_attr(target_arch = "aarch64", target_feature(enable = "aes"))]
236pub(crate) unsafe fn process_0_to_15<T: ArchOps, W: EnhancedCrcWidth>(
237    data: &[u8],
238    state: &mut CrcState<T::Vector>,
239    reflector: &Reflector<T::Vector>,
240    keys: [u64; 23],
241    ops: &T,
242) -> W::Value
243where
244    T::Vector: Copy,
245{
246    let mut buffer = [0u8; CRC_CHUNK_SIZE];
247    if state.reflected {
248        buffer[CRC_CHUNK_SIZE - data.len()..].copy_from_slice(data);
249    } else {
250        buffer[..data.len()].copy_from_slice(data);
251    }
252
253    let len = data.len() as i32;
254    let base = &PSHUFB_SHF_TABLE_REVERSE as *const _ as *const u8;
255
256    let xmm7 = if state.reflected {
257        let data = ops.load_bytes(buffer.as_ptr());
258        let mask1 = ops.load_aligned(&SIMD_CONSTANTS[1]);
259
260        let ptr = base.add(if len < 4 {
261            8 + len as usize
262        } else {
263            len as usize
264        });
265        let mask = ops.load_bytes(ptr);
266        let modified_mask = ops.xor_vectors(mask, mask1);
267        let shuffled_crc = ops.shuffle_bytes(state.value, modified_mask);
268
269        ops.xor_vectors(
270            if len < 4 {
271                ops.shift_right_8(data)
272            } else {
273                data
274            },
275            shuffled_crc,
276        )
277    } else {
278        let data_arr = ops.load_bytes(buffer.as_ptr());
279        let reflected_data = algorithm::reflect_bytes(reflector, data_arr, ops);
280        let data_with_crc = ops.xor_vectors(reflected_data, state.value);
281
282        if len < 4 {
283            let result = match len {
284                3 => ops.shift_right_5(data_with_crc),
285                2 => ops.shift_right_6(data_with_crc),
286                1 => ops.shift_right_7(data_with_crc),
287                _ => data_with_crc,
288            };
289
290            return W::barrett_reduction(
291                &CrcState {
292                    value: result,
293                    reflected: false,
294                },
295                keys[8],
296                keys[7],
297                ops,
298            );
299        }
300
301        let base = &PSHUFB_SHF_TABLE_FORWARD as *const _ as *const u8;
302        let ptr = base.add(16 - len as usize);
303        let x0 = ops.load_bytes(ptr);
304        let mask1 = ops.load_aligned(&SIMD_CONSTANTS[1]);
305        let x0 = ops.xor_vectors(x0, mask1);
306
307        if len < 8 {
308            ops.shuffle_bytes(data_with_crc, x0)
309        } else {
310            let mut xmm7 = ops.load_bytes(buffer.as_ptr());
311            if let Reflector::ForwardReflector { smask } = reflector {
312                xmm7 = ops.shuffle_bytes(xmm7, *smask);
313            }
314            xmm7 = ops.xor_vectors(xmm7, state.value);
315            let ptr = base.add(16 - len as usize);
316            let x0 = ops.load_bytes(ptr);
317            let xmm0 = ops.xor_vectors(x0, mask1);
318
319            ops.shuffle_bytes(xmm7, xmm0)
320        }
321    };
322
323    if len >= 4 {
324        return W::perform_final_reduction(xmm7, state.reflected, keys, ops);
325    }
326
327    let final_state = CrcState {
328        value: xmm7,
329        reflected: state.reflected,
330    };
331
332    W::barrett_reduction(&final_state, keys[8], keys[7], ops)
333}