1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.
#![allow(dead_code)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
use crate::enums::Reflector;
use crate::CrcParams;
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
use crate::structs::CrcState;
use core::ops::BitXor;
/// Marker trait describing the bit width of a CRC algorithm (e.g. 32 or 64).
///
/// Implementors pin down the width in bits and the natural unsigned integer
/// type used to hold a checksum of that width.
pub trait CrcWidth {
/// The width in bits
const WIDTH: u32;
/// The natural value type for this width (must support XOR, the core CRC operation)
type Value: Copy + BitXor<Output = Self::Value>;
}
/// Internal strategy trait for CRC computation back-ends.
///
/// Implementors only need to provide `calculate`; `update` and `checksum`
/// are derived from it.
pub(crate) trait CrcCalculator {
/// Continue a CRC over `data` from a previously returned `state`.
/// Note: does NOT apply `params.xorout`; use `checksum` for a finalized value.
fn update(data: &[u8], state: u64, params: &CrcParams) -> u64 {
Self::calculate(state, data, params)
}
/// One-shot checksum of `data`: seeds with `params.init` and applies the
/// final `params.xorout` mask to the result.
fn checksum(data: &[u8], params: &CrcParams) -> u64 {
Self::calculate(params.init, data, params) ^ params.xorout
}
/// Core routine: fold `data` into `state` using the settings in `params`,
/// returning the new (un-xored) state.
fn calculate(state: u64, data: &[u8], params: &CrcParams) -> u64;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
/// Trait defining architecture-specific SIMD operations for CRC calculation.
///
/// Each back-end (SSE/AVX on x86, NEON on aarch64) implements these primitives
/// over its native 128-bit vector type; the generic folding algorithm is then
/// written once in terms of this trait.
pub trait ArchOps: Sized + Copy + Clone {
    /// The SIMD vector type used by this architecture
    type Vector;
    /// Process aligned blocks using potentially accelerated SIMD operations
    ///
    /// Returns true if the operation was handled by the accelerated path (for example,
    /// using VPCLMULQDQ)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn process_enhanced_simd_blocks<W: EnhancedCrcWidth>(
        &self,
        _state: &mut CrcState<Self::Vector>,
        _first: &[Self::Vector; 8],
        _rest: &[[Self::Vector; 8]],
        _reflector: &Reflector<Self::Vector>,
        _keys: &[u64; 23],
    ) -> bool
    where
        Self::Vector: Copy,
    {
        // Default implementation just returns false
        // indicating the non-enhanced algorithm should be used
        false
    }
    /// Create a SIMD vector from a u64 pair
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn create_vector_from_u64_pair(
        &self,
        high: u64,
        low: u64,
        reflected: bool,
    ) -> Self::Vector;
    /// Create a SIMD vector from a u64 pair without reflection
    ///
    /// TODO: I have no idea (yet) why CRC-32 doesn't use reflection, but CRC-64 does.
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn create_vector_from_u64_pair_non_reflected(&self, high: u64, low: u64)
    -> Self::Vector;
    /// Create a SIMD vector with a single u64 value
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn create_vector_from_u64(&self, value: u64, high: bool) -> Self::Vector;
    /// Extract two u64 values from a SIMD vector
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn extract_u64s(&self, vector: Self::Vector) -> [u64; 2];
    /// Extract two polynomial values (for carryless multiplication)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn extract_poly64s(&self, vector: Self::Vector) -> [u64; 2];
    /// XOR two SIMD vectors
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn xor_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// Load bytes from memory into a SIMD vector
    ///
    /// # Safety
    /// May use native CPU features; `ptr` must be valid for a 16-byte read
    unsafe fn load_bytes(&self, ptr: *const u8) -> Self::Vector;
    /// Load aligned bytes from memory
    ///
    /// # Safety
    /// May use native CPU features; `ptr` must be valid and suitably aligned
    /// for a 16-byte read
    unsafe fn load_aligned(&self, ptr: *const [u64; 2]) -> Self::Vector;
    /// Shuffle/permute bytes according to a mask
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shuffle_bytes(&self, data: Self::Vector, mask: Self::Vector) -> Self::Vector;
    /// Blend two vectors using a mask (select from a or b based on mask bits)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn blend_vectors(
        &self,
        a: Self::Vector,
        b: Self::Vector,
        mask: Self::Vector,
    ) -> Self::Vector;
    /// Shift a vector left by 8 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_left_8(&self, vector: Self::Vector) -> Self::Vector;
    /// Create a vector with all bytes set to the same value
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn set_all_bytes(&self, value: u8) -> Self::Vector;
    /// Create a comparison mask (for blending operations)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn create_compare_mask(&self, vector: Self::Vector) -> Self::Vector;
    /// AND two vectors
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn and_vectors(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 32 bits (4 bytes)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_32(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector left by 32 bits (4 bytes)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_left_32(&self, vector: Self::Vector) -> Self::Vector;
    /// Create a SIMD vector with a single u32 value
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn create_vector_from_u32(&self, value: u32, high: bool) -> Self::Vector;
    /// Shift a vector left by 4 bytes (32 bits)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_left_4(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 4 bytes (32 bits)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_4(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 8 bytes (64 bits)
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_8(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 5 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_5(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 6 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_6(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 7 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_7(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector right by 12 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_right_12(&self, vector: Self::Vector) -> Self::Vector;
    /// Shift a vector left by 12 bytes
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn shift_left_12(&self, vector: Self::Vector) -> Self::Vector;
    /// Perform carryless multiplication with immediate value 0x00 (low 64 bits of both vectors)
    ///
    /// # Safety
    /// May use native CPU features (e.g. PCLMULQDQ / PMULL)
    unsafe fn carryless_mul_00(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// Perform carryless multiplication with immediate value 0x01 (low 64 bits of a, high 64 bits of b)
    ///
    /// # Safety
    /// May use native CPU features (e.g. PCLMULQDQ / PMULL)
    unsafe fn carryless_mul_01(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// Perform carryless multiplication with immediate value 0x10 (high 64 bits of a, low 64 bits of b)
    ///
    /// # Safety
    /// May use native CPU features (e.g. PCLMULQDQ / PMULL)
    unsafe fn carryless_mul_10(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// Perform carryless multiplication with immediate value 0x11 (high 64 bits of both vectors)
    ///
    /// # Safety
    /// May use native CPU features (e.g. PCLMULQDQ / PMULL)
    unsafe fn carryless_mul_11(&self, a: Self::Vector, b: Self::Vector) -> Self::Vector;
    /// XOR three vectors together: a XOR b XOR c
    /// Uses native XOR3 instructions when available, falls back to two XOR operations otherwise
    ///
    /// # Safety
    /// May use native CPU features
    unsafe fn xor3_vectors(
        &self,
        a: Self::Vector,
        b: Self::Vector,
        c: Self::Vector,
    ) -> Self::Vector;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))]
/// Enhanced CrcWidth trait with additional operations for generic CRC implementation
///
/// Extends [`CrcWidth`] with the width-specific SIMD steps (state setup, folding,
/// Barrett reduction, result extraction) that the generic algorithm delegates to.
pub trait EnhancedCrcWidth: CrcWidth {
/// Load constants specific to CRC width
///
/// Returns four `[high, low]` u64 pairs for the given bit-order
/// (presumably folding coefficients — confirm against implementations).
fn load_constants(reflected: bool) -> [[u64; 2]; 4];
/// Create a CRC state with the initial value positioned correctly for the width
///
/// # Safety
/// May use native CPU features
unsafe fn create_state<T: ArchOps>(
value: Self::Value,
reflected: bool,
ops: &T,
) -> CrcState<T::Vector>
where
T::Vector: Copy;
/// Extract the final CRC result from a SIMD vector
///
/// # Safety
/// May use native CPU features
unsafe fn extract_result<T: ArchOps>(
vector: T::Vector,
reflected: bool,
ops: &T,
) -> Self::Value
where
T::Vector: Copy;
/// Perform width-specific folding operations using CLMUL and two XOR operations (or one XOR3)
///
/// Folds `data_to_xor` into `state` using `coefficient` as the CLMUL multiplier.
///
/// # Safety
/// May use native CPU features
unsafe fn fold_16<T: ArchOps>(
state: &mut CrcState<T::Vector>,
coefficient: T::Vector,
data_to_xor: T::Vector,
ops: &T,
) where
T::Vector: Copy;
/// Fold width-specific number of bytes
///
/// `high`/`low` are the 64-bit halves of the folding coefficient.
///
/// # Safety
/// May use native CPU features
unsafe fn fold_width<T: ArchOps>(state: &mut CrcState<T::Vector>, high: u64, low: u64, ops: &T)
where
T::Vector: Copy;
/// Width-specific Barrett reduction
///
/// Reduces `state` modulo the polynomial `poly` using the precomputed
/// Barrett constant `mu`, yielding the final width-sized value.
///
/// # Safety
/// May use native CPU features
unsafe fn barrett_reduction<T: ArchOps>(
state: &CrcState<T::Vector>,
poly: u64,
mu: u64,
ops: &T,
) -> Self::Value
where
T::Vector: Copy;
/// Create a coefficient vector for folding operations
///
/// # Safety
/// May use native CPU features
unsafe fn create_coefficient<T: ArchOps>(
high: u64,
low: u64,
reflected: bool,
ops: &T,
) -> T::Vector
where
T::Vector: Copy;
/// Perform final reduction for the specific width
///
/// `keys` is the precomputed 23-entry key table (same layout as passed to
/// `ArchOps::process_enhanced_simd_blocks`).
///
/// # Safety
/// May use native CPU features
unsafe fn perform_final_reduction<T: ArchOps>(
state: T::Vector,
reflected: bool,
keys: &[u64; 23],
ops: &T,
) -> Self::Value
where
T::Vector: Copy;
/// Get the appropriate shuffle table pointer and offset for handling last bytes
///
/// Returns a raw pointer into a static shuffle table plus an offset; callers
/// must only read within that table's bounds.
fn get_last_bytes_table_ptr(reflected: bool, remaining_len: usize) -> (*const u8, usize);
}