byond_crc32/specialized/pclmulqdq.rs

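//! PCLMULQDQ-accelerated CRC-32, in the style of Intel's "Fast CRC
//! Computation for Generic Polynomials Using PCLMULQDQ Instruction" white
//! paper: input is folded 128 bytes at a time with carry-less multiplication,
//! then reduced 128b -> 64b -> 32b.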
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

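/// CRC-32 state backed by the PCLMULQDQ/SSE4.1 code path.
///
/// `new` returns `None` when the required CPU features are missing, so
/// callers can fall back to the baseline implementation. A minimal usage
/// sketch:
///
/// ```ignore
/// let mut state = State::new(crate::DEFAULT_CRC32).expect("pclmulqdq/sse4.1 unsupported");
/// state.update(b"example data");
/// let crc = state.as_u32();
/// ```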
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct State {
    state: u32,
}

impl State {
    #[cfg(not(feature = "std"))]
    pub fn new(state: u32) -> Option<Self> {
        if cfg!(all(
            target_feature = "sse4.1",
            target_feature = "pclmulqdq",
        )) {
            Some(Self { state })
        } else {
            None
        }
    }

    #[cfg(feature = "std")]
    pub fn new(state: u32) -> Option<Self> {
        if is_x86_feature_detected!("pclmulqdq") && is_x86_feature_detected!("sse4.1") {
            Some(Self { state })
        } else {
            None
        }
    }

    pub fn update(&mut self, buf: &[u8]) {
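        // SAFETY: `new` only returns a `State` after confirming `pclmulqdq`
        // and `sse4.1` support, so `calculate` may be called here.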
        self.state = unsafe { calculate(self.state, buf) }
    }

    pub fn as_u32(&self) -> u32 {
        self.state
    }

    pub fn reset(&mut self) {
        self.state = crate::DEFAULT_CRC32;
    }
}

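// Folding constants ("rk" values) for BYOND's CRC-32 polynomial. RK07/RK08
// drive the final Barrett reduction and encode the polynomial itself
// (0x1_0000_00af); the others appear to be x^N mod P(x) for the various fold
// distances, stored in the upper 32 bits.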
const RK01: u64 = 0x0029_5f23_0000_0000;
const RK02: u64 = 0xfafa_5179_0000_0000;
const RK03: u64 = 0x5cd8_6bb5_0000_0000;
const RK04: u64 = 0xaf6f_37a3_0000_0000;
const RK05: u64 = 0x0029_5f23_0000_0000;
const RK06: u64 = 0x0000_4455_0000_0000;
const RK07: u64 = 0x0000_0001_0000_00af;
const RK08: u64 = 0x0000_0001_0000_00af;
const RK09: u64 = 0x9bd5_7b5d_0000_0000;
const RK10: u64 = 0xb7a4_d764_0000_0000;
const RK11: u64 = 0x1ae0_0042_0000_0000;
const RK12: u64 = 0xe772_0be6_0000_0000;
const RK13: u64 = 0x9c7f_c8fe_0000_0000;
const RK14: u64 = 0x3885_faf8_0000_0000;
const RK15: u64 = 0xb477_ad71_0000_0000;
const RK16: u64 = 0x0ac2_ae3d_0000_0000;
const RK17: u64 = 0x5eae_9dbe_0000_0000;
const RK18: u64 = 0x784a_4838_0000_0000;
const RK19: u64 = 0x7d21_bf20_0000_0000;
const RK20: u64 = 0xfaeb_d3d3_0000_0000;

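/// Computes the CRC-32 of `data`, continuing from `crc`.
///
/// # Safety
///
/// The caller must ensure the CPU supports `pclmulqdq` and `sse4.1`
/// (as `State::new` does before handing out a `State`).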
#[target_feature(enable = "pclmulqdq", enable = "sse4.1")]
pub unsafe fn calculate(crc: u32, mut data: &[u8]) -> u32 {
    if data.len() < 16 * 8 * 2 {
        // This could be handled in intrinsics, but this seems fine for now.
        return crate::baseline::slice_by_16(crc, data);
    }

    let crc = _mm_set_epi32(crc as i32, 0x0000, 0x0000, 0x0000);
    // Shuffle mask for byte-swapping 16 bytes.
    let smask = _mm_set_epi8(
        0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
    );

    // Load initial 128B of data and XOR in the initial CRC.
    let mut x7 = get(&mut data, smask);
    let mut x6 = get(&mut data, smask);
    let mut x5 = get(&mut data, smask);
    let mut x4 = get(&mut data, smask);
    let mut x3 = get(&mut data, smask);
    let mut x2 = get(&mut data, smask);
    let mut x1 = get(&mut data, smask);
    let mut x0 = get(&mut data, smask);
    x7 = _mm_xor_si128(x7, crc);

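    // Main loop: fold 128 bytes per iteration. Each accumulator is carried
    // forward 128 bytes (via the k3/k4 constants) and XORed with the next
    // 16-byte block.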
    let k3k4 = _mm_set_epi64x(RK04 as i64, RK03 as i64);
    while data.len() >= 128 {
        x7 = reduce128(x7, get(&mut data, smask), k3k4);
        x6 = reduce128(x6, get(&mut data, smask), k3k4);
        x5 = reduce128(x5, get(&mut data, smask), k3k4);
        x4 = reduce128(x4, get(&mut data, smask), k3k4);
        x3 = reduce128(x3, get(&mut data, smask), k3k4);
        x2 = reduce128(x2, get(&mut data, smask), k3k4);
        x1 = reduce128(x1, get(&mut data, smask), k3k4);
        x0 = reduce128(x0, get(&mut data, smask), k3k4);
    }

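    // Combine the eight accumulators into one, folding each by its own
    // precomputed distance so they all line up at the same position.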
    let k1k2 = _mm_set_epi64x(RK02 as i64, RK01 as i64);
    let mut x = reduce128(x7, x0, _mm_set_epi64x(RK10 as i64, RK09 as i64));
    x = reduce128(x6, x, _mm_set_epi64x(RK12 as i64, RK11 as i64));
    x = reduce128(x5, x, _mm_set_epi64x(RK14 as i64, RK13 as i64));
    x = reduce128(x4, x, _mm_set_epi64x(RK16 as i64, RK15 as i64));
    x = reduce128(x3, x, _mm_set_epi64x(RK18 as i64, RK17 as i64));
    x = reduce128(x2, x, _mm_set_epi64x(RK20 as i64, RK19 as i64));
    x = reduce128(x1, x, k1k2);

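    // Fold any remaining whole 16-byte blocks into the single accumulator.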
    while data.len() >= 16 {
        x = reduce128(x, get(&mut data, smask), k1k2);
    }

    // Reduce 128b to 64b.
    let k5k6 = _mm_set_epi64x(RK06 as i64, RK05 as i64);
    x = _mm_xor_si128(_mm_clmulepi64_si128(x, k5k6, 0x01), _mm_slli_si128(x, 8));
    x = _mm_xor_si128(
        _mm_clmulepi64_si128(_mm_srli_si128(x, 12), k5k6, 0x10),
        _mm_and_si128(x, _mm_set_epi32(0, !0, !0, !0)),
    );

    // Barrett reduction, 64b to 32b.
    let k7k8 = _mm_set_epi64x(RK08 as i64, RK07 as i64);
    let t1 = _mm_slli_si128(_mm_clmulepi64_si128(x, k7k8, 0x01), 4);
    let t2 = _mm_slli_si128(_mm_clmulepi64_si128(t1, k7k8, 0x11), 4);
    let crc = _mm_extract_epi32(_mm_xor_si128(x, t2), 1) as u32;

    if data.is_empty() {
        crc
    } else {
        // We could use intrinsics for the remaining data, but this seems fine for now.
        // Less than 16B remaining, so slice-by-1 instead of slice-by-16.
        crate::baseline::slice_by_1(crc, data)
    }
}

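// Folds `a` by the distance encoded in `keys`: carry-less multiplies the low
// and high 64-bit halves of `a` by the matching halves of `keys`, then XORs
// both products into `b`.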
#[inline(always)]
unsafe fn reduce128(a: __m128i, b: __m128i, keys: __m128i) -> __m128i {
    let t1 = _mm_clmulepi64_si128(a, keys, 0x00);
    let t2 = _mm_clmulepi64_si128(a, keys, 0x11);
    _mm_xor_si128(_mm_xor_si128(b, t1), t2)
}

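// Loads the next 16 bytes of `data` (unaligned), reverses their byte order
// with `smask`, and advances the slice past them.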
#[inline(always)]
unsafe fn get(data: &mut &[u8], smask: __m128i) -> __m128i {
    let r = _mm_shuffle_epi8(_mm_loadu_si128(data.as_ptr() as *const __m128i), smask);
    *data = &data[16..];
    r
}

#[cfg(test)]
mod test {
    use quickcheck_macros::quickcheck;

    #[quickcheck]
    fn check_against_baseline(init: u32, chunks: Vec<(Vec<u8>, usize)>) -> bool {
        let mut baseline = crate::baseline::State::new(init);
        let mut pclmulqdq = super::State::new(init).expect("not supported");
        for (chunk, mut offset) in chunks {
            // Simulate random alignments by offsetting the slice by up to 15 bytes.
            offset &= 0xF;
            if chunk.len() <= offset {
                baseline.update(&chunk);
                pclmulqdq.update(&chunk);
            } else {
                baseline.update(&chunk[offset..]);
                pclmulqdq.update(&chunk[offset..]);
            }
        }
        pclmulqdq.as_u32() == baseline.as_u32()
    }
}