// crc-fast 1.10.0

//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL
//! instructions and native CRC calculation instructions on aarch64.
//!
//! https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/
//!
//! Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
//! with the help of Claude.ai.
//!
//! Modified as necessary for this Rust implementation.
//!
//! MIT licensed.

#![cfg(target_arch = "aarch64")]

use crate::crc32::fusion::aarch64::{clmul_hi, clmul_lo, clmul_scalar};
use core::arch::aarch64::{
    __crc32cb, __crc32cd, __crc32cw, uint64x2_t, veor3q_u64, veorq_u64, vgetq_lane_u64, vld1q_u64,
    vmov_n_u64, vmull_p8, vreinterpret_p8_u64, vreinterpretq_u64_p16,
};

/// Updates a CRC-32/ISCSI state over the `len` bytes starting at `buf`, interleaving three
/// scalar `crc32c` instruction streams (`crc0`..`crc2`) with nine 128-bit vector lanes
/// (`x0`..`x8`) that are folded with `clmul_lo` / `clmul_hi` and three-way XOR (`veor3q_u64`).
///
/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
/// with the help of Claude.ai using:
///
/// ./generate -i neon_eor3 -p crc32c -a v9s3x2e_s3
///
/// Modified as necessary for this Rust implementation.
///
/// `crc0` is the incoming CRC state and the updated state is returned. This function applies no
/// bit inversion to the state on entry or on exit.
///
/// # Safety
///
/// - `buf` must be valid for reads of `len` bytes.
/// - The running CPU must support the `crc`, `aes` and `sha3` target features that are enabled
///   on this function.
#[inline]
#[target_feature(enable = "crc,aes,sha3")]
pub(crate) unsafe fn crc32_iscsi_eor3_v9s3x2e_s3(
    mut crc0: u32,
    mut buf: *const u8,
    mut len: usize,
) -> u32 {
    // Align to 8-byte boundary
    // (byte-at-a-time until the address is a multiple of 8 or the input is exhausted, so every
    // later `u64` read through `buf` is 8-byte aligned).
    while len > 0 && (buf as usize & 7) != 0 {
        crc0 = __crc32cb(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    // Handle 8-byte alignment
    // (if address bit 3 is set and at least 8 bytes remain, consume one 8-byte word so that
    // `buf` becomes a multiple of 16).
    if (buf as usize & 8) != 0 && len >= 8 {
        crc0 = __crc32cd(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    // Large-input path. The next `blk * 192` bytes are laid out as:
    //   [stream 0: klen][stream 1: klen][stream 2: klen][vector region: blk * 144]
    // with `klen = blk * 16`, i.e. 48 scalar bytes plus 144 vector bytes per 192-byte unit.
    if len >= 192 {
        let end = buf.add(len);
        let blk = len / 192;
        let klen = blk * 16;
        // Start of the vector region, directly after the three scalar streams.
        let buf2 = buf.add(klen * 3);
        // The main loop runs while stream 0 still has at least 32 bytes left, so exactly
        // 16 bytes per scalar stream remain for the "final scalar chunk" below.
        let limit = buf.add(klen).sub(32);
        let mut crc1 = 0u32;
        let mut crc2 = 0u32;

        // First vector chunk
        // (nine 16-byte lanes = 144 bytes, loaded without folding).
        let mut x0 = vld1q_u64(buf2 as *const u64);
        let mut x1 = vld1q_u64(buf2.add(16) as *const u64);
        let mut x2 = vld1q_u64(buf2.add(32) as *const u64);
        let mut x3 = vld1q_u64(buf2.add(48) as *const u64);
        let mut x4 = vld1q_u64(buf2.add(64) as *const u64);
        let mut x5 = vld1q_u64(buf2.add(80) as *const u64);
        let mut x6 = vld1q_u64(buf2.add(96) as *const u64);
        let mut x7 = vld1q_u64(buf2.add(112) as *const u64);
        let mut x8 = vld1q_u64(buf2.add(128) as *const u64);

        // Folding constants used by every main-loop iteration.
        // NOTE(review): presumably precomputed x^k mod P values for a 144-byte stride, as
        // emitted by the fast-crc32 generator - not derived in this file.
        let k_vals: [u64; 2] = [0x7e908048, 0xc96cfdc0];
        let mut k = vld1q_u64(k_vals.as_ptr());
        // Shadow `buf2` as a mutable cursor positioned after the first vector chunk.
        let mut buf2 = buf2.add(144);

        // Main loop
        // Each iteration advances the vector cursor by 144 bytes and each scalar stream by
        // 16 bytes.
        while buf <= limit {
            // Multiply each lane by `k` (low and high products kept separately)...
            let y0 = clmul_lo(x0, k);
            x0 = clmul_hi(x0, k);
            let y1 = clmul_lo(x1, k);
            x1 = clmul_hi(x1, k);
            let y2 = clmul_lo(x2, k);
            x2 = clmul_hi(x2, k);
            let y3 = clmul_lo(x3, k);
            x3 = clmul_hi(x3, k);
            let y4 = clmul_lo(x4, k);
            x4 = clmul_hi(x4, k);
            let y5 = clmul_lo(x5, k);
            x5 = clmul_hi(x5, k);
            let y6 = clmul_lo(x6, k);
            x6 = clmul_hi(x6, k);
            let y7 = clmul_lo(x7, k);
            x7 = clmul_hi(x7, k);
            let y8 = clmul_lo(x8, k);
            x8 = clmul_hi(x8, k);

            // ...then XOR both products with the next 16 input bytes of the same lane.
            x0 = veor3q_u64(x0, y0, vld1q_u64(buf2 as *const u64));
            x1 = veor3q_u64(x1, y1, vld1q_u64(buf2.add(16) as *const u64));
            x2 = veor3q_u64(x2, y2, vld1q_u64(buf2.add(32) as *const u64));
            x3 = veor3q_u64(x3, y3, vld1q_u64(buf2.add(48) as *const u64));
            x4 = veor3q_u64(x4, y4, vld1q_u64(buf2.add(64) as *const u64));
            x5 = veor3q_u64(x5, y5, vld1q_u64(buf2.add(80) as *const u64));
            x6 = veor3q_u64(x6, y6, vld1q_u64(buf2.add(96) as *const u64));
            x7 = veor3q_u64(x7, y7, vld1q_u64(buf2.add(112) as *const u64));
            x8 = veor3q_u64(x8, y8, vld1q_u64(buf2.add(128) as *const u64));

            // Two 8-byte words per scalar stream, interleaved across the three streams so
            // consecutive instructions do not depend on each other's result.
            crc0 = __crc32cd(crc0, *(buf as *const u64));
            crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
            crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
            crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64));
            crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64));
            crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64));

            buf = buf.add(16);
            buf2 = buf2.add(144);
        }

        // Reduce x0 ... x8 to just x0
        // Step 1: fold x0 into x1, then renumber the lanes so eight remain (x0..x7).
        let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo(x0, k);
        x0 = clmul_hi(x0, k);
        x0 = veor3q_u64(x0, y0, x1);
        x1 = x2;
        x2 = x3;
        x3 = x4;
        x4 = x5;
        x5 = x6;
        x6 = x7;
        x7 = x8;

        // Step 2: fold adjacent pairs with the same constant, 8 lanes -> 4 (x0, x2, x4, x6).
        let y0 = clmul_lo(x0, k);
        x0 = clmul_hi(x0, k);
        let y2 = clmul_lo(x2, k);
        x2 = clmul_hi(x2, k);
        let y4 = clmul_lo(x4, k);
        x4 = clmul_hi(x4, k);
        let y6 = clmul_lo(x6, k);
        x6 = clmul_hi(x6, k);

        x0 = veor3q_u64(x0, y0, x1);
        x2 = veor3q_u64(x2, y2, x3);
        x4 = veor3q_u64(x4, y4, x5);
        x6 = veor3q_u64(x6, y6, x7);

        // Step 3: fold lanes two apart with a new constant, 4 lanes -> 2 (x0, x4).
        let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo(x0, k);
        x0 = clmul_hi(x0, k);
        let y4 = clmul_lo(x4, k);
        x4 = clmul_hi(x4, k);

        x0 = veor3q_u64(x0, y0, x2);
        x4 = veor3q_u64(x4, y4, x6);

        // Step 4: fold lanes four apart with a new constant, 2 lanes -> 1 (x0).
        let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo(x0, k);
        x0 = clmul_hi(x0, k);
        x0 = veor3q_u64(x0, y0, x4);

        // Final scalar chunk
        // (the last 16 bytes of each scalar stream that the main loop left unprocessed).
        crc0 = __crc32cd(crc0, *(buf as *const u64));
        crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
        crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
        crc0 = __crc32cd(crc0, *(buf.add(8) as *const u64));
        crc1 = __crc32cd(crc1, *(buf.add(klen + 8) as *const u64));
        crc2 = __crc32cd(crc2, *(buf.add(klen * 2 + 8) as *const u64));

        // Each byte count passed here is the number of bytes of this block that follow the
        // corresponding scalar stream (later streams plus the `blk * 144` vector bytes).
        // The three shifted values are XORed together and the low 64 bits are kept.
        let vc0 = crc_shift_iscsi(crc0, klen * 2 + blk * 144);
        let vc1 = crc_shift_iscsi(crc1, klen + blk * 144);
        let vc2 = crc_shift_iscsi(crc2, blk * 144);
        let vc = vgetq_lane_u64(veor3q_u64(vc0, vc1, vc2), 0);

        // Reduce 128 bits to 32 bits, and multiply by x^32
        // The combined scalar contribution `vc` is XORed into the upper 64 bits of x0 before
        // the second CRC step.
        crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32cd(crc0, vc ^ vgetq_lane_u64(x0, 1));

        // Continue after the vector region. `buf2` sits `blk * 192` bytes past the start of
        // this block, which is at most `end`, so the offset is non-negative.
        buf = buf2;
        len = end.offset_from(buf) as usize;
    }

    // Medium-input path: three interleaved scalar streams of `klen` bytes each, followed by
    // one final 8-byte word that is combined with the third stream.
    if len >= 32 {
        let klen = ((len - 8) / 24) * 8;
        let mut crc1 = 0u32;
        let mut crc2 = 0u32;

        // Main loop
        // Runs `klen / 8` times (at least once): 8 bytes per stream, 24 bytes per iteration.
        loop {
            crc0 = __crc32cd(crc0, *(buf as *const u64));
            crc1 = __crc32cd(crc1, *(buf.add(klen) as *const u64));
            crc2 = __crc32cd(crc2, *(buf.add(klen * 2) as *const u64));
            buf = buf.add(8);
            len -= 24;
            if len < 32 {
                break;
            }
        }

        // Shift streams 0 and 1 by the number of bytes that follow them, including the final
        // 8-byte word handled below.
        let vc0 = crc_shift_iscsi(crc0, klen * 2 + 8);
        let vc1 = crc_shift_iscsi(crc1, klen + 8);
        let vc = vgetq_lane_u64(veorq_u64(vc0, vc1), 0);

        // Final 8 bytes
        // `buf` is at the end of stream 0; skipping streams 1 and 2 lands on the word that
        // follows stream 2. That word is XORed with `vc` and fed through stream 2's CRC.
        buf = buf.add(klen * 2);
        crc0 = crc2;
        crc0 = __crc32cd(crc0, *(buf as *const u64) ^ vc);
        buf = buf.add(8);
        len -= 8;
    }

    // Remaining whole 8-byte words.
    while len >= 8 {
        crc0 = __crc32cd(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    // Remaining tail bytes (fewer than 8).
    while len > 0 {
        crc0 = __crc32cb(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    crc0
}

/// Combines `crc` with `x^(nbytes * 8 - 33) mod P` via `clmul_scalar`, producing the vector
/// that the caller XORs into the data stream to account for `nbytes` bytes that follow the
/// region `crc` was computed over.
///
/// `nbytes` must be at least 5 so that the exponent `nbytes * 8 - 33` does not underflow.
///
/// # Safety
///
/// The running CPU must support the target features required by this function and by
/// `xnmodp_crc32_iscsi`, which it calls.
#[inline]
#[target_feature(enable = "aes")]
unsafe fn crc_shift_iscsi(crc: u32, nbytes: usize) -> uint64x2_t {
    // Bit distance to shift by; the -33 offset comes from the upstream generated code.
    let exponent = (nbytes * 8 - 33) as u64;
    let multiplier = xnmodp_crc32_iscsi(exponent);
    clmul_scalar(crc, multiplier)
}

/// Computes `x^n mod P` for the CRC-32/ISCSI polynomial in `log(n)` time.
///
/// The result uses the bit-reflected convention of the `crc32c` instructions: bit 31 holds
/// the coefficient of `x^0`, so `x^0` is `0x8000_0000`.
///
/// # Safety
///
/// The running CPU must support the `crc` and `aes` target features enabled on this function.
#[inline]
#[target_feature(enable = "crc,aes")]
unsafe fn xnmodp_crc32_iscsi(mut n: u64) -> u32 {
    // Phase 1: shrink `n` to at most 191 using n -> (n >> 1) - 16, remembering each dropped
    // low bit. `pending` is a bit stack seeded with `!1`; it is inverted once afterwards, so
    // it ends up holding the complemented bits beneath a single sentinel 1 bit.
    let mut pending = !1u64;
    while n > 191 {
        // The freshly shifted-in low bit is zero, so OR-ing the new bit is exact.
        pending = (pending << 1) | (n & 1);
        n = (n >> 1) - 16;
    }
    pending = !pending;

    // Phase 2: build x^n for the reduced n directly. Start with x^(n mod 32) as a single
    // bit, then multiply by x^32 once per remaining 32-bit step by feeding a zero word
    // through the CRC instruction.
    let mut result = 0x8000_0000u32 >> (n & 31);
    for _ in 0..(n >> 5) {
        result = __crc32cw(result, 0);
    }

    // Phase 3: replay the recorded steps, most recent first. Each step squares `result` and
    // reduces it with a 64-bit CRC step. Popping the sentinel leaves `pending` at zero and
    // ends the loop.
    loop {
        let shift = (pending & 1) as u32;
        pending >>= 1;
        if pending == 0 {
            break;
        }
        // Squaring over GF(2) has no cross terms, so squaring each byte on its own with
        // `vmull_p8` gives the square of the whole 32-bit value in the low 64-bit lane.
        let poly = vreinterpret_p8_u64(vmov_n_u64(u64::from(result)));
        let product = vreinterpretq_u64_p16(vmull_p8(poly, poly));
        result = __crc32cd(0, vgetq_lane_u64(product, 0) << shift);
    }
    result
}