crc-fast 1.10.0

//! Provides CRC-32/ISCSI calculations using a fusion of native CLMUL
//! instructions and native CRC calculation instructions on aarch64.
//!
//! <https://dougallj.wordpress.com/2022/05/22/faster-crc32-on-the-apple-m1/>
//!
//! Converted to Rust from the original C code generated by <https://github.com/corsix/fast-crc32/>
//! with the help of Claude.ai.
//!
//! Modified as necessary for this Rust implementation.
//!
//! MIT licensed.

#![cfg(target_arch = "aarch64")]

use crate::crc32::fusion::aarch64::{clmul_hi_and_xor, clmul_lo_and_xor};
use core::arch::aarch64::{
    __crc32cb, __crc32cd, veorq_u64, vgetq_lane_u64, vld1q_u64, vmovq_n_u64, vsetq_lane_u64,
};

/// Converted to Rust from the original C code generated by https://github.com/corsix/fast-crc32/
/// with the help of Claude.ai using:
///
/// ./generate -i neon -p crc32c -a v12e_v1
///
/// Modified as necessary for this Rust implementation.
#[inline]
#[target_feature(enable = "crc,aes")]
pub(crate) unsafe fn crc32_iscsi_v12e_v1(mut crc0: u32, mut buf: *const u8, mut len: usize) -> u32 {
    // Align to 8-byte boundary
    while len > 0 && (buf as usize & 7) != 0 {
        crc0 = __crc32cb(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    // Handle 8-byte alignment
    if (buf as usize & 8) != 0 && len >= 8 {
        crc0 = __crc32cd(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    if len >= 192 {
        let end = buf.add(len);
        let limit = buf.add(len - 192);

        // First vector chunk
        let mut x0 = vld1q_u64(buf as *const u64);
        let mut x1 = vld1q_u64(buf.add(16) as *const u64);
        let mut x2 = vld1q_u64(buf.add(32) as *const u64);
        let mut x3 = vld1q_u64(buf.add(48) as *const u64);
        let mut x4 = vld1q_u64(buf.add(64) as *const u64);
        let mut x5 = vld1q_u64(buf.add(80) as *const u64);
        let mut x6 = vld1q_u64(buf.add(96) as *const u64);
        let mut x7 = vld1q_u64(buf.add(112) as *const u64);
        let mut x8 = vld1q_u64(buf.add(128) as *const u64);
        let mut x9 = vld1q_u64(buf.add(144) as *const u64);
        let mut x10 = vld1q_u64(buf.add(160) as *const u64);
        let mut x11 = vld1q_u64(buf.add(176) as *const u64);

        let k_vals: [u64; 2] = [0xa87ab8a8, 0xab7aff2a];
        let mut k = vld1q_u64(k_vals.as_ptr());

        // Create CRC vector and XOR with first vector
        let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0);
        x0 = veorq_u64(crc_vec, x0);
        buf = buf.add(192);

        // Main loop
        while buf <= limit {
            let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64));
            x0 = clmul_hi_and_xor(x0, k, y0);
            let y1 = clmul_lo_and_xor(x1, k, vld1q_u64(buf.add(16) as *const u64));
            x1 = clmul_hi_and_xor(x1, k, y1);
            let y2 = clmul_lo_and_xor(x2, k, vld1q_u64(buf.add(32) as *const u64));
            x2 = clmul_hi_and_xor(x2, k, y2);
            let y3 = clmul_lo_and_xor(x3, k, vld1q_u64(buf.add(48) as *const u64));
            x3 = clmul_hi_and_xor(x3, k, y3);
            let y4 = clmul_lo_and_xor(x4, k, vld1q_u64(buf.add(64) as *const u64));
            x4 = clmul_hi_and_xor(x4, k, y4);
            let y5 = clmul_lo_and_xor(x5, k, vld1q_u64(buf.add(80) as *const u64));
            x5 = clmul_hi_and_xor(x5, k, y5);
            let y6 = clmul_lo_and_xor(x6, k, vld1q_u64(buf.add(96) as *const u64));
            x6 = clmul_hi_and_xor(x6, k, y6);
            let y7 = clmul_lo_and_xor(x7, k, vld1q_u64(buf.add(112) as *const u64));
            x7 = clmul_hi_and_xor(x7, k, y7);
            let y8 = clmul_lo_and_xor(x8, k, vld1q_u64(buf.add(128) as *const u64));
            x8 = clmul_hi_and_xor(x8, k, y8);
            let y9 = clmul_lo_and_xor(x9, k, vld1q_u64(buf.add(144) as *const u64));
            x9 = clmul_hi_and_xor(x9, k, y9);
            let y10 = clmul_lo_and_xor(x10, k, vld1q_u64(buf.add(160) as *const u64));
            x10 = clmul_hi_and_xor(x10, k, y10);
            let y11 = clmul_lo_and_xor(x11, k, vld1q_u64(buf.add(176) as *const u64));
            x11 = clmul_hi_and_xor(x11, k, y11);
            buf = buf.add(192);
        }

        // Reduce x0 ... x11 to just x0
        let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo_and_xor(x0, k, x1);
        x0 = clmul_hi_and_xor(x0, k, y0);
        let y2 = clmul_lo_and_xor(x2, k, x3);
        x2 = clmul_hi_and_xor(x2, k, y2);
        let y4 = clmul_lo_and_xor(x4, k, x5);
        x4 = clmul_hi_and_xor(x4, k, y4);
        let y6 = clmul_lo_and_xor(x6, k, x7);
        x6 = clmul_hi_and_xor(x6, k, y6);
        let y8 = clmul_lo_and_xor(x8, k, x9);
        x8 = clmul_hi_and_xor(x8, k, y8);
        let y10 = clmul_lo_and_xor(x10, k, x11);
        x10 = clmul_hi_and_xor(x10, k, y10);

        let k_vals: [u64; 2] = [0x3da6d0cb, 0xba4fc28e];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo_and_xor(x0, k, x2);
        x0 = clmul_hi_and_xor(x0, k, y0);
        let y4 = clmul_lo_and_xor(x4, k, x6);
        x4 = clmul_hi_and_xor(x4, k, y4);
        let y8 = clmul_lo_and_xor(x8, k, x10);
        x8 = clmul_hi_and_xor(x8, k, y8);

        let k_vals: [u64; 2] = [0x740eef02, 0x9e4addf8];
        k = vld1q_u64(k_vals.as_ptr());

        let y0 = clmul_lo_and_xor(x0, k, x4);
        x0 = clmul_hi_and_xor(x0, k, y0);
        x4 = x8;
        let y0 = clmul_lo_and_xor(x0, k, x4);
        x0 = clmul_hi_and_xor(x0, k, y0);

        // Reduce 128 bits to 32 bits, and multiply by x^32
        crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
        len = end.offset_from(buf) as usize;
    }

    if len >= 16 {
        // First vector chunk
        let mut x0 = vld1q_u64(buf as *const u64);

        let k_vals: [u64; 2] = [0xf20c0dfe, 0x493c7d27];
        let k = vld1q_u64(k_vals.as_ptr());

        // Create CRC vector and XOR with first vector
        let crc_vec = vsetq_lane_u64(crc0 as u64, vmovq_n_u64(0), 0);
        x0 = veorq_u64(crc_vec, x0);
        buf = buf.add(16);
        len -= 16;

        // Main loop
        while len >= 16 {
            let y0 = clmul_lo_and_xor(x0, k, vld1q_u64(buf as *const u64));
            x0 = clmul_hi_and_xor(x0, k, y0);
            buf = buf.add(16);
            len -= 16;
        }

        // Reduce 128 bits to 32 bits, and multiply by x^32
        crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
        crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
    }

    while len >= 8 {
        crc0 = __crc32cd(crc0, *(buf as *const u64));
        buf = buf.add(8);
        len -= 8;
    }

    while len > 0 {
        crc0 = __crc32cb(crc0, *buf);
        buf = buf.add(1);
        len -= 1;
    }

    crc0
}