crc-fast 1.10.0

// Copyright 2025 Don MacAskill. Licensed under MIT or Apache-2.0.

//! This module calculates the keys needed for CRC calculations using PCLMULQDQ.
//!
//! Docs generated by Claude.ai
//!
//! # Overview
//!
//! This implements key generation for hardware-accelerated CRC computation using Intel's
//! PCLMULQDQ instruction (carryless multiplication). The keys enable a "fold-by-8" algorithm
//! that processes data in parallel chunks.
//!
//! # The PCLMULQDQ Algorithm
//!
//! Traditional CRC computation processes data byte-by-byte, which is slow. The PCLMULQDQ
//! instruction enables parallel processing by treating CRC computation as polynomial
//! multiplication in GF(2) (Galois Field with 2 elements, where addition is XOR).
//!
//! The algorithm works by:
//! 1. Loading multiple data chunks into SIMD registers
//! 2. Using precomputed keys to "fold" distant chunks together
//! 3. Reducing the result modulo the CRC polynomial
//!
//! # What Are "Keys"?
//!
//! Keys are precomputed constants representing x^n mod P(x), where:
//! - x^n represents a bit shift of n positions
//! - P(x) is the CRC polynomial
//! - The modulo operation is polynomial division in GF(2)
//!
//! Each key answers: "If I have a CRC value, and I want to shift it by N bits and reduce it,
//! what constant should I multiply by?"
//!
//! # The Distance Concept
//!
//! The "distance" or "exponent" refers to how many bits apart two data chunks are. When
//! processing data in 128-bit (16-byte) SIMD registers:
//!
//! - For fold-by-8: We process 8 registers (128 bytes) at once
//! - Keys at distance 32*N (CRC-32) or 64*N (CRC-64) let us combine chunks N registers apart
//!
//! Example for CRC-32:
//! - Distance 32*3 = 96 bits = 12 bytes (3 registers of 32-bit data)
//! - This key folds together data chunks that are 12 bytes apart
//!
//! The larger distances (32*63, 32*65) handle 256-byte chunks for very high throughput, such as
//! using AVX-512 VPCLMULQDQ.

//! # Why CRC-32 and CRC-64 Implementations Differ So Greatly
//!
//! The implementations look very different, but they're solving the same problem at
//! different scales. Here are the key differences:
//!
//! ## 1. Polynomial Size
//!
//! - **CRC-32**: 33-bit polynomial (32 data bits + implicit leading 1)
//! - **CRC-64**: 65-bit polynomial (64 data bits + implicit leading 1)
//!
//! This fundamental difference cascades through every function.
//!
//! ## 2. Register Representation
//!
//! - **CRC-32**: Uses u64 with values in upper bits (e.g., 0x080000000 = bit 35)
//!   - Has "headroom" above 32 bits for intermediate calculations
//!   - Results often need alignment shifts (>> 31)
//!
//! - **CRC-64**: Uses u64 directly (e.g., 0x8000000000000000 = bit 63)
//!   - No headroom - must use two u64s for μ calculation
//!   - Results are naturally aligned, no extra shifts needed
//!
//! ## 3. Distance Scaling
//!
//! Both algorithms fold data in chunks, but the chunk sizes differ:
//!
//! | Concept | CRC-32 | CRC-64 | Why? |
//! |---------|--------|--------|------|
//! | Base unit | 32 bits (4 bytes) | 64 bits (8 bytes) | Matches polynomial width |
//! | Distance 1 | 96 bits (12 bytes) | 128 bits (16 bytes) | 3× vs 2× base unit |
//! | Large chunk | 2016 bits (252 bytes) | 2048 bits (256 bytes) | Aligned to cache lines |
//!
//! The multipliers are smaller for CRC-64 (2×, 3× vs 3×, 5×) because the base unit
//! is already larger.
//!
//! ## 4. Algorithm Variations
//!
//! - **Key generation**: CRC-32 uses simple if-check; CRC-64 uses bit-mask trick
//!   - Both compute x^n mod P(x), but CRC-64's approach is more efficient
//!
//! - **Mu calculation**: CRC-32 uses single u64; CRC-64 needs double-register
//!   - CRC-64 can't fit x^128 ÷ P(x) in one u64
//!
//! - **Iteration counts**: Off-by-one differences everywhere
//!   - Reflects different starting points and alignment requirements
//!
//! ## 5. Reflection Handling
//!
//! - **CRC-32**: Reverses 32 bits, then shifts by 31
//! - **CRC-64**: Reverses 64 bits, no additional shift
//!
//! Both create the reflected representation needed for LSB-first CRCs, but
//! CRC-64's cleaner alignment means less post-processing.
//!
//! ## The Big Picture
//!
//! Despite the implementation differences, both are doing the same thing:
//! precomputing constants that let PCLMULQDQ fold distant data chunks together
//! efficiently. The differences arise from working with different-sized polynomials
//! within the constraints of fixed-size integer types (u64).

#![allow(dead_code)]

use core::ops::{BitAnd, BitOr, Shl, Shr};

/// Exponents (bit distances) for CRC-16 key generation.
///
/// CRC-16 uses the same exponents as CRC-32 because the folding algorithm operates on
/// 128-bit SIMD registers regardless of CRC width. CRC-16 computation is performed by
/// scaling 16-bit values to 32-bit space, using the CRC-32 algorithm infrastructure,
/// and then scaling the result back to 16 bits.
///
/// See CRC32_EXPONENTS for detailed documentation of the exponent values.
const CRC16_EXPONENTS: [u64; 23] = [
    0, // unused, just aligns indexes with the literature
    32 * 3,
    32 * 5,
    32 * 31,
    32 * 33,
    32 * 3,
    32 * 2,
    0, // mu, generate separately
    0, // poly, generate separately
    32 * 27,
    32 * 29,
    32 * 23,
    32 * 25,
    32 * 19,
    32 * 21,
    32 * 15,
    32 * 17,
    32 * 11,
    32 * 13,
    32 * 7,
    32 * 9,
    32 * 63, // for 256 byte distances (2048 - 32)
    32 * 65, // for 256 byte distances (2048 + 32)
];

/// Exponents (bit distances) for CRC-32 key generation.
///
/// These represent bit distances for the fold-by-8 algorithm. Each exponent defines
/// how many bits apart two data chunks are when folding.
///
/// # Why These Specific Values?
///
/// For CRC-32, the polynomial is 33 bits (including the implicit leading 1), so we work
/// with 32-bit data chunks. The exponents are designed to efficiently fold 128-byte chunks:
///
/// - Indices 1-4: Primary folding constants for 8-chunk (128-byte) processing
///   - 32*3, 32*5: Used for folding the first half
///   - 32*31, 32*33: Used for folding the second half
/// - Indices 5-6: Short-distance folding
///   - 32*3, 32*2: For final reduction stages
/// - Indices 7-8: Special values (mu and polynomial) computed separately
/// - Indices 9-20: Additional folding distances for progressive reduction
///   - These allow folding from 8 chunks down to 4, then 2, then 1
/// - Indices 21-22: Large distances for 256-byte chunk processing, such as using AVX-512 VPCLMULQDQ
///   - 32*63 = 2016 bits (252 bytes)
///   - 32*65 = 2080 bits (260 bytes)
///   - These enable very high-throughput processing of large buffers
const CRC32_EXPONENTS: [u64; 23] = [
    0, // unused, just aligns indexes with the literature
    32 * 3,
    32 * 5,
    32 * 31,
    32 * 33,
    32 * 3,
    32 * 2,
    0, // mu, generate separately
    0, // poly, generate separately
    32 * 27,
    32 * 29,
    32 * 23,
    32 * 25,
    32 * 19,
    32 * 21,
    32 * 15,
    32 * 17,
    32 * 11,
    32 * 13,
    32 * 7,
    32 * 9,
    32 * 63, // for 256 byte distances (2048 - 32)
    32 * 65, // for 256 byte distances (2048 + 32)
];

/// Exponents (bit distances) for CRC-64 key generation.
///
/// Similar to CRC32_EXPONENTS but scaled for 64-bit CRC polynomials.
///
/// # Why Different From CRC-32?
///
/// CRC-64 uses a 65-bit polynomial (64 bits + implicit leading 1), so:
/// - Data chunks are 64 bits instead of 32
/// - All distances are in multiples of 64 instead of 32
/// - The actual byte distances are larger (64*N bits = 8*N bytes)
///
/// # Distance Comparison
///
/// | Index | CRC-32 Distance | CRC-64 Distance | Purpose              |
/// |-------|-----------------|-----------------|----------------------|
/// | 1     | 96 bits (12B)   | 128 bits (16B)  | Primary fold         |
/// | 2     | 160 bits (20B)  | 192 bits (24B)  | Primary fold         |
/// | 21    | 2016 bits (252B)| 2048 bits (256B)| Large chunk folding  |
///
/// The smaller multipliers (2, 3 vs 3, 5) reflect the larger base unit (64 vs 32 bits).
const CRC64_EXPONENTS: [u64; 23] = [
    0, // unused, just aligns indexes with the literature
    64 * 2,
    64 * 3,
    64 * 16,
    64 * 17,
    64 * 2,
    64,
    0, // mu, generate separately
    0, // poly, generate separately
    64 * 14,
    64 * 15,
    64 * 12,
    64 * 13,
    64 * 10,
    64 * 11,
    64 * 8,
    64 * 9,
    64 * 6,
    64 * 7,
    64 * 4,
    64 * 5,
    64 * 32, // for 256 byte distances (2048)
    64 * 33, // for 256 byte distances (2048 + 64)
];

/// Generates the 23 keys needed to calculate CRCs for a given polynomial using PCLMULQDQ when
/// folding by 8.
pub fn keys(width: u8, poly: u64, reflected: bool) -> [u64; 23] {
    let mut keys: [u64; 23] = [0; 23];

    let exponents = if 16 == width {
        CRC16_EXPONENTS
    } else if 32 == width {
        CRC32_EXPONENTS
    } else if 64 == width {
        CRC64_EXPONENTS
    } else {
        panic!("Unsupported width: {width}",);
    };

    let poly = if 16 == width {
        // CRC-16 uses a 17-bit polynomial (16 bits + implicit leading 1) scaled to 32-bit space
        (poly << 16) | (1u64 << 32)
    } else if 32 == width {
        poly | (1u64 << 32)
    } else {
        poly
    };

    for i in 1..23 {
        keys[i] = key(width, poly, reflected, exponents[i]);
    }

    keys[7] = mu(width, poly, reflected);
    keys[8] = polynomial(width, poly, reflected);

    keys
}

fn key(width: u8, poly: u64, reflected: bool, exponent: u64) -> u64 {
    if width == 16 {
        crc16_key(exponent, reflected, poly)
    } else if width == 32 {
        crc32_key(exponent, reflected, poly)
    } else if width == 64 {
        crc64_key(exponent, reflected, poly)
    } else {
        panic!("Unsupported width: {width}",);
    }
}

/// Computes a CRC-16 folding key for a given bit distance (exponent).
///
/// # Algorithm
///
/// CRC-16 key generation uses the same algorithm as CRC-32 because CRC-16 computation
/// is performed by scaling 16-bit values to 32-bit space. The polynomial is already
/// scaled to 32-bit space (poly << 16 | 1 << 32) before this function is called.
///
/// 1. Start with x^32 (represented as 0x080000000, bit 35 set)
/// 2. Multiply by x repeatedly (left shift), reducing modulo P(x) each time
/// 3. After (exponent - 31) iterations, we have x^exponent mod P(x)
///
/// # Reflection
///
/// For reflected CRC-16, we bit-reverse the 16-bit result and shift right by 31 bits
/// to align it properly for PCLMULQDQ operations.
fn crc16_key(exponent: u64, reflected: bool, polynomial: u64) -> u64 {
    if exponent < 32 {
        return 0;
    }

    let mut n: u64 = 0x080000000;
    let e = exponent - 31;

    for _ in 0..e {
        n <<= 1;
        if (n & 0x100000000) != 0 {
            n ^= polynomial;
        }
    }

    if reflected {
        bit_reverse(n) >> 31
    } else {
        n << 32
    }
}

/// Computes a CRC-32 folding key for a given bit distance (exponent).
///
/// # Algorithm
///
/// This calculates x^exponent mod P(x) where P(x) is the CRC-32 polynomial.
///
/// 1. Start with x^32 (represented as 0x080000000, bit 35 set)
/// 2. Multiply by x repeatedly (left shift), reducing modulo P(x) each time
/// 3. After (exponent - 31) iterations, we have x^exponent mod P(x)
///
/// # Why Start at x^32?
///
/// For CRC-32, we're working with 33-bit polynomials (32 data bits + leading 1).
/// Starting at x^32 (bit 35 in a 36-bit representation) gives us the right alignment
/// for the first reduction step.
///
/// # Why exponent - 31?
///
/// We start at x^32 and want to reach x^exponent, so we need (exponent - 32) more
/// multiplications. But the first shift happens before the first check, so it's
/// actually (exponent - 31) iterations.
///
/// # Reflection
///
/// If the CRC is reflected (LSB-first), we bit-reverse the result and shift right
/// by 31 bits to align it properly for PCLMULQDQ operations.
fn crc32_key(exponent: u64, reflected: bool, polynomial: u64) -> u64 {
    if exponent < 32 {
        return 0;
    }

    let mut n: u64 = 0x080000000;
    let e = exponent - 31;

    for _ in 0..e {
        n <<= 1;
        if (n & 0x100000000) != 0 {
            n ^= polynomial;
        }
    }

    if reflected {
        bit_reverse(n) >> 31
    } else {
        n << 32
    }
}

/// Computes a CRC-64 folding key for a given bit distance (exponent).
///
/// # Algorithm
///
/// Similar to CRC-32 but adapted for 64-bit polynomials:
///
/// 1. Start with x^63 (represented as 0x8000000000000000, bit 63 set)
/// 2. Multiply by x repeatedly, reducing modulo P(x) each time
/// 3. After (exponent - 63 or - 64) iterations, we have x^exponent mod P(x)
///
/// # Key Differences From CRC-32
///
/// 1. **Starting point**: 0x8000000000000000 vs 0x080000000
///    - CRC-64 uses bit 63 (the MSB of a 64-bit value)
///    - CRC-32 uses bit 35 (extended beyond 32 bits)
///
/// 2. **Iteration count**: Depends on reflection
///    - Reflected: exponent - 64 (because we iterate from x^64)
///    - Non-reflected: exponent - 63 (because we start at x^63)
///
/// 3. **Reduction step**: Uses a more efficient formulation
///    - `(n << 1) ^ ((0_u64.wrapping_sub(n >> 63)) & polynomial)`
///    - This is equivalent to: if MSB is set, shift and XOR with poly; else just shift
///    - The wrapping_sub creates a mask of all 1s or all 0s based on the MSB
///
/// 4. **No additional shift in result**: The result is already properly aligned
///    - CRC-32 needs `>> 31` adjustment for reflected case
///    - CRC-64 doesn't need this because it operates on full 64-bit values
fn crc64_key(exponent: u64, reflected: bool, polynomial: u64) -> u64 {
    if exponent <= 64 {
        return 0;
    }

    let mut n: u64 = 0x8000000000000000;
    let e = if reflected {
        exponent - 64
    } else {
        exponent - 63
    };

    for _ in 0..e {
        n = (n << 1) ^ ((0_u64.wrapping_sub(n >> 63)) & polynomial);
    }

    if reflected {
        bit_reverse(n)
    } else {
        n
    }
}

fn polynomial(width: u8, polynomial: u64, reflected: bool) -> u64 {
    if width == 16 {
        crc16_polynomial(polynomial, reflected)
    } else if width == 32 {
        crc32_polynomial(polynomial, reflected)
    } else if width == 64 {
        crc64_polynomial(polynomial, reflected)
    } else {
        panic!("Unsupported width: {width}",);
    }
}

/// Formats a CRC-16 polynomial for use in PCLMULQDQ operations.
///
/// # Polynomial Representation
///
/// The polynomial passed to this function is already scaled to 32-bit space
/// (original_poly << 16 | 1 << 32). This function formats it for PCLMULQDQ.
///
/// # Non-Reflected Case
///
/// For non-reflected CRCs, the polynomial is already in the correct format
/// from the scaling done in keys().
///
/// # Reflected Case
///
/// For reflected (LSB-first) CRCs, we need to:
/// 1. Extract the original 16-bit polynomial from the scaled value
/// 2. Bit-reverse the 16-bit polynomial
/// 3. Shift left by 1 bit and set the LSB to 1
fn crc16_polynomial(polynomial: u64, reflected: bool) -> u64 {
    if !reflected {
        return polynomial;
    }

    // Extract original 16-bit poly from scaled polynomial (poly << 16 | 1 << 32)
    let original_poly = ((polynomial >> 16) & 0xFFFF) as u16;
    let reversed = bit_reverse(original_poly);
    ((reversed as u64) << 1) | 1
}

/// Formats a CRC-32 polynomial for use in PCLMULQDQ operations.
///
/// # Polynomial Representation
///
/// CRC polynomials are typically written without the implicit leading 1. For example:
/// - CRC-32 (IEEE 802.3): 0x04C11DB7
/// - The full polynomial is actually 0x104C11DB7 (33 bits)
///
/// # Non-Reflected Case
///
/// For non-reflected CRCs, we simply add the leading 1:
/// - Input: 0x04C11DB7
/// - Output: 0x104C11DB7 (bit 32 set)
///
/// # Reflected Case
///
/// For reflected (LSB-first) CRCs, we need to:
/// 1. Bit-reverse the 32-bit polynomial value
/// 2. Shift left by 1 bit
/// 3. Set the LSB to 1
///
/// This creates the reflected polynomial in the format expected by PCLMULQDQ.
///
/// Example:
/// - Original: 0x04C11DB7
/// - Bit-reversed: 0xEDB88320
/// - Shifted and ORed: 0x1DB710641
fn crc32_polynomial(polynomial: u64, reflected: bool) -> u64 {
    if !reflected {
        return polynomial | (1u64 << 32);
    };

    // For 32-bit polynomials, operate on full 33 bits including leading 1
    let reversed = bit_reverse((polynomial & 0xFFFFFFFF) as u32);
    // Need to set bit 32 (33rd bit) to get the 1 in the right position after reflection
    ((reversed as u64) << 1) | 1
}

/// Formats a CRC-64 polynomial for use in PCLMULQDQ operations.
///
/// # Key Difference From CRC-32
///
/// # Non-Reflected Case
///
/// For non-reflected CRCs, we simply return the polynomial as-is.
///
/// # Reflected Case
///
/// For reflected CRC-64:
/// 1. Bit-reverse all 64 bits of the polynomial
/// 2. Shift left by 1
/// 3. Set LSB to 1
///
/// Unlike CRC-32 which only reverses 32 bits, this reverses the full 64-bit value.
fn crc64_polynomial(polynomial: u64, reflected: bool) -> u64 {
    if !reflected {
        return polynomial;
    };

    // For 64-bit polynomials, operate on all 64 bits
    (bit_reverse(polynomial) << 1) | 1
}

fn mu(width: u8, polynomial: u64, reflected: bool) -> u64 {
    if width == 16 {
        crc16_mu(polynomial, reflected)
    } else if width == 32 {
        crc32_mu(polynomial, reflected)
    } else if width == 64 {
        crc64_mu(polynomial, reflected)
    } else {
        panic!("Unsupported width: {width}",);
    }
}

/// Computes the Barrett reduction constant (μ) for CRC-16.
///
/// # What Is μ (Mu)?
///
/// Mu is used in Barrett reduction for fast modular reduction without division.
/// For CRC-16 operations (scaled to 32-bit space), μ = floor(x^64 / P(x)).
///
/// # Algorithm
///
/// This uses the same algorithm as CRC-32 mu calculation because CRC-16 is
/// computed in 32-bit space. The polynomial is already scaled (poly << 16 | 1 << 32).
///
/// 1. Start with x^64 (represented as 0x100000000, bit 32 set)
/// 2. For each bit position from MSB down:
///    - If the dividend has a bit at position 32, record a 1 in quotient
///    - XOR the dividend with the polynomial
///    - Shift dividend left
/// 3. After 33 iterations, q contains μ
fn crc16_mu(polynomial: u64, reflected: bool) -> u64 {
    let mut n: u64 = 0x100000000;
    let mut q: u64 = 0;

    for _ in 0..33 {
        q <<= 1;
        if n & 0x100000000 != 0 {
            q |= 1;
            n ^= polynomial;
        }
        n <<= 1;
    }

    if reflected {
        bit_reverse(q) >> 31
    } else {
        q
    }
}

/// Computes the Barrett reduction constant (μ) for CRC-32.
///
/// # What Is μ (Mu)?
///
/// Mu is used in Barrett reduction, an algorithm for fast modular reduction without division.
/// For CRC operations, μ = floor(x^(2n) / P(x)) where n is the polynomial degree.
///
/// # Algorithm
///
/// This performs polynomial long division:
/// 1. Start with x^64 (represented as 0x100000000, bit 32 set in a 65-bit view)
/// 2. For each bit position from MSB down:
///    - If the dividend has a bit at position 32, record a 1 in quotient
///    - XOR the dividend with the polynomial (subtract in GF(2))
///    - Shift dividend left
/// 3. After 33 iterations, q contains μ
///
/// # Why 33 Iterations?
///
/// We're dividing a 65-bit value (x^64) by a 33-bit polynomial, giving a 33-bit quotient.
///
/// # Reflection
///
/// If reflected, the result is bit-reversed and shifted right by 31 to align properly.
fn crc32_mu(polynomial: u64, reflected: bool) -> u64 {
    let mut n: u64 = 0x100000000;
    let mut q: u64 = 0;

    for _ in 0..33 {
        q <<= 1;
        if n & 0x100000000 != 0 {
            q |= 1;
            n ^= polynomial;
        }
        n <<= 1;
    }

    if reflected {
        bit_reverse(q) >> 31
    } else {
        q
    }
}

/// Computes the Barrett reduction constant (μ) for CRC-64.
///
/// # Key Differences From CRC-32
///
/// CRC-64 requires dividing a 129-bit value (x^128) by a 65-bit polynomial, which
/// doesn't fit in a single u64. This implementation uses two u64 variables to represent
/// a 128-bit dividend.
///
/// # Algorithm
///
/// 1. Start with x^128 represented as (n_hi=1, n_lo=0)
/// 2. For each bit position:
///    - If n_hi is non-zero, record 1 in quotient and XOR n_lo with polynomial
///    - Shift the double-register left: n_hi = n_lo >> 63; n_lo <<= 1
/// 3. After 64 or 65 iterations, q contains μ
///
/// # Why Two Registers?
///
/// - n_hi:n_lo forms a 128-bit value
/// - We can only XOR with the polynomial when "bit 64" is set (n_hi != 0)
/// - The shift operation propagates the MSB of n_lo into n_hi
///
/// # Iteration Count Difference
///
/// - Reflected: 64 iterations (matches the polynomial degree)
/// - Non-reflected: 65 iterations (one extra for proper alignment)
///
/// This asymmetry exists because reflected CRCs process data LSB-first,
/// changing the division mechanics slightly.
///
/// # No Additional Shift
///
/// Unlike CRC-32's `>> 31`, CRC-64 doesn't need an extra shift in the reflected
/// case because the 64-bit result is already properly aligned.
fn crc64_mu(polynomial: u64, reflected: bool) -> u64 {
    let mut n_hi: u64 = 0x0000000000000001;
    let mut n_lo: u64 = 0x0000000000000000;
    let mut q: u64 = 0;

    let max = if reflected { 64 } else { 65 };

    for _ in 0..max {
        q <<= 1;
        if n_hi != 0 {
            q |= 1;
            n_lo ^= polynomial;
        }
        n_hi = n_lo >> 63;
        n_lo <<= 1;
    }

    if reflected {
        bit_reverse(q)
    } else {
        q
    }
}

/// Reverses the bits of a value (MSB becomes LSB and vice versa).
///
/// # Purpose in CRC Calculations
///
/// Reflected (LSB-first) CRCs process data in reverse bit order. This function
/// converts between normal and reflected representations.
///
/// # Why Generic?
///
/// This works with any unsigned integer type (u32, u64) through trait bounds,
/// allowing the same code to handle both CRC-32 (which reverses 32 bits) and
/// CRC-64 (which reverses 64 bits).
///
/// # Example
///
/// For u32: 0x12345678 → 0x1E6A2C48
/// - Binary: 0001 0010 0011 0100 0101 0110 0111 1000
/// - Reversed: 0001 1110 0110 1010 0010 1100 0100 1000
fn bit_reverse<T>(mut value: T) -> T
where
    T: Copy
        + Default
        + PartialEq
        + BitAnd<Output = T>
        + BitOr<Output = T>
        + Shl<usize, Output = T>
        + Shr<usize, Output = T>
        + From<u8>,
{
    let one = T::from(1u8);
    let mut result = T::default(); // Zero value

    // Get the bit size of type T
    let bit_size = core::mem::size_of::<T>() * 8;

    for _ in 0..bit_size {
        // Shift result left by 1
        result = result << 1;

        // OR with least significant bit of value
        result = result | (value & one);

        // Shift value right by 1
        value = value >> 1;
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::crc16::consts::{KEYS_1021_REVERSE, KEYS_8BB7_FORWARD};
    use crate::test::consts::TEST_ALL_CONFIGS;

    #[test]
    fn test_all() {
        for config in TEST_ALL_CONFIGS {
            let keys = keys(config.get_width(), config.get_poly(), config.get_refin());
            let expected = config.get_keys();

            for (i, key) in keys.iter().enumerate() {
                assert_eq!(
                    *key,
                    expected[i],
                    "Mismatch in keys for {} at index {}: expected 0x{:016x}, got 0x{:016x}",
                    config.get_name(),
                    i,
                    expected[i],
                    *key
                );
            }
        }
    }

    #[test]
    fn test_crc16_t10_dif_keys() {
        // CRC-16/T10-DIF: forward mode, poly=0x8BB7
        let generated = keys(16, 0x8BB7, false);

        for (i, key) in generated.iter().enumerate().take(21) {
            assert_eq!(
                *key, KEYS_8BB7_FORWARD[i],
                "CRC-16/T10-DIF key mismatch at index {}: expected 0x{:016x}, got 0x{:016x}",
                i, KEYS_8BB7_FORWARD[i], *key
            );
        }
    }

    #[test]
    fn test_crc16_ibm_sdlc_keys() {
        // CRC-16/IBM-SDLC: reflected mode, poly=0x1021
        let generated = keys(16, 0x1021, true);

        for (i, key) in generated.iter().enumerate().take(21) {
            assert_eq!(
                *key, KEYS_1021_REVERSE[i],
                "CRC-16/IBM-SDLC key mismatch at index {}: expected 0x{:016x}, got 0x{:016x}",
                i, KEYS_1021_REVERSE[i], *key
            );
        }
    }
}

#[cfg(test)]
mod property_tests {
    use super::*;
    use crate::test::miri_compatible_proptest_config;
    use proptest::prelude::*;

    /// Helper function to bit-reverse a 16-bit value
    fn bit_reverse_16(value: u16) -> u16 {
        let mut v = value;
        let mut result: u16 = 0;
        for _ in 0..16 {
            result = (result << 1) | (v & 1);
            v >>= 1;
        }
        result
    }

    proptest! {
        #![proptest_config(miri_compatible_proptest_config())]

        /// Feature: crc16-hardware-acceleration, Property 1: Forward polynomial formatting
        /// *For any* 16-bit polynomial value P, when generating the polynomial key for forward
        /// (non-reflected) CRC-16, the result SHALL equal `(P << 16) | (1 << 32)`.
        /// **Validates: Requirements 1.5**
        #[test]
        fn prop_forward_polynomial_formatting(poly in 0u16..=0xFFFFu16) {
            let generated_keys = keys(16, poly as u64, false);
            let polynomial_key = generated_keys[8];
            let expected = ((poly as u64) << 16) | (1u64 << 32);
            prop_assert_eq!(
                polynomial_key, expected,
                "Forward polynomial formatting failed for poly=0x{:04X}: expected 0x{:016X}, got 0x{:016X}",
                poly, expected, polynomial_key
            );
        }

        /// Feature: crc16-hardware-acceleration, Property 2: Reflected polynomial formatting
        /// *For any* 16-bit polynomial value P, when generating the polynomial key for reflected
        /// CRC-16, the result SHALL equal `(bit_reverse_16(P) << 1) | 1`.
        /// **Validates: Requirements 1.6**
        #[test]
        fn prop_reflected_polynomial_formatting(poly in 0u16..=0xFFFFu16) {
            let generated_keys = keys(16, poly as u64, true);
            let polynomial_key = generated_keys[8];
            let reversed = bit_reverse_16(poly);
            let expected = ((reversed as u64) << 1) | 1;
            prop_assert_eq!(
                polynomial_key, expected,
                "Reflected polynomial formatting failed for poly=0x{:04X}: expected 0x{:016X}, got 0x{:016X}",
                poly, expected, polynomial_key
            );
        }
    }
}