//! ip4sum 0.1.0
//!
//! Highly optimized IPv4 checksum calculation, no-std compatible.
// SPDX-License-Identifier: MIT | Copyright (c) 2026 Khashayar Fereidani

//! Incremental checksum state and core accumulation routine.

/// An incremental Internet checksum calculator.
///
/// This type works like a hash calculator: call [`update`](Checksum::update)
/// with successive chunks of data, then call [`finalize`](Checksum::finalize)
/// to obtain the 16-bit checksum.
///
/// # Examples
///
/// ```
/// use ip4sum::Checksum;
///
/// let mut hasher = Checksum::new();
/// hasher.update(b"hello");
/// let csum = hasher.finalize();
/// ```
#[derive(Clone, Debug)]
pub struct Checksum {
    // Running one's-complement sum, accumulated in native byte order by
    // `checksum_no_fold`; `finalize` folds it to 16 bits and applies the
    // single deferred byte-order correction.
    acc: u64,
}

impl Checksum {
    /// Create a new checksum calculator with an initial accumulator of zero.
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self { acc: 0 }
    }

    /// Feed a slice of data into the running checksum.
    #[inline]
    pub fn update(&mut self, data: &[u8]) {
        checksum_no_fold(data, &mut self.acc);
    }

    /// Consume the calculator and return the 16-bit one's-complement
    /// checksum in network byte order.
    #[inline]
    #[must_use]
    pub fn finalize(self) -> u16 {
        fold(self.acc)
    }

    /// Reset the calculator to its initial state without allocating.
    #[inline]
    pub fn reset(&mut self) {
        self.acc = 0;
    }
}

impl Default for Checksum {
    fn default() -> Self {
        Self::new()
    }
}

/// Accumulate the Internet checksum of `data` into `acc`.
///
/// Processes `data` in 32-bit chunks, adding each to the running
/// accumulator in native byte order; the endian correction is deferred
/// to [`fold`], which swaps once rather than on every call.
///
/// `chunks_exact` guarantees every chunk is exactly four bytes, which
/// lets LLVM hoist the bounds checks and auto-vectorize the hot loop.
/// The previous hand-tiered loops (128/64/4) were no-ops: each outer
/// tier's inner `while b.len() >= 4` loop already consumed every 4-byte
/// chunk, so the outer conditions could never hold a second time.
fn checksum_no_fold(b: &[u8], acc: &mut u64) {
    let mut ac = *acc;

    // Hot loop: 4 bytes per iteration, read in native byte order.
    let mut chunks = b.chunks_exact(4);
    for c in &mut chunks {
        ac = ac.wrapping_add(u32::from_ne_bytes([c[0], c[1], c[2], c[3]]) as u64);
    }

    // 0-3 leftover bytes.
    let mut rest = chunks.remainder();
    if rest.len() >= 2 {
        ac = ac.wrapping_add(u16::from_ne_bytes([rest[0], rest[1]]) as u64);
        rest = &rest[2..];
    }
    if let &[last] = rest {
        // A trailing odd byte is the first byte of a zero-padded 16-bit
        // word. Reading the pair through `from_ne_bytes` keeps the
        // native-order accumulation correct on both endiannesses; the
        // old `b[0] as u64` was only right on little-endian targets
        // (on big-endian the padded word reads as `b[0] << 8`).
        ac = ac.wrapping_add(u16::from_ne_bytes([last, 0]) as u64);
    }

    // Partially fold so repeated `update` calls cannot wrap the u64
    // accumulator and silently drop carries (2^32 ≡ 1 mod 0xFFFF, so
    // this preserves the one's-complement checksum). A single slice of
    // 16 GiB or more could still overflow in theory.
    ac = (ac >> 32) + (ac & 0xffff_ffff);

    *acc = ac;
}

/// Fold a 64-bit native-order accumulator into a 16-bit one's-complement
/// checksum in network byte order.
///
/// One's complement addition is endian-independent: accumulating in native
/// order and byte-swapping only the final `u16` is equivalent to swapping
/// on every update. This saves two `swap_bytes` calls per update for the
/// cost of one `swap_bytes` at the end.
#[inline]
fn fold(acc: u64) -> u16 {
    let mut sum = acc;
    // Keep adding the carry bits back into the low 16 bits until none
    // remain. A fixed number of folding steps is not enough for every
    // 64-bit accumulator: e.g. 0xFFFF_FFFF_0001_0000 needs one more
    // round than the previous three-step sequence performed, which
    // truncated the final carry and produced 0xFFFF instead of 0xFFFE.
    // The loop terminates in at most a handful of iterations for any
    // u64 input.
    while sum > 0xffff {
        sum = (sum >> 16) + (sum & 0xffff);
    }
    let result = !(sum as u16);
    // The accumulator is native-order; a single byte swap converts the
    // result to network (big-endian) byte order on little-endian targets.
    if cfg!(target_endian = "little") {
        result.swap_bytes()
    } else {
        result
    }
}