arcbox-net 0.1.6

//! High-performance checksum calculation.
//!
//! This module provides optimized checksum calculation routines for
//! IP and TCP/UDP headers, including incremental updates for NAT.

/// Folds a 32-bit sum into a 16-bit checksum.
#[inline(always)]
pub fn checksum_fold(mut sum: u32) -> u16 {
    while sum >> 16 != 0 {
        sum = (sum & 0xFFFF) + (sum >> 16);
    }
    !sum as u16
}

/// Calculates the ones' complement sum of 16-bit words.
///
/// This is the core operation for IP/TCP/UDP checksums.
#[inline]
pub fn checksum_add(data: &[u8]) -> u32 {
    let mut sum: u32 = 0;
    let mut i = 0;

    // Process 16-bit words
    while i + 1 < data.len() {
        let word = u16::from_be_bytes([data[i], data[i + 1]]);
        sum = sum.wrapping_add(word as u32);
        i += 2;
    }

    // Handle odd byte
    if i < data.len() {
        sum = sum.wrapping_add((data[i] as u32) << 8);
    }

    sum
}

/// Calculates Internet checksum over data.
#[inline]
pub fn checksum(data: &[u8]) -> u16 {
    checksum_fold(checksum_add(data))
}

/// Incremental checksum update (RFC 1624).
///
/// When a 16-bit value changes from `old` to `new`, this function
/// efficiently updates an existing checksum without recalculating
/// the entire packet.
///
/// Formula: ~C' = ~C + ~m + m'
/// Where C is old checksum, m is old value, m' is new value.
#[inline(always)]
pub fn incremental_checksum_update(old_checksum: u16, old_value: u16, new_value: u16) -> u16 {
    // ~C + ~m + m'
    let sum = (!old_checksum as u32)
        .wrapping_add(!old_value as u32)
        .wrapping_add(new_value as u32);

    // Fold and complement
    checksum_fold(sum)
}

/// Updates checksum for a 32-bit (4-byte) field change.
///
/// Useful for IP address changes in NAT.
#[inline]
pub fn incremental_checksum_update_32(old_checksum: u16, old_value: u32, new_value: u32) -> u16 {
    let old_hi = (old_value >> 16) as u16;
    let old_lo = old_value as u16;
    let new_hi = (new_value >> 16) as u16;
    let new_lo = new_value as u16;

    // Update for high word
    let checksum = incremental_checksum_update(old_checksum, old_hi, new_hi);
    // Update for low word
    incremental_checksum_update(checksum, old_lo, new_lo)
}

/// Updates checksum for IP address change.
#[inline]
pub fn update_checksum_for_ip(old_checksum: u16, old_ip: [u8; 4], new_ip: [u8; 4]) -> u16 {
    let old_val = u32::from_be_bytes(old_ip);
    let new_val = u32::from_be_bytes(new_ip);
    incremental_checksum_update_32(old_checksum, old_val, new_val)
}

/// Updates checksum for port change.
#[inline]
pub fn update_checksum_for_port(old_checksum: u16, old_port: u16, new_port: u16) -> u16 {
    incremental_checksum_update(old_checksum, old_port, new_port)
}

/// Updates checksum for both IP and port change (common in NAT).
#[inline]
pub fn update_checksum_for_nat(
    old_checksum: u16,
    old_ip: [u8; 4],
    old_port: u16,
    new_ip: [u8; 4],
    new_port: u16,
) -> u16 {
    let checksum = update_checksum_for_ip(old_checksum, old_ip, new_ip);
    update_checksum_for_port(checksum, old_port, new_port)
}

/// Calculates IPv4 header checksum.
///
/// The header checksum covers only the IP header (not payload).
/// Assumes checksum field is zeroed before calculation.
#[inline]
pub fn ipv4_header_checksum(header: &[u8]) -> u16 {
    debug_assert!(header.len() >= 20, "IPv4 header too short");
    checksum(header)
}

/// Calculates TCP checksum including pseudo-header.
///
/// TCP checksum covers: pseudo-header + TCP header + data.
#[inline]
pub fn tcp_checksum(src_ip: [u8; 4], dst_ip: [u8; 4], tcp_segment: &[u8]) -> u16 {
    let mut sum: u32 = 0;

    // Pseudo-header
    sum = sum.wrapping_add(u16::from_be_bytes([src_ip[0], src_ip[1]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([src_ip[2], src_ip[3]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([dst_ip[0], dst_ip[1]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([dst_ip[2], dst_ip[3]]) as u32);
    sum = sum.wrapping_add(6u32); // Protocol (TCP = 6)
    sum = sum.wrapping_add(tcp_segment.len() as u32);

    // TCP segment
    sum = sum.wrapping_add(checksum_add(tcp_segment));

    checksum_fold(sum)
}

/// Calculates UDP checksum including pseudo-header.
///
/// UDP checksum covers: pseudo-header + UDP header + data.
#[inline]
pub fn udp_checksum(src_ip: [u8; 4], dst_ip: [u8; 4], udp_datagram: &[u8]) -> u16 {
    let mut sum: u32 = 0;

    // Pseudo-header
    sum = sum.wrapping_add(u16::from_be_bytes([src_ip[0], src_ip[1]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([src_ip[2], src_ip[3]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([dst_ip[0], dst_ip[1]]) as u32);
    sum = sum.wrapping_add(u16::from_be_bytes([dst_ip[2], dst_ip[3]]) as u32);
    sum = sum.wrapping_add(17u32); // Protocol (UDP = 17)
    sum = sum.wrapping_add(udp_datagram.len() as u32);

    // UDP datagram
    sum = sum.wrapping_add(checksum_add(udp_datagram));

    let result = checksum_fold(sum);
    // UDP uses 0xFFFF for zero checksum
    if result == 0 { 0xFFFF } else { result }
}

/// SIMD-optimized checksum for ARM64 NEON.
///
/// Uses NEON intrinsics to process 16 bytes at a time, with correct handling
/// of network byte order (big-endian 16-bit words).
///
/// # Safety
///
/// This function uses `#[target_feature(enable = "neon")]` and requires NEON support.
/// On AArch64, NEON is always available as part of the architecture specification.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
pub unsafe fn checksum_simd_neon(data: &[u8]) -> u16 {
    use std::arch::aarch64::*;

    // SAFETY: All NEON intrinsics below are safe to call because:
    // 1. We have #[target_feature(enable = "neon")] ensuring NEON is available
    // 2. Pointer passed to vld1q_u8 is valid (from slice with length >= 16)
    unsafe {
        let mut sum = vdupq_n_u32(0);
        let chunks = data.chunks_exact(16);
        let remainder = chunks.remainder();

        for chunk in chunks {
            // Load 16 bytes from memory
            let bytes = vld1q_u8(chunk.as_ptr());

            // Network byte order is big-endian. On little-endian ARM64, we need to
            // swap bytes within each 16-bit word to get the correct checksum value.
            // vrev16q_u8 swaps adjacent bytes: [0,1,2,3,...] -> [1,0,3,2,...]
            let swapped = vrev16q_u8(bytes);

            // Now interpret as 16-bit words (already in correct order for summation)
            let words = vreinterpretq_u16_u8(swapped);

            // Pairwise add and accumulate to 32-bit to avoid overflow
            // vpadalq_u16 adds adjacent pairs of u16 into u32 accumulators
            sum = vpadalq_u16(sum, words);
        }

        // Horizontal sum of the four 32-bit lanes
        let sum32 = vaddvq_u32(sum);

        // Process remainder bytes using scalar code
        let mut scalar_sum = sum32;
        let mut i = 0;
        while i + 1 < remainder.len() {
            // Read big-endian 16-bit word
            let word = u16::from_be_bytes([remainder[i], remainder[i + 1]]);
            scalar_sum = scalar_sum.wrapping_add(word as u32);
            i += 2;
        }

        // Handle odd byte (padded with zero on the right in network order)
        if i < remainder.len() {
            scalar_sum = scalar_sum.wrapping_add((remainder[i] as u32) << 8);
        }

        // Fold 32-bit sum into 16-bit checksum
        while scalar_sum > 0xFFFF {
            scalar_sum = (scalar_sum & 0xFFFF) + (scalar_sum >> 16);
        }

        !scalar_sum as u16
    }
}

/// SIMD-optimized checksum for ARM64 NEON (safe wrapper).
///
/// This is the public safe interface that calls the unsafe NEON implementation.
/// NEON is always available on AArch64 processors.
#[cfg(target_arch = "aarch64")]
#[inline]
pub fn checksum_simd(data: &[u8]) -> u16 {
    // SAFETY: NEON is mandatory on AArch64 architecture
    unsafe { checksum_simd_neon(data) }
}

/// SIMD-optimized checksum for x86_64 using SSSE3.
///
/// Uses SSSE3 intrinsics to process 16 bytes at a time, with correct handling
/// of network byte order (big-endian 16-bit words).
///
/// SSSE3 is required for the `pshufb` instruction used for byte swapping.
/// SSSE3 is available on all x86_64 CPUs since Intel Core 2 (2006) and
/// AMD Barcelona (2007), covering essentially all modern x86_64 systems.
///
/// # Safety
///
/// This function uses `#[target_feature(enable = "ssse3")]` and requires SSSE3 support.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn checksum_simd_ssse3(data: &[u8]) -> u16 {
    use std::arch::x86_64::*;

    // SAFETY: All SSE/SSSE3 intrinsics below are safe to call because:
    // 1. We have #[target_feature(enable = "ssse3")] ensuring SSSE3 is available
    // 2. Pointers passed to _mm_loadu_si128 are valid (from slice with length >= 16)
    unsafe {
        // Accumulator: two 64-bit sums (we'll combine them at the end).
        let mut sum_lo = _mm_setzero_si128();
        let mut sum_hi = _mm_setzero_si128();

        // Shuffle mask to swap bytes within 16-bit words for big-endian interpretation.
        // Network byte order is big-endian, x86 is little-endian.
        // This mask converts [0,1,2,3,4,5,...] to [1,0,3,2,5,4,...] (swap adjacent bytes).
        let swap_mask = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);

        let chunks = data.chunks_exact(16);
        let remainder = chunks.remainder();

        for chunk in chunks {
            // Load 16 bytes from memory (unaligned load).
            let bytes = _mm_loadu_si128(chunk.as_ptr().cast());

            // Swap bytes within each 16-bit word for big-endian interpretation.
            let swapped = _mm_shuffle_epi8(bytes, swap_mask);

            // Unpack low and high halves to 32-bit words and add to accumulators.
            // _mm_unpacklo_epi16 with zero unpacks low 4 u16s to low 4 u32s.
            // _mm_unpackhi_epi16 with zero unpacks high 4 u16s to high 4 u32s.
            let zero = _mm_setzero_si128();
            let words_lo = _mm_unpacklo_epi16(swapped, zero); // 4 x u32 (words 0-3)
            let words_hi = _mm_unpackhi_epi16(swapped, zero); // 4 x u32 (words 4-7)

            // Add to accumulators.
            sum_lo = _mm_add_epi32(sum_lo, words_lo);
            sum_hi = _mm_add_epi32(sum_hi, words_hi);
        }

        // Combine lo and hi accumulators.
        let sum = _mm_add_epi32(sum_lo, sum_hi);

        // Horizontal sum of the four 32-bit lanes.
        // _mm_hadd_epi32: [a,b,c,d] + [a,b,c,d] => [a+b,c+d,a+b,c+d]
        let hadd1 = _mm_hadd_epi32(sum, sum);
        let hadd2 = _mm_hadd_epi32(hadd1, hadd1);
        let sum32 = _mm_cvtsi128_si32(hadd2) as u32;

        // Process remainder bytes using scalar code.
        let mut scalar_sum = sum32;
        let mut i = 0;
        while i + 1 < remainder.len() {
            // Read big-endian 16-bit word.
            let word = u16::from_be_bytes([remainder[i], remainder[i + 1]]);
            scalar_sum = scalar_sum.wrapping_add(word as u32);
            i += 2;
        }

        // Handle odd byte (padded with zero on the right in network order).
        if i < remainder.len() {
            scalar_sum = scalar_sum.wrapping_add((remainder[i] as u32) << 8);
        }

        // Fold 32-bit sum into 16-bit checksum.
        while scalar_sum > 0xFFFF {
            scalar_sum = (scalar_sum & 0xFFFF) + (scalar_sum >> 16);
        }

        !scalar_sum as u16
    }
}

/// SIMD-optimized checksum for x86_64 (safe wrapper).
///
/// This function detects SSSE3 support at runtime and uses the optimized
/// SIMD implementation if available, otherwise falls back to scalar.
#[cfg(target_arch = "x86_64")]
#[inline]
pub fn checksum_simd(data: &[u8]) -> u16 {
    // Check for SSSE3 support at runtime.
    // On modern x86_64 CPUs, SSSE3 is virtually always available.
    if is_x86_feature_detected!("ssse3") {
        // SAFETY: We just verified SSSE3 is available.
        unsafe { checksum_simd_ssse3(data) }
    } else {
        // Fallback to scalar implementation for ancient CPUs.
        checksum(data)
    }
}

/// Fallback for non-SIMD architectures.
#[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
pub fn checksum_simd(data: &[u8]) -> u16 {
    checksum(data)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_checksum_basic() {
        // Test vector from RFC 1071
        let data = [0x00, 0x01, 0xF2, 0x03, 0xF4, 0xF5, 0xF6, 0xF7];
        let sum = checksum(&data);
        // Expected: ~(0x0001 + 0xF203 + 0xF4F5 + 0xF6F7) = ~0x2DDF0 = ~0xDDF2 = 0x220D
        assert_eq!(sum, 0x220D);
    }

    #[test]
    fn test_checksum_empty() {
        assert_eq!(checksum(&[]), 0xFFFF);
    }

    #[test]
    fn test_checksum_odd_length() {
        let data = [0x01, 0x02, 0x03];
        let sum = checksum(&data);
        // 0x0102 + 0x0300 = 0x0402, ~0x0402 = 0xFBFD
        assert_eq!(sum, 0xFBFD);
    }

    #[test]
    fn test_incremental_update() {
        // Original data
        let data = [0x00, 0x01, 0x02, 0x03];
        let original_checksum = checksum(&data);

        // Change 0x0001 to 0x0005
        let updated = incremental_checksum_update(original_checksum, 0x0001, 0x0005);

        // Verify by recalculating
        let new_data = [0x00, 0x05, 0x02, 0x03];
        let recalculated = checksum(&new_data);

        assert_eq!(updated, recalculated);
    }

    #[test]
    fn test_incremental_update_ip() {
        let data: [u8; 8] = [
            192, 168, 1, 100, // Old IP: 192.168.1.100
            192, 168, 1, 1, // Dest IP: 192.168.1.1
        ];
        let original_checksum = checksum(&data);

        let new_ip = [10u8, 0, 0, 100]; // New IP: 10.0.0.100
        let updated = update_checksum_for_ip(original_checksum, [192, 168, 1, 100], new_ip);

        // Verify
        let new_data: [u8; 8] = [10, 0, 0, 100, 192, 168, 1, 1];
        let recalculated = checksum(&new_data);

        assert_eq!(updated, recalculated);
    }

    #[test]
    fn test_ipv4_checksum() {
        // Sample IPv4 header (20 bytes)
        let header: [u8; 20] = [
            0x45, 0x00, // Version, IHL, ToS
            0x00, 0x3c, // Total Length
            0x1c, 0x46, // Identification
            0x40, 0x00, // Flags, Fragment Offset
            0x40, 0x06, // TTL, Protocol (TCP)
            0x00, 0x00, // Checksum (zeroed for calculation)
            0xac, 0x10, 0x0a, 0x63, // Source IP: 172.16.10.99
            0xac, 0x10, 0x0a, 0x0c, // Dest IP: 172.16.10.12
        ];

        let checksum = ipv4_header_checksum(&header);
        // This should be a valid checksum
        assert_ne!(checksum, 0);
    }

    #[test]
    fn test_nat_checksum_update() {
        // Simulate NAT: 192.168.1.100:12345 -> 10.0.0.1:54321
        let original_checksum: u16 = 0x1234; // Dummy original

        let updated = update_checksum_for_nat(
            original_checksum,
            [192, 168, 1, 100],
            12345,
            [10, 0, 0, 1],
            54321,
        );

        // Just verify it produces a valid result
        assert_ne!(updated, original_checksum);
    }

    #[test]
    fn test_checksum_simd() {
        let data: Vec<u8> = (0..100).collect();
        let scalar = checksum(&data);
        let simd = checksum_simd(&data);
        assert_eq!(scalar, simd);
    }
}