xocomil 0.3.0

A lightweight, zero-allocation HTTP/1.1 request parser and response writer
//! TCHAR validation — single source of truth for HTTP token character checks.
//!
//! Provides a [`TcharCheck`] trait with per-architecture SIMD implementations
//! using nibble-based parallel lookup (pshufb / vtbl / swizzle). Both
//! `scan.rs` and `validate.rs` consume this module instead of duplicating
//! TCHAR lookup logic.

/// Scalar TCHAR lookup table (RFC 7230 §3.2.6).
#[allow(clippy::redundant_pub_crate, clippy::cast_possible_truncation)]
pub(crate) static TABLE: [bool; 256] = {
    let mut t = [false; 256];
    let mut i = 0u16;
    while i < 256 {
        t[i as usize] = matches!(
            i as u8,
            b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+'
                | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~'
                | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
        );
        i += 1;
    }
    t
};

/// Nibble lookup tables for SIMD TCHAR validation.
///
/// For each input byte, split into `lo = byte & 0x0F` and `hi = byte >> 4`.
/// `LO_NIBBLES[lo]` gives a bitmask of valid high-nibble values for that
/// low nibble. `HI_NIBBLES[hi]` gives the single bit for that high nibble.
/// `LO_NIBBLES[lo] & HI_NIBBLES[hi] != 0` iff the byte is a valid TCHAR.
///
/// Compiled in whenever a SIMD tier exists for the target (`x86_64`,
/// `aarch64`, or `wasm32` with `simd128`). On `x86_64` the tables are
/// present unconditionally: the SSSE3 `pshufb` path uses them when runtime
/// detection succeeds, regardless of whether the binary was built with
/// `-C target-feature=+ssse3`.
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
#[allow(clippy::redundant_pub_crate)]
pub(crate) const LO_NIBBLES: [u8; 16] = [
    0xE8, 0xFC, 0xF8, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xF8, 0xF8, 0xF4, 0x54, 0xD0, 0x54, 0xF4, 0x70,
];
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
#[allow(clippy::redundant_pub_crate)]
pub(crate) const HI_NIBBLES: [u8; 16] = [
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
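
// Worked example of the nibble lookup (added illustration, checked at
// compile time): `b'A'` = 0x41 splits into lo = 1, hi = 4, and
// `LO_NIBBLES[1] & HI_NIBBLES[4]` = 0xFC & 0x10 = 0x10 != 0, so 'A' is a
// TCHAR; `b'"'` = 0x22 gives 0xF8 & 0x04 = 0, so '"' is not.
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
const _: () = {
    assert!((LO_NIBBLES[0x41 & 0x0F] & HI_NIBBLES[0x41 >> 4]) != 0); // 'A' is a TCHAR
    assert!((LO_NIBBLES[0x22 & 0x0F] & HI_NIBBLES[0x22 >> 4]) == 0); // '"' is not
};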

/// Per-architecture TCHAR SIMD operations.
///
/// Each architecture implements [`mask16`](TcharCheck::mask16) (returns a
/// bitmask) and optionally overrides [`all16`](TcharCheck::all16) when a
/// boolean result is cheaper than a full bitmask (e.g. NEON `vminvq_u8`).
///
/// The provided [`is_valid_token`](TcharCheck::is_valid_token) method
/// handles 16-byte SIMD chunks + scalar tail, so call sites don't need
/// to implement the loop themselves.
#[allow(clippy::redundant_pub_crate)]
pub(crate) trait TcharCheck {
    /// Returns a 16-bit mask where bit `i` = 1 iff byte `i` is a valid TCHAR.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 16 bytes.
    unsafe fn mask16(ptr: *const u8) -> u32;

    /// Returns `true` if all 16 bytes at `ptr` are valid TCHARs.
    ///
    /// Default delegates to [`mask16`](Self::mask16); architectures without
    /// efficient bitmask extraction (e.g. NEON) should override.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 16 bytes.
    #[inline]
    unsafe fn all16(ptr: *const u8) -> bool {
        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe { Self::mask16(ptr) == 0xFFFF }
    }

    /// Validate an entire byte slice as a valid HTTP token.
    ///
    /// Uses SIMD for aligned 16-byte chunks, scalar [`TABLE`] for the tail.
    /// Returns `false` for empty slices.
    #[inline]
    fn is_valid_token(buf: &[u8]) -> bool {
        let len = buf.len();
        if len == 0 {
            return false;
        }
        let ptr = buf.as_ptr();
        let mut i = 0;
        while i + 16 <= len {
            // Safety: i + 16 <= len guarantees 16 readable bytes.
            if unsafe { !Self::all16(ptr.add(i)) } {
                return false;
            }
            i += 16;
        }
        while i < len {
            if !TABLE[buf[i] as usize] {
                return false;
            }
            i += 1;
        }
        true
    }
}

// ---------------------------------------------------------------------------
// x86-64 TCHAR validation tiers
// ---------------------------------------------------------------------------
//
// Three impls exist unconditionally on x86_64:
//
//   * `Avx2`     — 32-byte chunks, `vpshufb` (best; Intel since Haswell 2013, AMD since 2015)
//   * `Ssse3`    — 16-byte chunks, `pshufb` (good; Intel since 2006, AMD since 2011)
//   * `Sse2Only` — 16-byte chunks, scalar TCHAR table (fallback)
//
// `scan.rs` and `validate.rs` runtime-dispatch between them once per
// call via `has_avx2()` / `has_ssse3()` (std caches the cpuid result
// internally after the first invocation). The dispatch trampolines
// are `#[target_feature(...)]`-attributed so the SIMD intrinsics
// inline into the hot loop instead of paying a function-call cost
// per chunk.

#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Ssse3;

#[cfg(target_arch = "x86_64")]
impl TcharCheck for Ssse3 {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        // The dispatchers in `scan.rs` / `validate.rs` select `Ssse3` only
        // after `has_ssse3()` returns true, so `pshufb` is available here.
        unsafe { ssse3_mask16(ptr) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
#[inline]
#[allow(clippy::cast_sign_loss)]
unsafe fn ssse3_mask16(ptr: *const u8) -> u32 {
    use std::arch::x86_64::{
        _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8,
        _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16,
    };

    // Safety: caller guarantees `ptr` is valid for 16 bytes;
    // `#[target_feature(enable = "ssse3")]` ensures pshufb is available.
    unsafe {
        let chunk = _mm_loadu_si128(ptr.cast());
        let lo_tbl = _mm_loadu_si128(LO_NIBBLES.as_ptr().cast());
        let hi_tbl = _mm_loadu_si128(HI_NIBBLES.as_ptr().cast());
        let nibble_mask = _mm_set1_epi8(0x0F);

        let lo_nib = _mm_and_si128(chunk, nibble_mask);
        let hi_nib = _mm_and_si128(_mm_srli_epi16(chunk, 4), nibble_mask);
        let lo_shuf = _mm_shuffle_epi8(lo_tbl, lo_nib);
        let hi_shuf = _mm_shuffle_epi8(hi_tbl, hi_nib);
        let valid = _mm_and_si128(lo_shuf, hi_shuf);

        // valid[i] != 0 → TCHAR. cmpeq with zero → 0xFF where invalid.
        let invalid = _mm_cmpeq_epi8(valid, _mm_setzero_si128());
        // Invert: bit i = 1 means byte i IS a TCHAR.
        (!_mm_movemask_epi8(invalid)) as u32 & 0xFFFF
    }
}

/// 32-byte AVX2 TCHAR validator using `vpshufb` nibble lookup.
///
/// Doubles the SIMD throughput of [`Ssse3`] by processing 32 bytes per
/// chunk. Available on mainstream Intel CPUs since Haswell (2013) and AMD
/// CPUs since Excavator (2015). The runtime dispatcher in `scan.rs` /
/// `validate.rs` picks this when `has_avx2()` returns true.
#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Avx2;

#[cfg(target_arch = "x86_64")]
impl Avx2 {
    /// Returns a 32-bit mask where bit `i` = 1 iff byte `i` is a valid TCHAR.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 32 bytes. The caller must verify
    /// AVX2 is available on the host CPU before calling.
    #[inline]
    pub(crate) unsafe fn mask32(ptr: *const u8) -> u32 {
        // Safety: caller guarantees `ptr` is valid for 32 bytes and
        // that the AVX2 trampoline has confirmed CPU support.
        unsafe { avx2_mask32(ptr) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
#[allow(clippy::cast_sign_loss)]
unsafe fn avx2_mask32(ptr: *const u8) -> u32 {
    use std::arch::x86_64::{
        _mm_loadu_si128, _mm256_and_si256, _mm256_broadcastsi128_si256, _mm256_cmpeq_epi8,
        _mm256_loadu_si256, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_setzero_si256,
        _mm256_shuffle_epi8, _mm256_srli_epi16,
    };

    // Safety: caller guarantees `ptr` is valid for 32 bytes;
    // `#[target_feature(enable = "avx2")]` ensures vpshufb is available.
    //
    // `_mm256_shuffle_epi8` does per-128-bit-lane shuffling, so we
    // broadcast each 16-byte lookup table into both lanes of the
    // 32-byte vector. Each lane then independently does the same
    // nibble-table lookup that `Ssse3::mask16` does.
    unsafe {
        let chunk = _mm256_loadu_si256(ptr.cast());
        let lo_tbl = _mm256_broadcastsi128_si256(_mm_loadu_si128(LO_NIBBLES.as_ptr().cast()));
        let hi_tbl = _mm256_broadcastsi128_si256(_mm_loadu_si128(HI_NIBBLES.as_ptr().cast()));
        let nibble_mask = _mm256_set1_epi8(0x0F);

        let lo_nib = _mm256_and_si256(chunk, nibble_mask);
        let hi_nib = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), nibble_mask);
        let lo_shuf = _mm256_shuffle_epi8(lo_tbl, lo_nib);
        let hi_shuf = _mm256_shuffle_epi8(hi_tbl, hi_nib);
        let valid = _mm256_and_si256(lo_shuf, hi_shuf);

        let invalid = _mm256_cmpeq_epi8(valid, _mm256_setzero_si256());
        // Invert: bit i = 1 means byte i IS a TCHAR.
        !(_mm256_movemask_epi8(invalid) as u32)
    }
}

/// Scalar TCHAR mask for x86-64 CPUs without SSSE3 (pre-2006 Intel,
/// pre-2011 AMD).
#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Sse2Only;

#[cfg(target_arch = "x86_64")]
impl TcharCheck for Sse2Only {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }
}

/// One-shot SSSE3 feature detection. Std caches the underlying
/// `cpuid` call internally after the first invocation, so subsequent
/// calls are a single atomic load.
///
/// Callers should branch on this **once** at the entry of a scanner /
/// validator, then call into a `#[target_feature(enable = "ssse3")]`-
/// attributed wrapper that monomorphizes the SIMD path. Per-chunk
/// dispatch defeats the inlining of `pshufb` and is significantly
/// slower than the scalar fallback on hot loops.
#[cfg(target_arch = "x86_64")]
#[inline]
#[allow(clippy::redundant_pub_crate)]
pub(crate) fn has_ssse3() -> bool {
    // The compiler folds this to `true` under `-C target-feature=+ssse3`.
    cfg!(target_feature = "ssse3") || std::is_x86_feature_detected!("ssse3")
}

/// One-shot AVX2 feature detection. Same cache-once semantics as
/// [`has_ssse3`] — callers branch on this once at scanner entry.
///
/// AVX2 implies SSSE3, so callers can dispatch
/// `has_avx2() ? avx2 : has_ssse3() ? ssse3 : scalar` and never need
/// a four-way table.
#[cfg(target_arch = "x86_64")]
#[inline]
#[allow(clippy::redundant_pub_crate)]
pub(crate) fn has_avx2() -> bool {
    cfg!(target_feature = "avx2") || std::is_x86_feature_detected!("avx2")
}
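
// ---------------------------------------------------------------------------
// Dispatch sketch (illustrative)
// ---------------------------------------------------------------------------
//
// A minimal sketch of the branch-once dispatch pattern described in the tier
// comment above. The real entry points live in `scan.rs` / `validate.rs`;
// the function names and loop below are assumptions added for illustration,
// not the crate's actual scanners.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code, clippy::redundant_pub_crate)]
pub(crate) fn is_valid_token_dispatch(buf: &[u8]) -> bool {
    if has_avx2() {
        // Safety: `has_avx2()` just confirmed the CPU supports AVX2.
        unsafe { avx2_token_loop(buf) }
    } else if has_ssse3() {
        Ssse3::is_valid_token(buf)
    } else {
        Sse2Only::is_valid_token(buf)
    }
}

/// 32-byte AVX2 loop used by the sketch above, mirroring the trampoline
/// pattern described in the tier comment: branch once, then run every
/// 32-byte chunk through [`Avx2::mask32`] and the tail through [`TABLE`].
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
unsafe fn avx2_token_loop(buf: &[u8]) -> bool {
    let len = buf.len();
    if len == 0 {
        return false;
    }
    let ptr = buf.as_ptr();
    let mut i = 0;
    while i + 32 <= len {
        // Safety: `i + 32 <= len` guarantees 32 readable bytes, and the
        // dispatcher verified AVX2 support before calling this function.
        if unsafe { Avx2::mask32(ptr.add(i)) } != u32::MAX {
            return false;
        }
        i += 32;
    }
    // Scalar tail for the last < 32 bytes.
    buf[i..].iter().all(|&b| TABLE[b as usize])
}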

// ---------------------------------------------------------------------------
// aarch64: NEON vqtbl1q_u8
// ---------------------------------------------------------------------------

#[cfg(target_arch = "aarch64")]
pub(crate) struct Neon;

#[cfg(target_arch = "aarch64")]
impl TcharCheck for Neon {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        // Scalar fallback — NEON callers use all16 on the hot path.
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }

    #[inline]
    unsafe fn all16(ptr: *const u8) -> bool {
        use std::arch::aarch64::{
            vandq_u8, vdupq_n_u8, vld1q_u8, vminvq_u8, vqtbl1q_u8, vshrq_n_u8,
        };

        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe {
            let chunk = vld1q_u8(ptr);
            let lo_tbl = vld1q_u8(LO_NIBBLES.as_ptr());
            let hi_tbl = vld1q_u8(HI_NIBBLES.as_ptr());
            let nibble_mask = vdupq_n_u8(0x0F);

            let lo_nib = vandq_u8(chunk, nibble_mask);
            let hi_nib = vandq_u8(vshrq_n_u8(chunk, 4), nibble_mask);
            let lo_shuf = vqtbl1q_u8(lo_tbl, lo_nib);
            let hi_shuf = vqtbl1q_u8(hi_tbl, hi_nib);
            let valid = vandq_u8(lo_shuf, hi_shuf);

            // vminvq_u8 == 0 means at least one byte had valid == 0 → not TCHAR.
            vminvq_u8(valid) != 0
        }
    }
}

// ---------------------------------------------------------------------------
// wasm32 + simd128: u8x16_swizzle
// ---------------------------------------------------------------------------

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) struct WasmSimd;

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
impl TcharCheck for WasmSimd {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        use std::arch::wasm32::{
            u16x8_shr, u8x16_bitmask, u8x16_eq, u8x16_splat, u8x16_swizzle, v128_and, v128_load,
        };

        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe {
            let chunk = v128_load(ptr.cast());
            let lo_tbl = v128_load(LO_NIBBLES.as_ptr().cast());
            let hi_tbl = v128_load(HI_NIBBLES.as_ptr().cast());
            let nibble_mask = u8x16_splat(0x0F);

            let lo_nib = v128_and(chunk, nibble_mask);
            let hi_nib = v128_and(u16x8_shr(chunk, 4), nibble_mask);
            let lo_shuf = u8x16_swizzle(lo_tbl, lo_nib);
            let hi_shuf = u8x16_swizzle(hi_tbl, hi_nib);
            let valid = v128_and(lo_shuf, hi_shuf);

            let invalid = u8x16_eq(valid, u8x16_splat(0));
            (!u8x16_bitmask(invalid)) as u32 & 0xFFFF
        }
    }
}

// ---------------------------------------------------------------------------
// Scalar fallback
// ---------------------------------------------------------------------------

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
pub(crate) struct Scalar;

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
impl TcharCheck for Scalar {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }
}

// ---------------------------------------------------------------------------
// Dispatch alias — resolves to the best implementation for the target.
// ---------------------------------------------------------------------------

// On x86_64 there is no single `Arch` alias — the public scanners in
// `scan.rs` and `validate.rs` runtime-dispatch between `Avx2`, `Ssse3`, and
// `Sse2Only` once at entry via `has_avx2()` / `has_ssse3()`. Per-chunk dispatch
// defeats inlining of `pshufb` and ends up slower than the scalar
// fallback.

#[cfg(target_arch = "aarch64")]
pub(crate) type Arch = Neon;

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) type Arch = WasmSimd;

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
pub(crate) type Arch = Scalar;
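
// ---------------------------------------------------------------------------
// Tests (illustrative additions)
// ---------------------------------------------------------------------------
//
// A hedged test sketch, not part of the original module: cross-checks the
// SIMD nibble tables against the scalar [`TABLE`] and exercises the provided
// `is_valid_token` loop on an always-available implementation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Every byte value agrees between the scalar table and the nibble tables.
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        all(target_arch = "wasm32", target_feature = "simd128")
    ))]
    #[test]
    fn nibble_tables_match_scalar_table() {
        for b in 0u16..256 {
            let lo = (b & 0x0F) as usize;
            let hi = (b >> 4) as usize;
            let simd = (LO_NIBBLES[lo] & HI_NIBBLES[hi]) != 0;
            assert_eq!(simd, TABLE[b as usize], "mismatch at byte {:#04x}", b);
        }
    }

    /// `is_valid_token` accepts header names and rejects separators and
    /// empty input, regardless of which implementation runs the chunks.
    #[test]
    fn token_validation_basics() {
        // On x86_64 the scalar tier is always safe to exercise; elsewhere
        // use the compile-time `Arch` alias.
        #[cfg(target_arch = "x86_64")]
        type T = Sse2Only;
        #[cfg(not(target_arch = "x86_64"))]
        type T = Arch;

        assert!(T::is_valid_token(b"Content-Type"));
        assert!(T::is_valid_token(b"GET"));
        assert!(T::is_valid_token(b"x-very-long-header-name-over-16-bytes"));
        assert!(!T::is_valid_token(b""));
        assert!(!T::is_valid_token(b"Bad Header")); // space is not a TCHAR
        assert!(!T::is_valid_token(b"Host:")); // ':' is not a TCHAR
    }
}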