xocomil 0.3.0

A lightweight, zero-allocation HTTP/1.1 request parser and response writer
//! TCHAR validation — single source of truth for HTTP token character checks.
//!
//! Provides a [`TcharCheck`] trait with per-architecture SIMD implementations
//! using nibble-based parallel lookup (pshufb / vtbl / swizzle). Both
//! `scan.rs` and `validate.rs` consume this module instead of duplicating
//! TCHAR lookup logic.

/// Scalar TCHAR lookup table (RFC 7230 §3.2.6).
#[allow(clippy::redundant_pub_crate, clippy::cast_possible_truncation)]
pub(crate) static TABLE: [bool; 256] = {
    let mut t = [false; 256];
    let mut i = 0u16;
    while i < 256 {
        t[i as usize] = matches!(
            i as u8,
            b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+'
                | b'-' | b'.' | b'^' | b'_' | b'`' | b'|' | b'~'
                | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z'
        );
        i += 1;
    }
    t
};

/// Nibble lookup tables for SIMD TCHAR validation.
///
/// For each input byte, split into `lo = byte & 0x0F` and `hi = byte >> 4`.
/// `LO_NIBBLES[lo]` gives a bitmask of valid high-nibble values for that
/// low nibble. `HI_NIBBLES[hi]` gives the single bit for that high nibble.
/// `LO_NIBBLES[lo] & HI_NIBBLES[hi] != 0` iff the byte is a valid TCHAR.
///
/// Compiled in whenever a SIMD tier exists for the target (`x86_64`,
/// `aarch64`, or `wasm32` with `simd128`). On `x86_64` the tables are
/// present unconditionally: the SSSE3 `pshufb` path uses them when runtime
/// detection succeeds, regardless of whether the binary was built with
/// `-C target-feature=+ssse3`.
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
#[allow(clippy::redundant_pub_crate)]
pub(crate) const LO_NIBBLES: [u8; 16] = [
    0xE8, 0xFC, 0xF8, 0xFC, 0xFC, 0xFC, 0xFC, 0xFC, 0xF8, 0xF8, 0xF4, 0x54, 0xD0, 0x54, 0xF4, 0x70,
];
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
#[allow(clippy::redundant_pub_crate)]
pub(crate) const HI_NIBBLES: [u8; 16] = [
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
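
// Worked example of the nibble lookup (added illustration, checked at
// compile time): `b'A'` = 0x41 splits into lo = 1, hi = 4, and
// `LO_NIBBLES[1] & HI_NIBBLES[4]` = 0xFC & 0x10 = 0x10 != 0, so 'A' is a
// TCHAR; `b'"'` = 0x22 gives 0xF8 & 0x04 = 0, so '"' is not.
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
))]
const _: () = {
    assert!((LO_NIBBLES[0x41 & 0x0F] & HI_NIBBLES[0x41 >> 4]) != 0); // 'A' is a TCHAR
    assert!((LO_NIBBLES[0x22 & 0x0F] & HI_NIBBLES[0x22 >> 4]) == 0); // '"' is not
};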

/// Per-architecture TCHAR SIMD operations.
///
/// Each architecture implements [`mask16`](TcharCheck::mask16) (returns a
/// bitmask) and optionally overrides [`all16`](TcharCheck::all16) when a
/// boolean result is cheaper than a full bitmask (e.g. NEON `vminvq_u8`).
///
/// The provided [`is_valid_token`](TcharCheck::is_valid_token) method
/// handles 16-byte SIMD chunks + scalar tail, so call sites don't need
/// to implement the loop themselves.
#[allow(clippy::redundant_pub_crate)]
pub(crate) trait TcharCheck {
    /// Returns a 16-bit mask where bit `i` = 1 iff byte `i` is a valid TCHAR.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 16 bytes.
    unsafe fn mask16(ptr: *const u8) -> u32;

    /// Returns `true` if all 16 bytes at `ptr` are valid TCHARs.
    ///
    /// Default delegates to [`mask16`](Self::mask16); architectures without
    /// efficient bitmask extraction (e.g. NEON) should override.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 16 bytes.
    #[inline]
    unsafe fn all16(ptr: *const u8) -> bool {
        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe { Self::mask16(ptr) == 0xFFFF }
    }

    /// Validate an entire byte slice as a valid HTTP token.
    ///
    /// Uses SIMD for aligned 16-byte chunks, scalar [`TABLE`] for the tail.
    /// Returns `false` for empty slices.
    #[inline]
    fn is_valid_token(buf: &[u8]) -> bool {
        let len = buf.len();
        if len == 0 {
            return false;
        }
        let ptr = buf.as_ptr();
        let mut i = 0;
        while i + 16 <= len {
            // Safety: i + 16 <= len guarantees 16 readable bytes.
            if unsafe { !Self::all16(ptr.add(i)) } {
                return false;
            }
            i += 16;
        }
        while i < len {
            if !TABLE[buf[i] as usize] {
                return false;
            }
            i += 1;
        }
        true
    }
}

// ---------------------------------------------------------------------------
// x86-64 TCHAR validation tiers
// ---------------------------------------------------------------------------
//
// Three impls exist unconditionally on x86_64:
//
//   * `Avx2`     — 32-byte chunks, `vpshufb` (best; Intel since Haswell 2013, AMD since 2015)
//   * `Ssse3`    — 16-byte chunks, `pshufb` (good; Intel since 2006, AMD since 2011)
//   * `Sse2Only` — 16-byte chunks, scalar TCHAR table (fallback)
//
// `scan.rs` and `validate.rs` runtime-dispatch between them once per
// call via `has_avx2()` / `has_ssse3()` (std caches the cpuid result
// internally after the first invocation). The dispatch trampolines
// are `#[target_feature(...)]`-attributed so the SIMD intrinsics
// inline into the hot loop instead of paying a function-call cost
// per chunk.

#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Ssse3;

#[cfg(target_arch = "x86_64")]
impl TcharCheck for Ssse3 {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        // The dispatchers in `scan.rs` / `validate.rs` select `Ssse3` only
        // after `has_ssse3()` returns true, so `pshufb` is available here.
        unsafe { ssse3_mask16(ptr) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
#[inline]
#[allow(clippy::cast_sign_loss)]
unsafe fn ssse3_mask16(ptr: *const u8) -> u32 {
    use std::arch::x86_64::{
        _mm_and_si128, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8,
        _mm_setzero_si128, _mm_shuffle_epi8, _mm_srli_epi16,
    };

    // Safety: caller guarantees `ptr` is valid for 16 bytes;
    // `#[target_feature(enable = "ssse3")]` ensures pshufb is available.
    unsafe {
        let chunk = _mm_loadu_si128(ptr.cast());
        let lo_tbl = _mm_loadu_si128(LO_NIBBLES.as_ptr().cast());
        let hi_tbl = _mm_loadu_si128(HI_NIBBLES.as_ptr().cast());
        let nibble_mask = _mm_set1_epi8(0x0F);

        let lo_nib = _mm_and_si128(chunk, nibble_mask);
        let hi_nib = _mm_and_si128(_mm_srli_epi16(chunk, 4), nibble_mask);
        let lo_shuf = _mm_shuffle_epi8(lo_tbl, lo_nib);
        let hi_shuf = _mm_shuffle_epi8(hi_tbl, hi_nib);
        let valid = _mm_and_si128(lo_shuf, hi_shuf);

        // valid[i] != 0 → TCHAR. cmpeq with zero → 0xFF where invalid.
        let invalid = _mm_cmpeq_epi8(valid, _mm_setzero_si128());
        // Invert: bit i = 1 means byte i IS a TCHAR.
        (!_mm_movemask_epi8(invalid)) as u32 & 0xFFFF
    }
}

/// 32-byte AVX2 TCHAR validator using `vpshufb` nibble lookup.
///
/// Doubles the SIMD throughput of [`Ssse3`] by processing 32 bytes per
/// chunk. Available on mainstream Intel CPUs since Haswell (2013) and AMD
/// CPUs since Excavator (2015). The runtime dispatcher in `scan.rs` /
/// `validate.rs` picks this when `has_avx2()` returns true.
#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Avx2;

#[cfg(target_arch = "x86_64")]
impl Avx2 {
    /// Returns a 32-bit mask where bit `i` = 1 iff byte `i` is a valid TCHAR.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for reads of 32 bytes. The caller must verify
    /// AVX2 is available on the host CPU before calling.
    #[inline]
    pub(crate) unsafe fn mask32(ptr: *const u8) -> u32 {
        // Safety: caller guarantees `ptr` is valid for 32 bytes and
        // that the AVX2 trampoline has confirmed CPU support.
        unsafe { avx2_mask32(ptr) }
    }
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[inline]
#[allow(clippy::cast_sign_loss)]
unsafe fn avx2_mask32(ptr: *const u8) -> u32 {
    use std::arch::x86_64::{
        _mm_loadu_si128, _mm256_and_si256, _mm256_broadcastsi128_si256, _mm256_cmpeq_epi8,
        _mm256_loadu_si256, _mm256_movemask_epi8, _mm256_set1_epi8, _mm256_setzero_si256,
        _mm256_shuffle_epi8, _mm256_srli_epi16,
    };

    // Safety: caller guarantees `ptr` is valid for 32 bytes;
    // `#[target_feature(enable = "avx2")]` ensures vpshufb is available.
    //
    // `_mm256_shuffle_epi8` does per-128-bit-lane shuffling, so we
    // broadcast each 16-byte lookup table into both lanes of the
    // 32-byte vector. Each lane then independently does the same
    // nibble-table lookup that `Ssse3::mask16` does.
    unsafe {
        let chunk = _mm256_loadu_si256(ptr.cast());
        let lo_tbl = _mm256_broadcastsi128_si256(_mm_loadu_si128(LO_NIBBLES.as_ptr().cast()));
        let hi_tbl = _mm256_broadcastsi128_si256(_mm_loadu_si128(HI_NIBBLES.as_ptr().cast()));
        let nibble_mask = _mm256_set1_epi8(0x0F);

        let lo_nib = _mm256_and_si256(chunk, nibble_mask);
        let hi_nib = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), nibble_mask);
        let lo_shuf = _mm256_shuffle_epi8(lo_tbl, lo_nib);
        let hi_shuf = _mm256_shuffle_epi8(hi_tbl, hi_nib);
        let valid = _mm256_and_si256(lo_shuf, hi_shuf);

        let invalid = _mm256_cmpeq_epi8(valid, _mm256_setzero_si256());
        // Invert: bit i = 1 means byte i IS a TCHAR.
        !(_mm256_movemask_epi8(invalid) as u32)
    }
}

/// Scalar TCHAR mask for x86-64 CPUs without SSSE3 (pre-2006 Intel,
/// pre-2011 AMD).
#[cfg(target_arch = "x86_64")]
#[allow(clippy::redundant_pub_crate)]
pub(crate) struct Sse2Only;

#[cfg(target_arch = "x86_64")]
impl TcharCheck for Sse2Only {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }
}

/// One-shot SSSE3 feature detection. Std caches the underlying
/// `cpuid` call internally after the first invocation, so subsequent
/// calls are a single atomic load.
///
/// Callers should branch on this **once** at the entry of a scanner /
/// validator, then call into a `#[target_feature(enable = "ssse3")]`-
/// attributed wrapper that monomorphizes the SIMD path. Per-chunk
/// dispatch defeats the inlining of `pshufb` and is significantly
/// slower than the scalar fallback on hot loops.
#[cfg(target_arch = "x86_64")]
#[inline]
#[allow(clippy::redundant_pub_crate)]
pub(crate) fn has_ssse3() -> bool {
    // The compiler folds this to `true` under `-C target-feature=+ssse3`.
    cfg!(target_feature = "ssse3") || std::is_x86_feature_detected!("ssse3")
}

/// One-shot AVX2 feature detection. Same cache-once semantics as
/// [`has_ssse3`] — callers branch on this once at scanner entry.
///
/// AVX2 implies SSSE3, so callers can dispatch
/// `has_avx2() ? avx2 : has_ssse3() ? ssse3 : scalar` and never need
/// a four-way table.
#[cfg(target_arch = "x86_64")]
#[inline]
#[allow(clippy::redundant_pub_crate)]
pub(crate) fn has_avx2() -> bool {
    cfg!(target_feature = "avx2") || std::is_x86_feature_detected!("avx2")
}
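
// ---------------------------------------------------------------------------
// Dispatch sketch (illustrative)
// ---------------------------------------------------------------------------
//
// A minimal sketch of the branch-once dispatch pattern described in the tier
// comment above. The real entry points live in `scan.rs` / `validate.rs`;
// the function names and loop below are assumptions added for illustration,
// not the crate's actual scanners.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code, clippy::redundant_pub_crate)]
pub(crate) fn is_valid_token_dispatch(buf: &[u8]) -> bool {
    if has_avx2() {
        // Safety: `has_avx2()` just confirmed the CPU supports AVX2.
        unsafe { avx2_token_loop(buf) }
    } else if has_ssse3() {
        Ssse3::is_valid_token(buf)
    } else {
        Sse2Only::is_valid_token(buf)
    }
}

/// 32-byte AVX2 loop used by the sketch above, mirroring the trampoline
/// pattern described in the tier comment: branch once, then run every
/// 32-byte chunk through [`Avx2::mask32`] and the tail through [`TABLE`].
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
unsafe fn avx2_token_loop(buf: &[u8]) -> bool {
    let len = buf.len();
    if len == 0 {
        return false;
    }
    let ptr = buf.as_ptr();
    let mut i = 0;
    while i + 32 <= len {
        // Safety: `i + 32 <= len` guarantees 32 readable bytes, and the
        // dispatcher verified AVX2 support before calling this function.
        if unsafe { Avx2::mask32(ptr.add(i)) } != u32::MAX {
            return false;
        }
        i += 32;
    }
    // Scalar tail for the last < 32 bytes.
    buf[i..].iter().all(|&b| TABLE[b as usize])
}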

// ---------------------------------------------------------------------------
// aarch64: NEON vqtbl1q_u8
// ---------------------------------------------------------------------------

#[cfg(target_arch = "aarch64")]
pub(crate) struct Neon;

#[cfg(target_arch = "aarch64")]
impl TcharCheck for Neon {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        // Scalar fallback — NEON callers use all16 on the hot path.
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }

    #[inline]
    unsafe fn all16(ptr: *const u8) -> bool {
        use std::arch::aarch64::{
            vandq_u8, vdupq_n_u8, vld1q_u8, vminvq_u8, vqtbl1q_u8, vshrq_n_u8,
        };

        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe {
            let chunk = vld1q_u8(ptr);
            let lo_tbl = vld1q_u8(LO_NIBBLES.as_ptr());
            let hi_tbl = vld1q_u8(HI_NIBBLES.as_ptr());
            let nibble_mask = vdupq_n_u8(0x0F);

            let lo_nib = vandq_u8(chunk, nibble_mask);
            let hi_nib = vandq_u8(vshrq_n_u8(chunk, 4), nibble_mask);
            let lo_shuf = vqtbl1q_u8(lo_tbl, lo_nib);
            let hi_shuf = vqtbl1q_u8(hi_tbl, hi_nib);
            let valid = vandq_u8(lo_shuf, hi_shuf);

            // vminvq_u8 == 0 means at least one byte had valid == 0 → not TCHAR.
            vminvq_u8(valid) != 0
        }
    }
}

// ---------------------------------------------------------------------------
// wasm32 + simd128: u8x16_swizzle
// ---------------------------------------------------------------------------

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) struct WasmSimd;

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
impl TcharCheck for WasmSimd {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        use std::arch::wasm32::{
            u16x8_shr, u8x16_bitmask, u8x16_eq, u8x16_splat, u8x16_swizzle, v128_and, v128_load,
        };

        // Safety: caller guarantees `ptr` is valid for 16 bytes.
        unsafe {
            let chunk = v128_load(ptr.cast());
            let lo_tbl = v128_load(LO_NIBBLES.as_ptr().cast());
            let hi_tbl = v128_load(HI_NIBBLES.as_ptr().cast());
            let nibble_mask = u8x16_splat(0x0F);

            let lo_nib = v128_and(chunk, nibble_mask);
            let hi_nib = v128_and(u16x8_shr(chunk, 4), nibble_mask);
            let lo_shuf = u8x16_swizzle(lo_tbl, lo_nib);
            let hi_shuf = u8x16_swizzle(hi_tbl, hi_nib);
            let valid = v128_and(lo_shuf, hi_shuf);

            let invalid = u8x16_eq(valid, u8x16_splat(0));
            (!u8x16_bitmask(invalid)) as u32 & 0xFFFF
        }
    }
}

// ---------------------------------------------------------------------------
// Scalar fallback
// ---------------------------------------------------------------------------

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
pub(crate) struct Scalar;

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
impl TcharCheck for Scalar {
    #[inline]
    unsafe fn mask16(ptr: *const u8) -> u32 {
        let mut m = 0u32;
        for i in 0..16 {
            // Safety: caller guarantees `ptr` is valid for 16 bytes.
            if TABLE[unsafe { *ptr.add(i) } as usize] {
                m |= 1 << i;
            }
        }
        m
    }
}

// ---------------------------------------------------------------------------
// Dispatch alias — resolves to the best implementation for the target.
// ---------------------------------------------------------------------------

// On x86_64 there is no single `Arch` alias — the public scanners in
// `scan.rs` and `validate.rs` runtime-dispatch between `Avx2`, `Ssse3`, and
// `Sse2Only` once at entry via `has_avx2()` / `has_ssse3()`. Per-chunk dispatch
// defeats inlining of `pshufb` and ends up slower than the scalar
// fallback.

#[cfg(target_arch = "aarch64")]
pub(crate) type Arch = Neon;

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub(crate) type Arch = WasmSimd;

#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "aarch64",
    all(target_arch = "wasm32", target_feature = "simd128")
)))]
pub(crate) type Arch = Scalar;
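
// ---------------------------------------------------------------------------
// Tests (illustrative additions)
// ---------------------------------------------------------------------------
//
// A hedged test sketch, not part of the original module: cross-checks the
// SIMD nibble tables against the scalar [`TABLE`] and exercises the provided
// `is_valid_token` loop on an always-available implementation.
#[cfg(test)]
mod tests {
    use super::*;

    /// Every byte value agrees between the scalar table and the nibble tables.
    #[cfg(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        all(target_arch = "wasm32", target_feature = "simd128")
    ))]
    #[test]
    fn nibble_tables_match_scalar_table() {
        for b in 0u16..256 {
            let lo = (b & 0x0F) as usize;
            let hi = (b >> 4) as usize;
            let simd = (LO_NIBBLES[lo] & HI_NIBBLES[hi]) != 0;
            assert_eq!(simd, TABLE[b as usize], "mismatch at byte {:#04x}", b);
        }
    }

    /// `is_valid_token` accepts header names and rejects separators and
    /// empty input, regardless of which implementation runs the chunks.
    #[test]
    fn token_validation_basics() {
        // On x86_64 the scalar tier is always safe to exercise; elsewhere
        // use the compile-time `Arch` alias.
        #[cfg(target_arch = "x86_64")]
        type T = Sse2Only;
        #[cfg(not(target_arch = "x86_64"))]
        type T = Arch;

        assert!(T::is_valid_token(b"Content-Type"));
        assert!(T::is_valid_token(b"GET"));
        assert!(T::is_valid_token(b"x-very-long-header-name-over-16-bytes"));
        assert!(!T::is_valid_token(b""));
        assert!(!T::is_valid_token(b"Bad Header")); // space is not a TCHAR
        assert!(!T::is_valid_token(b"Host:")); // ':' is not a TCHAR
    }
}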