Skip to main content

fhp_simd/
lib.rs

1/// Runtime SIMD feature detection and dispatch.
2pub mod dispatch;
3/// Portable scalar fallback — works on every platform.
4pub mod scalar;
5
6#[cfg(target_arch = "x86_64")]
7/// SSE4.2 accelerated operations (128-bit, x86_64).
8pub mod sse42;
9
10#[cfg(target_arch = "x86_64")]
11/// AVX2 accelerated operations (256-bit, x86_64).
12pub mod avx2;
13
14#[cfg(target_arch = "aarch64")]
15/// ARM NEON accelerated operations (128-bit, aarch64).
16pub mod neon;
17
/// Delimiter bytes searched for by [`find_delimiters`](dispatch::SimdOps).
///
/// These seven bytes carry structural meaning in HTML, listed in the order
/// they are stored: `<`, `>`, `&`, `"`, `'`, `=`, `/`.
pub const DELIMITERS: [u8; 7] = *b"<>&\"'=/";
23
/// Outcome of scanning a byte slice for any one of several delimiters.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum DelimiterResult {
    /// The scan matched a delimiter.
    Found {
        /// Offset of the match, in bytes, within the slice that was searched.
        pos: usize,
        /// Which delimiter byte was matched.
        byte: u8,
    },
    /// The slice contained no delimiter.
    NotFound,
}

impl DelimiterResult {
    /// Return this result with `offset` added to the match position.
    ///
    /// `NotFound` is passed through unchanged. This is used when a tail
    /// slice is handed to the scalar fallback, so that the position found
    /// in the tail can be shifted by the tail's starting offset.
    #[inline]
    pub fn offset_by(self, offset: usize) -> DelimiterResult {
        if let Self::Found { pos, byte } = self {
            return Self::Found {
                pos: pos + offset,
                byte,
            };
        }
        // Nothing was matched, so there is no position to shift.
        self
    }
}
52
/// Byte classification categories produced by [`classify_bytes`](dispatch::SimdOps).
///
/// Every category except `OTHER` occupies its own bit; `OTHER` is zero,
/// i.e. no category bit is set. The scalar `classify_byte` helper in this
/// file returns exactly one of these values per byte (for example `<` maps
/// to `DELIMITER`).
pub mod class {
    /// ASCII whitespace: space, tab, newline, carriage return.
    pub const WHITESPACE: u8 = 0x01;
    /// ASCII letters: `a-z` and `A-Z`.
    pub const ALPHA: u8 = 0x02;
    /// ASCII decimal digits: `0-9`.
    pub const DIGIT: u8 = 0x04;
    /// HTML structural delimiters: `<`, `>`, `&`, `"`, `'`, `=`, `/`.
    pub const DELIMITER: u8 = 0x08;
    /// Anything that falls in none of the categories above.
    pub const OTHER: u8 = 0x00;
}
69
/// Per-delimiter bitmasks covering a block of at most 64 bytes.
///
/// Filled in by [`compute_all_masks`](dispatch::SimdOps), which reads every
/// 16-byte chunk a single time and derives all masks in that one pass.
/// The struct holds only the four masks the fused tokenizer consumes.
#[derive(Debug, Default, Clone, Copy)]
pub struct AllMasks {
    /// Positions of `<`.
    pub lt: u64,
    /// Positions of `>`.
    pub gt: u64,
    /// Positions of `"`.
    pub quot: u64,
    /// Positions of `'`.
    pub apos: u64,
}
86
87/// Classify a single byte into one of the [`class`] categories.
88#[inline(always)]
89pub fn classify_byte(b: u8) -> u8 {
90    match b {
91        b' ' | b'\t' | b'\n' | b'\r' => class::WHITESPACE,
92        b'a'..=b'z' | b'A'..=b'Z' => class::ALPHA,
93        b'0'..=b'9' => class::DIGIT,
94        b'<' | b'>' | b'&' | b'"' | b'\'' | b'=' | b'/' => class::DELIMITER,
95        _ => class::OTHER,
96    }
97}