// fhp_tokenizer — lib.rs
//! SIMD-accelerated HTML tokenizer.
//!
//! Uses a two-stage pipeline inspired by simdjson:
//!
//! 1. **Structural indexing** (SIMD): scan input in 64-byte blocks, produce
//!    per-delimiter bitmasks, then apply quote-aware masking.
//! 2. **Token extraction** (scalar): walk the structural index to emit tokens
//!    via a branchless state machine.
//!
//! # Quick Start
//!
//! ```
//! use fhp_tokenizer::tokenize;
//!
//! let tokens = tokenize("<div>hello</div>");
//! assert!(tokens.len() >= 3);
//! ```

/// Entity decoding with SIMD fast-path.
pub mod entity;
/// Token extraction — stage 2 (scalar state machine).
pub mod extract;
/// Branchless state machine for token extraction.
pub mod state_machine;
/// Streaming (chunk-based) tokenizer — see [`crate::streaming::StreamTokenizer`].
pub mod streaming;
/// Structural character indexer — SIMD-powered bitmask generation (stage 1).
pub mod structural;
/// Token types emitted by the tokenizer.
pub mod token;

// Crate-local re-imports used by the top-level convenience functions below.
use extract::extract_tokens;
use structural::StructuralIndexer;
use token::Token;

// Cross-crate dependencies: tag interning and SIMD runtime dispatch.
use fhp_core::tag::Tag;
use fhp_simd::AllMasks;
use fhp_simd::dispatch::ops;
39
/// Trait for receiving parsed HTML events directly, bypassing Token allocation.
///
/// Implementors receive raw parse events (open/close tags, text, etc.) and
/// can build their output structure in a single pass without intermediate
/// `Token` enum or `Vec<Attribute>` allocations.
///
/// Used by [`tokenize_into`]; methods are invoked in document order as the
/// input is scanned left to right.
pub trait TreeSink {
    /// An opening tag was encountered.
    ///
    /// `attr_raw` is the raw attribute region between the tag name and `>`.
    /// The implementor is responsible for parsing attributes from this slice.
    /// `self_closing` reports whether the tag closed itself (e.g. `<br/>`).
    fn open_tag(&mut self, tag: Tag, name: &str, attr_raw: &str, self_closing: bool);

    /// A closing tag was encountered.
    fn close_tag(&mut self, tag: Tag, name: &str);

    /// Raw text content (not entity-decoded).
    ///
    /// The implementor is responsible for entity decoding if desired.
    fn text(&mut self, content: &str);

    /// A comment was encountered.
    fn comment(&mut self, content: &str);

    /// A DOCTYPE declaration was encountered.
    fn doctype(&mut self, content: &str);

    /// A CDATA section was encountered.
    fn cdata(&mut self, content: &str);
}

70/// Tokenize an HTML string into a sequence of tokens.
71///
72/// Convenience wrapper that runs both pipeline stages:
73/// 1. SIMD structural indexing
74/// 2. State-machine token extraction
75///
76/// # Example
77///
78/// ```
79/// use fhp_tokenizer::tokenize;
80/// use fhp_tokenizer::token::Token;
81///
82/// let tokens = tokenize("<p>Hello &amp; world</p>");
83///
84/// // Should contain OpenTag, Text, CloseTag
85/// assert!(tokens.iter().any(|t| matches!(t, Token::OpenTag { .. })));
86/// assert!(tokens.iter().any(|t| matches!(t, Token::Text { .. })));
87/// assert!(tokens.iter().any(|t| matches!(t, Token::CloseTag { .. })));
88/// ```
89pub fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
90    let indexer = StructuralIndexer::new();
91    let index = indexer.index(input.as_bytes());
92    extract_tokens(input, &index)
93}
94
/// Size of each processing block in bytes.
///
/// Fixed at 64 so each block's delimiter positions fit in exactly one `u64`
/// bitmask (one bit per input byte); the pre-masking logic below relies on
/// `valid_bits <= 64`.
const BLOCK_SIZE: usize = 64;

98/// Tokenize HTML and feed each token to a callback — zero intermediate allocation.
99///
100/// Fuses SIMD structural indexing, quote-aware masking, and token extraction
101/// into a single streaming pass. No `Vec<BlockBitmaps>` or `Vec<Token>` is
102/// allocated.
103///
104/// # Example
105///
106/// ```
107/// use fhp_tokenizer::tokenize_with;
108/// use fhp_tokenizer::token::Token;
109///
110/// let mut count = 0;
111/// tokenize_with("<div>hello</div>", |_token| { count += 1; });
112/// assert!(count >= 3); // OpenTag, Text, CloseTag
113/// ```
114pub fn tokenize_with<'a>(input: &'a str, mut on_token: impl FnMut(Token<'a>)) {
115    let bytes = input.as_bytes();
116    let len = bytes.len();
117
118    if len == 0 {
119        return;
120    }
121
122    let dispatch = ops();
123    let compute = dispatch.compute_all_masks;
124    let mut parser = extract::Parser::new(input);
125
126    // Quote-aware masking carry bits (streaming across blocks).
127    let mut carry_dq = false;
128    let mut carry_sq = false;
129
130    let mut block_start = 0;
131
132    while block_start < len {
133        let block_end = (block_start + BLOCK_SIZE).min(len);
134        let chunk = &bytes[block_start..block_end];
135
136        // Step 1: Compute all 4 bitmasks in a single SIMD pass.
137        // SAFETY: dispatch function pointer matches detected CPU features.
138        let mut masks: AllMasks = unsafe { compute(chunk) };
139
140        // Step 2: Apply quote-aware string masking (inline, carry-based).
141        let (in_string, new_carry_dq, new_carry_sq) =
142            compute_string_mask_inline(&mut masks, carry_dq, carry_sq);
143        carry_dq = new_carry_dq;
144        carry_sq = new_carry_sq;
145
146        // Step 3: Only iterate `<` and `>` — the only delimiters that
147        // trigger state transitions. Quote masking already ran above to
148        // filter out `<`/`>` inside attribute strings.
149        let _ = in_string; // masking already applied to individual masks
150        let combined = masks.lt | masks.gt;
151
152        // Pre-mask: clear bits beyond the valid range for this chunk,
153        // eliminating the per-iteration bounds check inside the loop.
154        let valid_bits = block_end - block_start;
155        let mut bits = if valid_bits < 64 {
156            combined & ((1u64 << valid_bits) - 1)
157        } else {
158            combined
159        };
160        while bits != 0 {
161            let bit_pos = bits.trailing_zeros() as usize;
162            bits &= bits - 1; // clear lowest set bit
163
164            let abs_pos = block_start + bit_pos;
165            let byte = bytes[abs_pos];
166            parser.on_delimiter_cb(abs_pos, byte, &mut on_token);
167        }
168
169        block_start = block_end;
170    }
171
172    // Flush trailing text.
173    parser.flush_trailing_cb(&mut on_token);
174}
175
176/// Tokenize HTML directly into a [`TreeSink`], bypassing all intermediate allocations.
177///
178/// Fuses SIMD structural indexing, quote-aware masking, and token extraction
179/// into a single streaming pass. Instead of constructing `Token` enums, calls
180/// sink methods directly — eliminating `Vec<Attribute>`, `Cow` wrappers, and
181/// entity-decode `String` allocations in the hot loop.
182///
183/// # Example
184///
185/// ```
186/// use fhp_tokenizer::{TreeSink, tokenize_into};
187/// use fhp_core::tag::Tag;
188///
189/// struct Counter { tags: usize, texts: usize }
190/// impl TreeSink for Counter {
191///     fn open_tag(&mut self, _: Tag, _: &str, _: &str, _: bool) { self.tags += 1; }
192///     fn close_tag(&mut self, _: Tag, _: &str) {}
193///     fn text(&mut self, _: &str) { self.texts += 1; }
194///     fn comment(&mut self, _: &str) {}
195///     fn doctype(&mut self, _: &str) {}
196///     fn cdata(&mut self, _: &str) {}
197/// }
198///
199/// let mut c = Counter { tags: 0, texts: 0 };
200/// tokenize_into("<div>hello</div>", &mut c);
201/// assert_eq!(c.tags, 1);
202/// assert_eq!(c.texts, 1);
203/// ```
204pub fn tokenize_into<S: TreeSink>(input: &str, sink: &mut S) {
205    let bytes = input.as_bytes();
206    let len = bytes.len();
207
208    if len == 0 {
209        return;
210    }
211
212    let dispatch = ops();
213    let compute = dispatch.compute_all_masks;
214    let mut parser = extract::Parser::new(input);
215
216    // Quote-aware masking carry bits (streaming across blocks).
217    let mut carry_dq = false;
218    let mut carry_sq = false;
219
220    let mut block_start = 0;
221
222    while block_start < len {
223        let block_end = (block_start + BLOCK_SIZE).min(len);
224        let chunk = &bytes[block_start..block_end];
225
226        // Step 1: Compute all 4 bitmasks in a single SIMD pass.
227        // SAFETY: dispatch function pointer matches detected CPU features.
228        let mut masks: AllMasks = unsafe { compute(chunk) };
229
230        // Step 2: Apply quote-aware string masking (inline, carry-based).
231        let (in_string, new_carry_dq, new_carry_sq) =
232            compute_string_mask_inline(&mut masks, carry_dq, carry_sq);
233        carry_dq = new_carry_dq;
234        carry_sq = new_carry_sq;
235
236        // Step 3: Only iterate `<` and `>` — the only delimiters that
237        // trigger state transitions. Quote masking already ran above to
238        // filter out `<`/`>` inside attribute strings.
239        let _ = in_string; // masking already applied to individual masks
240        let combined = masks.lt | masks.gt;
241
242        // Pre-mask: clear bits beyond the valid range for this chunk,
243        // eliminating the per-iteration bounds check inside the loop.
244        let valid_bits = block_end - block_start;
245        let mut bits = if valid_bits < 64 {
246            combined & ((1u64 << valid_bits) - 1)
247        } else {
248            combined
249        };
250        while bits != 0 {
251            let bit_pos = bits.trailing_zeros() as usize;
252            bits &= bits - 1; // clear lowest set bit
253
254            let abs_pos = block_start + bit_pos;
255            let byte = bytes[abs_pos];
256            parser.on_delimiter_sink(abs_pos, byte, sink);
257        }
258
259        block_start = block_end;
260    }
261
262    // Flush trailing text.
263    parser.flush_trailing_sink(sink);
264}
265
/// Compute the prefix XOR of all bits in a `u64`.
///
/// Bit `i` of the result is the XOR (parity) of input bits `0..=i`. Used to
/// turn a quote-position bitmask into a "between quotes" region mask.
#[inline]
fn prefix_xor(x: u64) -> u64 {
    // log2(64) doubling steps: after the step with shift `s`, every bit has
    // absorbed the parity of the 2*s bits at and below it.
    [1u32, 2, 4, 8, 16, 32]
        .iter()
        .fold(x, |acc, &shift| acc ^ (acc << shift))
}
/// Apply quote-aware string masking to a block's masks, returning the
/// combined in-string mask and updated carry bits.
///
/// `carry_dq` / `carry_sq` say whether the previous block ended inside a
/// double-/single-quoted region; the returned booleans propagate that state
/// to the next block. On return, `masks.lt` / `masks.gt` have every bit that
/// falls inside a quoted region cleared, and non-boundary quote bits are
/// cleared from `masks.apos` / `masks.quot`.
#[inline]
fn compute_string_mask_inline(
    masks: &mut AllMasks,
    carry_dq: bool,
    carry_sq: bool,
) -> (u64, bool, bool) {
    // Fast path: no quotes in this block and no carry — skip prefix_xor entirely.
    if masks.quot == 0 && masks.apos == 0 && !carry_dq && !carry_sq {
        return (0, false, false);
    }

    // Fast path: carry active but no quotes in block — entire block is inside string.
    // (Reachable only with a carry set: the branch above handled the
    // no-quote, no-carry case.)
    if masks.quot == 0 && masks.apos == 0 {
        let in_string = !0u64;
        masks.lt = 0;
        masks.gt = 0;
        return (in_string, carry_dq, carry_sq);
    }

    // Pass 1: double-quote regions. prefix_xor turns quote positions into a
    // region mask of bits between quote pairs; an active carry inverts it
    // because the block started already inside a string.
    let flipped_dq = prefix_xor(masks.quot);
    let dq_in = if carry_dq { !flipped_dq } else { flipped_dq };
    // Carry flips iff the block contains an odd number of double quotes.
    let new_carry_dq = carry_dq ^ ((masks.quot.count_ones() % 2) == 1);

    // Pass 2: single-quote regions (excluding positions inside dquot).
    let effective_apos = masks.apos & !dq_in;
    let flipped_sq = prefix_xor(effective_apos);
    let sq_in = if carry_sq { !flipped_sq } else { flipped_sq };
    let new_carry_sq = carry_sq ^ ((effective_apos.count_ones() % 2) == 1);

    // Combine both quote regions.
    let in_string = dq_in | sq_in;

    // Mask `<` and `>` inside strings — these are the only structural delimiters.
    masks.lt &= !in_string;
    masks.gt &= !in_string;

    // Mask non-boundary quotes (still needed for carry propagation).
    masks.apos &= !dq_in;
    masks.quot &= !sq_in;

    (in_string, new_carry_dq, new_carry_sq)
}

#[cfg(test)]
mod tokenize_with_tests {
    use super::*;
    use token::Token;

    /// Run `tokenize_with` and gather every emitted token into a Vec.
    fn collect_with(input: &str) -> Vec<Token<'_>> {
        let mut collected = Vec::new();
        tokenize_with(input, |tok| collected.push(tok));
        collected
    }

    /// Assert that the batch API (`tokenize`) and the streaming callback API
    /// (`tokenize_with`) emit the same token sequence for `html`.
    fn assert_equivalent(html: &str) {
        let batch = tokenize(html);
        let streamed = collect_with(html);
        assert_eq!(
            batch.len(),
            streamed.len(),
            "token count mismatch for {:?}",
            html
        );
        for (i, (a, b)) in batch.iter().zip(streamed.iter()).enumerate() {
            assert_eq!(
                std::mem::discriminant(a),
                std::mem::discriminant(b),
                "token type mismatch at index {i} for {:?}: {a:?} vs {b:?}",
                html
            );
        }
    }

    #[test]
    fn equiv_simple_div() {
        assert_equivalent("<div>hello</div>");
    }

    #[test]
    fn equiv_self_closing() {
        assert_equivalent("<br/>");
    }

    #[test]
    fn equiv_attributes() {
        assert_equivalent("<a href=\"url\" class=\"link\">text</a>");
    }

    #[test]
    fn equiv_comment() {
        assert_equivalent("<!-- hello -->");
    }

    #[test]
    fn equiv_doctype() {
        assert_equivalent("<!DOCTYPE html>");
    }

    #[test]
    fn equiv_script() {
        assert_equivalent("<script>var x = 1 < 2;</script>");
    }

    #[test]
    fn equiv_entities() {
        assert_equivalent("a &amp; b");
    }

    #[test]
    fn equiv_entity_in_attr() {
        assert_equivalent("<div title=\"a &amp; b\">x</div>");
    }

    #[test]
    fn equiv_empty() {
        assert_equivalent("");
    }

    #[test]
    fn equiv_text_only() {
        assert_equivalent("just plain text");
    }

    #[test]
    fn equiv_nested() {
        assert_equivalent("<div><span>text</span></div>");
    }

    #[test]
    fn equiv_quotes_spanning_blocks() {
        // Build input with attribute value spanning a 64-byte block boundary:
        // pad to byte 60, drop a `<` inside the quoted value, pad past 64.
        let mut input = String::from("<div data=\"");
        input.push_str(&"a".repeat(60 - input.len()));
        input.push('<'); // delimiter inside quotes
        input.push_str(&"b".repeat(80 - input.len()));
        input.push_str("\">end</div>");
        assert_equivalent(&input);
    }

    #[test]
    fn equiv_long_input() {
        // 100 paragraphs — exercises many full 64-byte blocks.
        let input: String = (0..100)
            .map(|i| format!("<p class=\"c{i}\">text {i}</p>"))
            .collect();
        assert_equivalent(&input);
    }

    #[test]
    fn equiv_mixed_content() {
        assert_equivalent(
            "<!DOCTYPE html><!-- comment --><div id=\"main\" class='header' disabled>\
             <p>Hello &amp; world</p><br/><script>if(a<b){}</script></div>",
        );
    }
}