fhp_tokenizer/lib.rs
1//! SIMD-accelerated HTML tokenizer.
2//!
3//! Uses a two-stage pipeline inspired by simdjson:
4//!
5//! 1. **Structural indexing** (SIMD): scan input in 64-byte blocks, produce
6//! per-delimiter bitmasks, then apply quote-aware masking.
7//! 2. **Token extraction** (scalar): walk the structural index to emit tokens
8//! via a branchless state machine.
9//!
10//! # Quick Start
11//!
12//! ```
13//! use fhp_tokenizer::tokenize;
14//!
15//! let tokens = tokenize("<div>hello</div>");
16//! assert!(tokens.len() >= 3);
17//! ```
18
/// Entity decoding with SIMD fast-path.
pub mod entity;
/// Token extraction — stage 2 (scalar state machine).
pub mod extract;
/// Branchless state machine for token extraction.
pub mod state_machine;
/// Streaming (chunk-based) tokenizer — see [`crate::streaming::StreamTokenizer`].
pub mod streaming;
/// Structural character indexer — SIMD-powered bitmask generation (stage 1).
pub mod structural;
/// Token types emitted by the tokenizer.
pub mod token;
31
32use extract::extract_tokens;
33use structural::StructuralIndexer;
34use token::Token;
35
36use fhp_core::tag::Tag;
37use fhp_simd::AllMasks;
38use fhp_simd::dispatch::ops;
39
/// Trait for receiving parsed HTML events directly, bypassing Token allocation.
///
/// Implementors receive raw parse events (open/close tags, text, etc.) and
/// can build their output structure in a single pass without intermediate
/// `Token` enum or `Vec<Attribute>` allocations.
///
/// Used by [`tokenize_into`], which drives a sink straight from the fused
/// SIMD indexing + extraction loop.
pub trait TreeSink {
    /// An opening tag was encountered.
    ///
    /// `attr_raw` is the raw attribute region between the tag name and `>`.
    /// The implementor is responsible for parsing attributes from this slice.
    /// `self_closing` is true for `<br/>`-style tags.
    fn open_tag(&mut self, tag: Tag, name: &str, attr_raw: &str, self_closing: bool);

    /// A closing tag was encountered.
    fn close_tag(&mut self, tag: Tag, name: &str);

    /// Raw text content (not entity-decoded).
    ///
    /// The implementor is responsible for entity decoding if desired.
    fn text(&mut self, content: &str);

    /// A comment was encountered.
    fn comment(&mut self, content: &str);

    /// A DOCTYPE declaration was encountered.
    fn doctype(&mut self, content: &str);

    /// A CDATA section was encountered.
    fn cdata(&mut self, content: &str);
}
69
70/// Tokenize an HTML string into a sequence of tokens.
71///
72/// Convenience wrapper that runs both pipeline stages:
73/// 1. SIMD structural indexing
74/// 2. State-machine token extraction
75///
76/// # Example
77///
78/// ```
79/// use fhp_tokenizer::tokenize;
80/// use fhp_tokenizer::token::Token;
81///
82/// let tokens = tokenize("<p>Hello & world</p>");
83///
84/// // Should contain OpenTag, Text, CloseTag
85/// assert!(tokens.iter().any(|t| matches!(t, Token::OpenTag { .. })));
86/// assert!(tokens.iter().any(|t| matches!(t, Token::Text { .. })));
87/// assert!(tokens.iter().any(|t| matches!(t, Token::CloseTag { .. })));
88/// ```
89pub fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
90 let indexer = StructuralIndexer::new();
91 let index = indexer.index(input.as_bytes());
92 extract_tokens(input, &index)
93}
94
/// Size of each processing block in bytes.
///
/// 64 so that one block maps exactly onto one `u64` delimiter bitmask
/// (one bit per byte) produced by the SIMD stage.
const BLOCK_SIZE: usize = 64;
97
98/// Tokenize HTML and feed each token to a callback — zero intermediate allocation.
99///
100/// Fuses SIMD structural indexing, quote-aware masking, and token extraction
101/// into a single streaming pass. No `Vec<BlockBitmaps>` or `Vec<Token>` is
102/// allocated.
103///
104/// # Example
105///
106/// ```
107/// use fhp_tokenizer::tokenize_with;
108/// use fhp_tokenizer::token::Token;
109///
110/// let mut count = 0;
111/// tokenize_with("<div>hello</div>", |_token| { count += 1; });
112/// assert!(count >= 3); // OpenTag, Text, CloseTag
113/// ```
114pub fn tokenize_with<'a>(input: &'a str, mut on_token: impl FnMut(Token<'a>)) {
115 let bytes = input.as_bytes();
116 let len = bytes.len();
117
118 if len == 0 {
119 return;
120 }
121
122 let dispatch = ops();
123 let compute = dispatch.compute_all_masks;
124 let mut parser = extract::Parser::new(input);
125
126 // Quote-aware masking carry bits (streaming across blocks).
127 let mut carry_dq = false;
128 let mut carry_sq = false;
129
130 let mut block_start = 0;
131
132 while block_start < len {
133 let block_end = (block_start + BLOCK_SIZE).min(len);
134 let chunk = &bytes[block_start..block_end];
135
136 // Step 1: Compute all 4 bitmasks in a single SIMD pass.
137 // SAFETY: dispatch function pointer matches detected CPU features.
138 let mut masks: AllMasks = unsafe { compute(chunk) };
139
140 // Step 2: Apply quote-aware string masking (inline, carry-based).
141 let (in_string, new_carry_dq, new_carry_sq) =
142 compute_string_mask_inline(&mut masks, carry_dq, carry_sq);
143 carry_dq = new_carry_dq;
144 carry_sq = new_carry_sq;
145
146 // Step 3: Only iterate `<` and `>` — the only delimiters that
147 // trigger state transitions. Quote masking already ran above to
148 // filter out `<`/`>` inside attribute strings.
149 let _ = in_string; // masking already applied to individual masks
150 let combined = masks.lt | masks.gt;
151
152 // Pre-mask: clear bits beyond the valid range for this chunk,
153 // eliminating the per-iteration bounds check inside the loop.
154 let valid_bits = block_end - block_start;
155 let mut bits = if valid_bits < 64 {
156 combined & ((1u64 << valid_bits) - 1)
157 } else {
158 combined
159 };
160 while bits != 0 {
161 let bit_pos = bits.trailing_zeros() as usize;
162 bits &= bits - 1; // clear lowest set bit
163
164 let abs_pos = block_start + bit_pos;
165 let byte = bytes[abs_pos];
166 parser.on_delimiter_cb(abs_pos, byte, &mut on_token);
167 }
168
169 block_start = block_end;
170 }
171
172 // Flush trailing text.
173 parser.flush_trailing_cb(&mut on_token);
174}
175
176/// Tokenize HTML directly into a [`TreeSink`], bypassing all intermediate allocations.
177///
178/// Fuses SIMD structural indexing, quote-aware masking, and token extraction
179/// into a single streaming pass. Instead of constructing `Token` enums, calls
180/// sink methods directly — eliminating `Vec<Attribute>`, `Cow` wrappers, and
181/// entity-decode `String` allocations in the hot loop.
182///
183/// # Example
184///
185/// ```
186/// use fhp_tokenizer::{TreeSink, tokenize_into};
187/// use fhp_core::tag::Tag;
188///
189/// struct Counter { tags: usize, texts: usize }
190/// impl TreeSink for Counter {
191/// fn open_tag(&mut self, _: Tag, _: &str, _: &str, _: bool) { self.tags += 1; }
192/// fn close_tag(&mut self, _: Tag, _: &str) {}
193/// fn text(&mut self, _: &str) { self.texts += 1; }
194/// fn comment(&mut self, _: &str) {}
195/// fn doctype(&mut self, _: &str) {}
196/// fn cdata(&mut self, _: &str) {}
197/// }
198///
199/// let mut c = Counter { tags: 0, texts: 0 };
200/// tokenize_into("<div>hello</div>", &mut c);
201/// assert_eq!(c.tags, 1);
202/// assert_eq!(c.texts, 1);
203/// ```
204pub fn tokenize_into<S: TreeSink>(input: &str, sink: &mut S) {
205 let bytes = input.as_bytes();
206 let len = bytes.len();
207
208 if len == 0 {
209 return;
210 }
211
212 let dispatch = ops();
213 let compute = dispatch.compute_all_masks;
214 let mut parser = extract::Parser::new(input);
215
216 // Quote-aware masking carry bits (streaming across blocks).
217 let mut carry_dq = false;
218 let mut carry_sq = false;
219
220 let mut block_start = 0;
221
222 while block_start < len {
223 let block_end = (block_start + BLOCK_SIZE).min(len);
224 let chunk = &bytes[block_start..block_end];
225
226 // Step 1: Compute all 4 bitmasks in a single SIMD pass.
227 // SAFETY: dispatch function pointer matches detected CPU features.
228 let mut masks: AllMasks = unsafe { compute(chunk) };
229
230 // Step 2: Apply quote-aware string masking (inline, carry-based).
231 let (in_string, new_carry_dq, new_carry_sq) =
232 compute_string_mask_inline(&mut masks, carry_dq, carry_sq);
233 carry_dq = new_carry_dq;
234 carry_sq = new_carry_sq;
235
236 // Step 3: Only iterate `<` and `>` — the only delimiters that
237 // trigger state transitions. Quote masking already ran above to
238 // filter out `<`/`>` inside attribute strings.
239 let _ = in_string; // masking already applied to individual masks
240 let combined = masks.lt | masks.gt;
241
242 // Pre-mask: clear bits beyond the valid range for this chunk,
243 // eliminating the per-iteration bounds check inside the loop.
244 let valid_bits = block_end - block_start;
245 let mut bits = if valid_bits < 64 {
246 combined & ((1u64 << valid_bits) - 1)
247 } else {
248 combined
249 };
250 while bits != 0 {
251 let bit_pos = bits.trailing_zeros() as usize;
252 bits &= bits - 1; // clear lowest set bit
253
254 let abs_pos = block_start + bit_pos;
255 let byte = bytes[abs_pos];
256 parser.on_delimiter_sink(abs_pos, byte, sink);
257 }
258
259 block_start = block_end;
260 }
261
262 // Flush trailing text.
263 parser.flush_trailing_sink(sink);
264}
265
/// Compute the prefix XOR of all bits in a `u64`.
///
/// After the transform, bit `i` of the result is the parity (XOR) of input
/// bits `0..=i` — turning quote *positions* into inside-quote *spans*.
#[inline]
fn prefix_xor(mut x: u64) -> u64 {
    // Doubling shifts (1, 2, 4, ..., 32) propagate the running XOR across
    // all 64 bit positions in six steps.
    for shift in [1u32, 2, 4, 8, 16, 32] {
        x ^= x << shift;
    }
    x
}
277
/// Apply quote-aware string masking to a block's masks, returning the
/// combined in-string mask and updated carry bits.
///
/// Clears `<`/`>` bits that fall inside single- or double-quoted attribute
/// values so they are not mistaken for structural delimiters. `carry_dq` /
/// `carry_sq` record whether a double-/single-quoted string was still open
/// at the end of the previous 64-byte block, letting quoted regions span
/// block boundaries.
///
/// Returns `(in_string, new_carry_dq, new_carry_sq)`, where `in_string` has
/// a bit set for every byte position inside a quoted region of this block.
#[inline]
fn compute_string_mask_inline(
    masks: &mut AllMasks,
    carry_dq: bool,
    carry_sq: bool,
) -> (u64, bool, bool) {
    // Fast path: no quotes in this block and no carry — skip prefix_xor entirely.
    if masks.quot == 0 && masks.apos == 0 && !carry_dq && !carry_sq {
        return (0, false, false);
    }

    // Fast path: carry active but no quotes in block — entire block is inside string.
    // (Only reachable with at least one carry set; the branch above handled
    // the no-carry case.)
    if masks.quot == 0 && masks.apos == 0 {
        let in_string = !0u64;
        masks.lt = 0;
        masks.gt = 0;
        return (in_string, carry_dq, carry_sq);
    }

    // Pass 1: double-quote regions. prefix_xor turns quote positions into
    // inside-quote spans; an open quote carried from the previous block
    // inverts the whole span mask.
    let flipped_dq = prefix_xor(masks.quot);
    let dq_in = if carry_dq { !flipped_dq } else { flipped_dq };
    // New carry = old carry XOR (this block contains an odd number of `"`).
    let new_carry_dq = carry_dq ^ ((masks.quot.count_ones() % 2) == 1);

    // Pass 2: single-quote regions (excluding positions inside dquot).
    // Apostrophes inside a double-quoted value (e.g. title="it's") must not
    // toggle single-quote state, so they are filtered out first.
    let effective_apos = masks.apos & !dq_in;
    let flipped_sq = prefix_xor(effective_apos);
    let sq_in = if carry_sq { !flipped_sq } else { flipped_sq };
    let new_carry_sq = carry_sq ^ ((effective_apos.count_ones() % 2) == 1);

    // Combine both quote regions.
    let in_string = dq_in | sq_in;

    // Mask `<` and `>` inside strings — these are the only structural delimiters.
    masks.lt &= !in_string;
    masks.gt &= !in_string;

    // Mask non-boundary quotes (still needed for carry propagation).
    masks.apos &= !dq_in;
    masks.quot &= !sq_in;

    (in_string, new_carry_dq, new_carry_sq)
}
323
#[cfg(test)]
mod tokenize_with_tests {
    use super::*;
    use token::Token;

    /// Collect tokens from `tokenize_with` into a Vec for comparison.
    fn collect_with(input: &str) -> Vec<Token<'_>> {
        let mut tokens = Vec::new();
        tokenize_with(input, |t| tokens.push(t));
        tokens
    }

    /// Compare `tokenize` and `tokenize_with` for equivalence.
    ///
    /// Checks token count and per-position variant (discriminant) equality —
    /// the two code paths must emit the same token sequence.
    fn assert_equivalent(html: &str) {
        let vec_tokens = tokenize(html);
        let cb_tokens = collect_with(html);
        assert_eq!(
            vec_tokens.len(),
            cb_tokens.len(),
            "token count mismatch for {:?}",
            html
        );
        for (i, (a, b)) in vec_tokens.iter().zip(cb_tokens.iter()).enumerate() {
            assert_eq!(
                std::mem::discriminant(a),
                std::mem::discriminant(b),
                "token type mismatch at index {i} for {:?}: {a:?} vs {b:?}",
                html
            );
        }
    }

    #[test]
    fn equiv_simple_div() {
        assert_equivalent("<div>hello</div>");
    }

    #[test]
    fn equiv_self_closing() {
        assert_equivalent("<br/>");
    }

    #[test]
    fn equiv_attributes() {
        assert_equivalent("<a href=\"url\" class=\"link\">text</a>");
    }

    #[test]
    fn equiv_comment() {
        assert_equivalent("<!-- hello -->");
    }

    #[test]
    fn equiv_doctype() {
        assert_equivalent("<!DOCTYPE html>");
    }

    #[test]
    fn equiv_script() {
        assert_equivalent("<script>var x = 1 < 2;</script>");
    }

    #[test]
    fn equiv_entities() {
        assert_equivalent("a & b");
    }

    #[test]
    fn equiv_entity_in_attr() {
        assert_equivalent("<div title=\"a & b\">x</div>");
    }

    #[test]
    fn equiv_empty() {
        assert_equivalent("");
    }

    #[test]
    fn equiv_text_only() {
        assert_equivalent("just plain text");
    }

    #[test]
    fn equiv_nested() {
        assert_equivalent("<div><span>text</span></div>");
    }

    // Exercises the carry bits: a quoted attribute value that straddles the
    // 64-byte SIMD block boundary, with a `<` delimiter inside the quotes.
    #[test]
    fn equiv_quotes_spanning_blocks() {
        // Build input with attribute value spanning a 64-byte block boundary.
        let mut input = String::from("<div data=\"");
        while input.len() < 60 {
            input.push('a');
        }
        input.push('<'); // delimiter inside quotes
        while input.len() < 80 {
            input.push('b');
        }
        input.push_str("\">end</div>");
        assert_equivalent(&input);
    }

    // Many blocks' worth of input, forcing repeated carry propagation.
    #[test]
    fn equiv_long_input() {
        let mut input = String::new();
        for i in 0..100 {
            input.push_str(&format!("<p class=\"c{i}\">text {i}</p>"));
        }
        assert_equivalent(&input);
    }

    // Doctype, comment, double- and single-quoted attributes, bare attribute,
    // entity text, self-closing tag, and `<` inside a script body — all at once.
    #[test]
    fn equiv_mixed_content() {
        assert_equivalent(
            "<!DOCTYPE html><!-- comment --><div id=\"main\" class='header' disabled>\
             <p>Hello & world</p><br/><script>if(a<b){}</script></div>",
        );
    }
}