Skip to main content

fhp_tokenizer/
structural.rs

1//! Structural character indexer for HTML input.
2//!
3//! Scans input in 64-byte blocks using SIMD dispatch, producing per-delimiter
4//! u64 bitmasks. Quote-aware masking (prefix XOR) ensures that delimiters
5//! inside quoted attribute values are not treated as structural.
6//!
7//! This is stage 1 of the two-stage tokenizer pipeline (see crate docs).
8
9use fhp_simd::dispatch::{SimdOps, ops};
10
11/// Size of each processing block in bytes.
12const BLOCK_SIZE: usize = 64;
13
14/// Per-block bitmasks for each HTML delimiter type.
15///
16/// Bit `i` of a mask is set if the byte at position `i` within the block
17/// matches that delimiter. After quote-aware masking, non-structural
18/// positions (inside quoted strings) are cleared from the relevant masks.
19#[derive(Clone, Debug, Default)]
20pub struct BlockBitmaps {
21    /// `<` positions.
22    pub lt: u64,
23    /// `>` positions.
24    pub gt: u64,
25    /// `&` positions.
26    pub amp: u64,
27    /// `"` positions.
28    pub quot: u64,
29    /// `'` positions.
30    pub apos: u64,
31    /// `=` positions.
32    pub eq: u64,
33    /// `/` positions.
34    pub slash: u64,
35    /// Positions inside quoted strings (both `"` and `'`).
36    /// Set bits indicate bytes that are *not* structural.
37    pub in_string: u64,
38}
39
40/// Result of structural indexing: a sequence of [`BlockBitmaps`] covering
41/// the entire input.
42pub struct StructuralIndex {
43    bitmaps: Vec<BlockBitmaps>,
44    len: usize,
45}
46
47impl StructuralIndex {
48    /// Iterate over all structural delimiter positions in input order.
49    pub fn iter_delimiters(&self) -> DelimiterIter<'_> {
50        let mut iter = DelimiterIter {
51            bitmaps: &self.bitmaps,
52            block_idx: 0,
53            combined: 0,
54            len: self.len,
55        };
56        // Load the first block's combined mask.
57        if !self.bitmaps.is_empty() {
58            iter.combined = Self::combined_mask(&self.bitmaps[0]);
59        }
60        iter
61    }
62
63    /// Estimated number of tokens — used for `Vec` pre-allocation.
64    ///
65    /// Heuristic: roughly one token per pair of `<`/`>` delimiters,
66    /// plus text segments between them.
67    pub fn estimated_token_count(&self) -> usize {
68        let total_lt: u32 = self.bitmaps.iter().map(|b| b.lt.count_ones()).sum();
69        // Each `<` roughly corresponds to one tag token. Add 1 for trailing text.
70        total_lt as usize + 1
71    }
72
73    /// Total input length in bytes.
74    pub fn input_len(&self) -> usize {
75        self.len
76    }
77
78    /// Number of 64-byte blocks.
79    pub fn block_count(&self) -> usize {
80        self.bitmaps.len()
81    }
82
83    /// Access the bitmaps for a specific block.
84    pub fn block(&self, index: usize) -> &BlockBitmaps {
85        &self.bitmaps[index]
86    }
87
88    /// OR of all delimiter masks for a block (excluding `in_string`).
89    fn combined_mask(block: &BlockBitmaps) -> u64 {
90        block.lt | block.gt | block.amp | block.quot | block.apos | block.eq | block.slash
91    }
92}
93
94/// A structural delimiter found by the indexer.
95#[derive(Clone, Copy, Debug, PartialEq, Eq)]
96pub struct DelimiterEntry {
97    /// Absolute byte offset in the input.
98    pub pos: usize,
99    /// The delimiter byte at this position.
100    pub byte: u8,
101}
102
103/// Iterator over structural delimiter positions, yielded in ascending order.
104pub struct DelimiterIter<'a> {
105    bitmaps: &'a [BlockBitmaps],
106    block_idx: usize,
107    combined: u64,
108    len: usize,
109}
110
111impl<'a> Iterator for DelimiterIter<'a> {
112    type Item = DelimiterEntry;
113
114    #[inline]
115    fn next(&mut self) -> Option<Self::Item> {
116        loop {
117            if self.combined != 0 {
118                let bit = self.combined.trailing_zeros() as usize;
119                // Clear lowest set bit.
120                self.combined &= self.combined - 1;
121
122                let pos = self.block_idx * BLOCK_SIZE + bit;
123                if pos >= self.len {
124                    return None;
125                }
126
127                let block = &self.bitmaps[self.block_idx];
128                let byte = determine_byte(block, bit);
129                return Some(DelimiterEntry { pos, byte });
130            }
131
132            // Advance to next non-empty block.
133            self.block_idx += 1;
134            if self.block_idx >= self.bitmaps.len() {
135                return None;
136            }
137            self.combined = StructuralIndex::combined_mask(&self.bitmaps[self.block_idx]);
138        }
139    }
140}
141
142/// Determine which delimiter byte is at bit position `bit` within `block`.
143#[inline]
144fn determine_byte(block: &BlockBitmaps, bit: usize) -> u8 {
145    if (block.lt >> bit) & 1 == 1 {
146        b'<'
147    } else if (block.gt >> bit) & 1 == 1 {
148        b'>'
149    } else if (block.amp >> bit) & 1 == 1 {
150        b'&'
151    } else if (block.quot >> bit) & 1 == 1 {
152        b'"'
153    } else if (block.apos >> bit) & 1 == 1 {
154        b'\''
155    } else if (block.eq >> bit) & 1 == 1 {
156        b'='
157    } else {
158        b'/'
159    }
160}
161
162/// SIMD-powered structural character indexer.
163///
164/// Scans input in 64-byte blocks, producing bitmasks for each delimiter
165/// type. Quote-aware masking ensures delimiters inside `"..."` or `'...'`
166/// are not flagged as structural.
167///
168/// # Example
169///
170/// ```
171/// use fhp_tokenizer::structural::StructuralIndexer;
172///
173/// let indexer = StructuralIndexer::new();
174/// let index = indexer.index(b"<div class=\"foo\">bar</div>");
175///
176/// let delimiters: Vec<_> = index.iter_delimiters().collect();
177/// assert!(delimiters.len() > 0);
178/// ```
179pub struct StructuralIndexer {
180    dispatch: &'static SimdOps,
181}
182
183impl StructuralIndexer {
184    /// Create a new indexer using the auto-detected SIMD backend.
185    pub fn new() -> Self {
186        Self { dispatch: ops() }
187    }
188
189    /// Scan `input` and produce a [`StructuralIndex`].
190    ///
191    /// The input is processed in 64-byte blocks. For each block, a
192    /// [`BlockBitmaps`] is produced with bitmasks for every HTML delimiter.
193    /// Quote-aware masking is then applied to clear non-structural positions.
194    pub fn index(&self, input: &[u8]) -> StructuralIndex {
195        let block_count = input.len().div_ceil(BLOCK_SIZE);
196        let mut bitmaps = Vec::with_capacity(block_count);
197
198        for chunk in input.chunks(BLOCK_SIZE) {
199            // SAFETY: dispatch function pointers are initialized from backends
200            // that match the detected CPU features.
201            let compute = self.dispatch.compute_byte_mask;
202            let lt = unsafe { compute(chunk, b'<') };
203            let gt = unsafe { compute(chunk, b'>') };
204            let amp = unsafe { compute(chunk, b'&') };
205            let quot = unsafe { compute(chunk, b'"') };
206            let apos = unsafe { compute(chunk, b'\'') };
207            let eq = unsafe { compute(chunk, b'=') };
208            let slash = unsafe { compute(chunk, b'/') };
209
210            bitmaps.push(BlockBitmaps {
211                lt,
212                gt,
213                amp,
214                quot,
215                apos,
216                eq,
217                slash,
218                in_string: 0,
219            });
220        }
221
222        Self::compute_string_masks(&mut bitmaps);
223
224        StructuralIndex {
225            bitmaps,
226            len: input.len(),
227        }
228    }
229
230    /// Compute quote-aware string masks using prefix XOR.
231    ///
232    /// Two passes:
233    /// 1. Double-quote (`"`) regions via prefix XOR + carry.
234    /// 2. Single-quote (`'`) regions, ignoring positions already inside
235    ///    double-quoted strings.
236    ///
237    /// After both passes, non-structural delimiters inside strings are
238    /// cleared from `lt`, `gt`, `amp`, `eq`, `slash`, and the non-boundary
239    /// quote masks.
240    fn compute_string_masks(bitmaps: &mut [BlockBitmaps]) {
241        // Pass 1: double-quote regions.
242        let mut carry_dq = false;
243        for block in bitmaps.iter_mut() {
244            let flipped = prefix_xor(block.quot);
245            let dq_in = if carry_dq { !flipped } else { flipped };
246            carry_dq ^= (block.quot.count_ones() % 2) == 1;
247            // Store dq_in temporarily in `in_string`.
248            block.in_string = dq_in;
249        }
250
251        // Pass 2: single-quote regions (excluding positions inside dquot).
252        let mut carry_sq = false;
253        for block in bitmaps.iter_mut() {
254            let dq_in = block.in_string;
255            // Ignore single quotes that fall inside double-quoted strings.
256            let effective_apos = block.apos & !dq_in;
257            let flipped = prefix_xor(effective_apos);
258            let sq_in = if carry_sq { !flipped } else { flipped };
259            carry_sq ^= (effective_apos.count_ones() % 2) == 1;
260
261            // Combine both quote regions.
262            block.in_string = dq_in | sq_in;
263
264            // Mask non-structural delimiters inside strings.
265            block.lt &= !block.in_string;
266            block.gt &= !block.in_string;
267            block.amp &= !block.in_string;
268            block.eq &= !block.in_string;
269            block.slash &= !block.in_string;
270
271            // Mask non-boundary quotes (apos inside dquot, quot inside squot).
272            block.apos &= !dq_in;
273            block.quot &= !sq_in;
274        }
275    }
276}
277
278impl Default for StructuralIndexer {
279    fn default() -> Self {
280        Self::new()
281    }
282}
283
284/// Compute prefix XOR of all bits in a `u64`.
285///
286/// After this operation, bit `i` equals the XOR of the original bits
287/// `0..=i`. This effectively creates a toggle mask: each set bit in the
288/// input flips all subsequent bits, turning quote positions into
289/// inside/outside-string regions.
290///
291/// On x86_64 with PCLMULQDQ this could be a single `clmul` instruction;
292/// here we use a portable cascade of shifts.
293#[inline]
294fn prefix_xor(mut x: u64) -> u64 {
295    x ^= x << 1;
296    x ^= x << 2;
297    x ^= x << 4;
298    x ^= x << 8;
299    x ^= x << 16;
300    x ^= x << 32;
301    x
302}
303
304#[cfg(test)]
305mod tests {
306    use super::*;
307
308    // ---------------------------------------------------------------
309    // prefix_xor unit tests
310    // ---------------------------------------------------------------
311
312    #[test]
313    fn prefix_xor_single_bit() {
314        // Bit 2 set → bits 2..63 should be set after prefix XOR.
315        let result = prefix_xor(1 << 2);
316        for i in 0..64 {
317            let bit = (result >> i) & 1;
318            if i < 2 {
319                assert_eq!(bit, 0, "bit {i} should be 0");
320            } else {
321                assert_eq!(bit, 1, "bit {i} should be 1");
322            }
323        }
324    }
325
326    #[test]
327    fn prefix_xor_two_bits() {
328        // Bits 2 and 5 → bits 2..4 should be 1, bits 5..63 should be 0.
329        let result = prefix_xor((1 << 2) | (1 << 5));
330        for i in 0..64 {
331            let bit = (result >> i) & 1;
332            if (2..5).contains(&i) {
333                assert_eq!(bit, 1, "bit {i} should be 1");
334            } else {
335                assert_eq!(bit, 0, "bit {i} should be 0");
336            }
337        }
338    }
339
340    #[test]
341    fn prefix_xor_zero() {
342        assert_eq!(prefix_xor(0), 0);
343    }
344
345    // ---------------------------------------------------------------
346    // StructuralIndexer — basic HTML
347    // ---------------------------------------------------------------
348
349    #[test]
350    fn basic_html_tag() {
351        let input = b"<div>hello</div>";
352        let indexer = StructuralIndexer::new();
353        let index = indexer.index(input);
354
355        let delims: Vec<_> = index.iter_delimiters().collect();
356
357        // Expected structural delimiters:
358        // 0: '<', 4: '>', 10: '<', 11: '/', 15: '>'
359        assert_eq!(
360            delims,
361            vec![
362                DelimiterEntry { pos: 0, byte: b'<' },
363                DelimiterEntry { pos: 4, byte: b'>' },
364                DelimiterEntry {
365                    pos: 10,
366                    byte: b'<'
367                },
368                DelimiterEntry {
369                    pos: 11,
370                    byte: b'/'
371                },
372                DelimiterEntry {
373                    pos: 15,
374                    byte: b'>'
375                },
376            ]
377        );
378    }
379
380    #[test]
381    fn html_with_attributes() {
382        let input = b"<div class=\"foo\">bar</div>";
383        let indexer = StructuralIndexer::new();
384        let index = indexer.index(input);
385
386        let delims: Vec<_> = index.iter_delimiters().collect();
387
388        // '<' at 0, '=' at 10, '"' at 11, '"' at 15, '>' at 16,
389        // '<' at 20, '/' at 21, '>' at 25
390        // Note: "foo" is inside quotes, but the content "foo" has no delimiters.
391        let bytes: Vec<u8> = delims.iter().map(|d| d.byte).collect();
392        assert!(bytes.contains(&b'<'));
393        assert!(bytes.contains(&b'>'));
394        assert!(bytes.contains(&b'='));
395        assert!(bytes.contains(&b'"'));
396    }
397
398    // ---------------------------------------------------------------
399    // Quote-aware masking
400    // ---------------------------------------------------------------
401
402    #[test]
403    fn delimiters_inside_double_quotes_masked() {
404        // The < and > inside the attribute value should NOT appear.
405        let input = b"<a title=\"x > y < z\">link</a>";
406        let indexer = StructuralIndexer::new();
407        let index = indexer.index(input);
408
409        let delims: Vec<_> = index.iter_delimiters().collect();
410        let positions: Vec<usize> = delims.iter().map(|d| d.pos).collect();
411
412        // '<' at 0 (opening tag)
413        assert!(positions.contains(&0));
414        // '>' at 20 (closing >)
415        assert!(positions.contains(&20));
416        // '<' at 25 (closing tag </a>)
417        assert!(positions.contains(&25));
418
419        // The '>' at 12 and '<' at 16 should NOT be present (inside quotes).
420        assert!(!positions.contains(&12), "> inside quotes should be masked");
421        assert!(!positions.contains(&16), "< inside quotes should be masked");
422    }
423
424    #[test]
425    fn delimiters_inside_single_quotes_masked() {
426        let input = b"<a title='x > y'>link</a>";
427        let indexer = StructuralIndexer::new();
428        let index = indexer.index(input);
429
430        let delims: Vec<_> = index.iter_delimiters().collect();
431        let positions: Vec<usize> = delims.iter().map(|d| d.pos).collect();
432
433        // '>' at 12 inside single quotes should NOT be present.
434        assert!(
435            !positions.contains(&12),
436            "> inside single quotes should be masked"
437        );
438    }
439
440    #[test]
441    fn single_quote_inside_double_quotes_ignored() {
442        // The ' inside "it's" should not affect masking.
443        let input = b"<div title=\"it's\">text</div>";
444        let indexer = StructuralIndexer::new();
445        let index = indexer.index(input);
446
447        let delims: Vec<_> = index.iter_delimiters().collect();
448        let bytes: Vec<u8> = delims.iter().map(|d| d.byte).collect();
449
450        // Should have structural delimiters for the tag structure.
451        assert!(bytes.contains(&b'<'));
452        assert!(bytes.contains(&b'>'));
453        // The tag should close properly — '>' after the closing quote.
454        let gt_positions: Vec<usize> = delims
455            .iter()
456            .filter(|d| d.byte == b'>')
457            .map(|d| d.pos)
458            .collect();
459        assert!(
460            gt_positions.contains(&17),
461            "closing > at 17 should be structural"
462        );
463    }
464
465    // ---------------------------------------------------------------
466    // Edge cases
467    // ---------------------------------------------------------------
468
469    #[test]
470    fn empty_input() {
471        let indexer = StructuralIndexer::new();
472        let index = indexer.index(b"");
473
474        assert_eq!(index.input_len(), 0);
475        assert_eq!(index.block_count(), 0);
476        assert_eq!(index.iter_delimiters().count(), 0);
477        assert_eq!(index.estimated_token_count(), 1);
478    }
479
480    #[test]
481    fn text_only_no_delimiters() {
482        let input = b"hello world this is plain text without any tags";
483        let indexer = StructuralIndexer::new();
484        let index = indexer.index(input);
485
486        assert_eq!(index.iter_delimiters().count(), 0);
487        assert_eq!(index.estimated_token_count(), 1);
488    }
489
490    #[test]
491    fn tag_only() {
492        let input = b"<br/>";
493        let indexer = StructuralIndexer::new();
494        let index = indexer.index(input);
495
496        let delims: Vec<_> = index.iter_delimiters().collect();
497        assert_eq!(
498            delims,
499            vec![
500                DelimiterEntry { pos: 0, byte: b'<' },
501                DelimiterEntry { pos: 3, byte: b'/' },
502                DelimiterEntry { pos: 4, byte: b'>' },
503            ]
504        );
505    }
506
507    #[test]
508    fn entity_reference() {
509        let input = b"a &amp; b";
510        let indexer = StructuralIndexer::new();
511        let index = indexer.index(input);
512
513        let amp_entries: Vec<_> = index.iter_delimiters().filter(|d| d.byte == b'&').collect();
514        assert_eq!(amp_entries.len(), 1);
515        assert_eq!(amp_entries[0].pos, 2);
516    }
517
518    // ---------------------------------------------------------------
519    // Long input (crosses 64-byte block boundaries)
520    // ---------------------------------------------------------------
521
522    #[test]
523    fn long_input_multiple_blocks() {
524        // Build a 1000+ byte input with tags at various positions.
525        let mut input = Vec::with_capacity(1200);
526        // Fill with text.
527        input.extend_from_slice(&[b'x'; 100]);
528        // Tag at offset 100.
529        input.extend_from_slice(b"<div>");
530        // More text.
531        input.extend_from_slice(&[b'y'; 200]);
532        // Tag at offset 305.
533        input.extend_from_slice(b"<span class=\"test\">");
534        // More text.
535        input.extend_from_slice(&[b'z'; 700]);
536        // Closing tags.
537        input.extend_from_slice(b"</span></div>");
538
539        let indexer = StructuralIndexer::new();
540        let index = indexer.index(&input);
541
542        assert!(input.len() > 1000);
543        assert!(index.block_count() > 15);
544
545        let delims: Vec<_> = index.iter_delimiters().collect();
546
547        // Verify '<' at offset 100.
548        assert!(
549            delims.iter().any(|d| d.pos == 100 && d.byte == b'<'),
550            "should find < at offset 100"
551        );
552
553        // Verify delimiters inside "test" are masked.
554        // The attribute value "test" contains no delimiters, so nothing to mask.
555        // But the = and " around it should be structural.
556        assert!(
557            delims.iter().any(|d| d.byte == b'='),
558            "should find = in attribute"
559        );
560
561        // Verify there are closing tags at the end.
562        let last_lt = delims.iter().rev().find(|d| d.byte == b'<');
563        assert!(last_lt.is_some());
564    }
565
566    #[test]
567    fn long_input_with_quotes_spanning_blocks() {
568        // Attribute value that spans a 64-byte block boundary.
569        let mut input = Vec::new();
570        input.extend_from_slice(b"<div data=\"");
571        // Pad to 60 bytes total (attribute value starts at offset 11).
572        while input.len() < 60 {
573            input.push(b'a');
574        }
575        // Add a '<' inside the string at offset 60 (inside a block boundary region).
576        input.push(b'<');
577        // Continue the string value past the 64-byte boundary.
578        while input.len() < 80 {
579            input.push(b'b');
580        }
581        // Close the attribute and tag.
582        input.extend_from_slice(b"\">end</div>");
583
584        let indexer = StructuralIndexer::new();
585        let index = indexer.index(&input);
586
587        let delims: Vec<_> = index.iter_delimiters().collect();
588        let lt_positions: Vec<usize> = delims
589            .iter()
590            .filter(|d| d.byte == b'<')
591            .map(|d| d.pos)
592            .collect();
593
594        // The '<' at offset 60 should be masked (inside double-quoted string).
595        assert!(
596            !lt_positions.contains(&60),
597            "< at offset 60 should be masked (inside quoted string)"
598        );
599
600        // The opening '<' at 0 and closing '<' should be present.
601        assert!(lt_positions.contains(&0), "opening < should be structural");
602    }
603
604    // ---------------------------------------------------------------
605    // estimated_token_count
606    // ---------------------------------------------------------------
607
608    #[test]
609    fn estimated_token_count_basic() {
610        let input = b"<div>hello</div><p>world</p>";
611        let indexer = StructuralIndexer::new();
612        let index = indexer.index(input);
613
614        // 4 '<' characters → estimate = 5.
615        assert_eq!(index.estimated_token_count(), 5);
616    }
617
618    // ---------------------------------------------------------------
619    // StructuralIndex API
620    // ---------------------------------------------------------------
621
622    #[test]
623    fn block_api() {
624        let input = b"<div>text</div>";
625        let indexer = StructuralIndexer::new();
626        let index = indexer.index(input);
627
628        assert_eq!(index.input_len(), 15);
629        assert_eq!(index.block_count(), 1);
630
631        let block = index.block(0);
632        assert_ne!(block.lt, 0, "should have < bits set");
633        assert_ne!(block.gt, 0, "should have > bits set");
634    }
635}