1use fhp_simd::dispatch::{SimdOps, ops};
10
11const BLOCK_SIZE: usize = 64;
13
14#[derive(Clone, Debug, Default)]
20pub struct BlockBitmaps {
21 pub lt: u64,
23 pub gt: u64,
25 pub amp: u64,
27 pub quot: u64,
29 pub apos: u64,
31 pub eq: u64,
33 pub slash: u64,
35 pub in_string: u64,
38}
39
40pub struct StructuralIndex {
43 bitmaps: Vec<BlockBitmaps>,
44 len: usize,
45}
46
47impl StructuralIndex {
48 pub fn iter_delimiters(&self) -> DelimiterIter<'_> {
50 let mut iter = DelimiterIter {
51 bitmaps: &self.bitmaps,
52 block_idx: 0,
53 combined: 0,
54 len: self.len,
55 };
56 if !self.bitmaps.is_empty() {
58 iter.combined = Self::combined_mask(&self.bitmaps[0]);
59 }
60 iter
61 }
62
63 pub fn estimated_token_count(&self) -> usize {
68 let total_lt: u32 = self.bitmaps.iter().map(|b| b.lt.count_ones()).sum();
69 total_lt as usize + 1
71 }
72
73 pub fn input_len(&self) -> usize {
75 self.len
76 }
77
78 pub fn block_count(&self) -> usize {
80 self.bitmaps.len()
81 }
82
83 pub fn block(&self, index: usize) -> &BlockBitmaps {
85 &self.bitmaps[index]
86 }
87
88 fn combined_mask(block: &BlockBitmaps) -> u64 {
90 block.lt | block.gt | block.amp | block.quot | block.apos | block.eq | block.slash
91 }
92}
93
94#[derive(Clone, Copy, Debug, PartialEq, Eq)]
96pub struct DelimiterEntry {
97 pub pos: usize,
99 pub byte: u8,
101}
102
103pub struct DelimiterIter<'a> {
105 bitmaps: &'a [BlockBitmaps],
106 block_idx: usize,
107 combined: u64,
108 len: usize,
109}
110
111impl<'a> Iterator for DelimiterIter<'a> {
112 type Item = DelimiterEntry;
113
114 #[inline]
115 fn next(&mut self) -> Option<Self::Item> {
116 loop {
117 if self.combined != 0 {
118 let bit = self.combined.trailing_zeros() as usize;
119 self.combined &= self.combined - 1;
121
122 let pos = self.block_idx * BLOCK_SIZE + bit;
123 if pos >= self.len {
124 return None;
125 }
126
127 let block = &self.bitmaps[self.block_idx];
128 let byte = determine_byte(block, bit);
129 return Some(DelimiterEntry { pos, byte });
130 }
131
132 self.block_idx += 1;
134 if self.block_idx >= self.bitmaps.len() {
135 return None;
136 }
137 self.combined = StructuralIndex::combined_mask(&self.bitmaps[self.block_idx]);
138 }
139 }
140}
141
142#[inline]
144fn determine_byte(block: &BlockBitmaps, bit: usize) -> u8 {
145 if (block.lt >> bit) & 1 == 1 {
146 b'<'
147 } else if (block.gt >> bit) & 1 == 1 {
148 b'>'
149 } else if (block.amp >> bit) & 1 == 1 {
150 b'&'
151 } else if (block.quot >> bit) & 1 == 1 {
152 b'"'
153 } else if (block.apos >> bit) & 1 == 1 {
154 b'\''
155 } else if (block.eq >> bit) & 1 == 1 {
156 b'='
157 } else {
158 b'/'
159 }
160}
161
162pub struct StructuralIndexer {
180 dispatch: &'static SimdOps,
181}
182
183impl StructuralIndexer {
184 pub fn new() -> Self {
186 Self { dispatch: ops() }
187 }
188
189 pub fn index(&self, input: &[u8]) -> StructuralIndex {
195 let block_count = input.len().div_ceil(BLOCK_SIZE);
196 let mut bitmaps = Vec::with_capacity(block_count);
197
198 for chunk in input.chunks(BLOCK_SIZE) {
199 let compute = self.dispatch.compute_byte_mask;
202 let lt = unsafe { compute(chunk, b'<') };
203 let gt = unsafe { compute(chunk, b'>') };
204 let amp = unsafe { compute(chunk, b'&') };
205 let quot = unsafe { compute(chunk, b'"') };
206 let apos = unsafe { compute(chunk, b'\'') };
207 let eq = unsafe { compute(chunk, b'=') };
208 let slash = unsafe { compute(chunk, b'/') };
209
210 bitmaps.push(BlockBitmaps {
211 lt,
212 gt,
213 amp,
214 quot,
215 apos,
216 eq,
217 slash,
218 in_string: 0,
219 });
220 }
221
222 Self::compute_string_masks(&mut bitmaps);
223
224 StructuralIndex {
225 bitmaps,
226 len: input.len(),
227 }
228 }
229
230 fn compute_string_masks(bitmaps: &mut [BlockBitmaps]) {
241 let mut carry_dq = false;
243 for block in bitmaps.iter_mut() {
244 let flipped = prefix_xor(block.quot);
245 let dq_in = if carry_dq { !flipped } else { flipped };
246 carry_dq ^= (block.quot.count_ones() % 2) == 1;
247 block.in_string = dq_in;
249 }
250
251 let mut carry_sq = false;
253 for block in bitmaps.iter_mut() {
254 let dq_in = block.in_string;
255 let effective_apos = block.apos & !dq_in;
257 let flipped = prefix_xor(effective_apos);
258 let sq_in = if carry_sq { !flipped } else { flipped };
259 carry_sq ^= (effective_apos.count_ones() % 2) == 1;
260
261 block.in_string = dq_in | sq_in;
263
264 block.lt &= !block.in_string;
266 block.gt &= !block.in_string;
267 block.amp &= !block.in_string;
268 block.eq &= !block.in_string;
269 block.slash &= !block.in_string;
270
271 block.apos &= !dq_in;
273 block.quot &= !sq_in;
274 }
275 }
276}
277
278impl Default for StructuralIndexer {
279 fn default() -> Self {
280 Self::new()
281 }
282}
283
284#[inline]
294fn prefix_xor(mut x: u64) -> u64 {
295 x ^= x << 1;
296 x ^= x << 2;
297 x ^= x << 4;
298 x ^= x << 8;
299 x ^= x << 16;
300 x ^= x << 32;
301 x
302}
303
304#[cfg(test)]
305mod tests {
306 use super::*;
307
308 #[test]
313 fn prefix_xor_single_bit() {
314 let result = prefix_xor(1 << 2);
316 for i in 0..64 {
317 let bit = (result >> i) & 1;
318 if i < 2 {
319 assert_eq!(bit, 0, "bit {i} should be 0");
320 } else {
321 assert_eq!(bit, 1, "bit {i} should be 1");
322 }
323 }
324 }
325
326 #[test]
327 fn prefix_xor_two_bits() {
328 let result = prefix_xor((1 << 2) | (1 << 5));
330 for i in 0..64 {
331 let bit = (result >> i) & 1;
332 if (2..5).contains(&i) {
333 assert_eq!(bit, 1, "bit {i} should be 1");
334 } else {
335 assert_eq!(bit, 0, "bit {i} should be 0");
336 }
337 }
338 }
339
340 #[test]
341 fn prefix_xor_zero() {
342 assert_eq!(prefix_xor(0), 0);
343 }
344
345 #[test]
350 fn basic_html_tag() {
351 let input = b"<div>hello</div>";
352 let indexer = StructuralIndexer::new();
353 let index = indexer.index(input);
354
355 let delims: Vec<_> = index.iter_delimiters().collect();
356
357 assert_eq!(
360 delims,
361 vec![
362 DelimiterEntry { pos: 0, byte: b'<' },
363 DelimiterEntry { pos: 4, byte: b'>' },
364 DelimiterEntry {
365 pos: 10,
366 byte: b'<'
367 },
368 DelimiterEntry {
369 pos: 11,
370 byte: b'/'
371 },
372 DelimiterEntry {
373 pos: 15,
374 byte: b'>'
375 },
376 ]
377 );
378 }
379
380 #[test]
381 fn html_with_attributes() {
382 let input = b"<div class=\"foo\">bar</div>";
383 let indexer = StructuralIndexer::new();
384 let index = indexer.index(input);
385
386 let delims: Vec<_> = index.iter_delimiters().collect();
387
388 let bytes: Vec<u8> = delims.iter().map(|d| d.byte).collect();
392 assert!(bytes.contains(&b'<'));
393 assert!(bytes.contains(&b'>'));
394 assert!(bytes.contains(&b'='));
395 assert!(bytes.contains(&b'"'));
396 }
397
398 #[test]
403 fn delimiters_inside_double_quotes_masked() {
404 let input = b"<a title=\"x > y < z\">link</a>";
406 let indexer = StructuralIndexer::new();
407 let index = indexer.index(input);
408
409 let delims: Vec<_> = index.iter_delimiters().collect();
410 let positions: Vec<usize> = delims.iter().map(|d| d.pos).collect();
411
412 assert!(positions.contains(&0));
414 assert!(positions.contains(&20));
416 assert!(positions.contains(&25));
418
419 assert!(!positions.contains(&12), "> inside quotes should be masked");
421 assert!(!positions.contains(&16), "< inside quotes should be masked");
422 }
423
424 #[test]
425 fn delimiters_inside_single_quotes_masked() {
426 let input = b"<a title='x > y'>link</a>";
427 let indexer = StructuralIndexer::new();
428 let index = indexer.index(input);
429
430 let delims: Vec<_> = index.iter_delimiters().collect();
431 let positions: Vec<usize> = delims.iter().map(|d| d.pos).collect();
432
433 assert!(
435 !positions.contains(&12),
436 "> inside single quotes should be masked"
437 );
438 }
439
440 #[test]
441 fn single_quote_inside_double_quotes_ignored() {
442 let input = b"<div title=\"it's\">text</div>";
444 let indexer = StructuralIndexer::new();
445 let index = indexer.index(input);
446
447 let delims: Vec<_> = index.iter_delimiters().collect();
448 let bytes: Vec<u8> = delims.iter().map(|d| d.byte).collect();
449
450 assert!(bytes.contains(&b'<'));
452 assert!(bytes.contains(&b'>'));
453 let gt_positions: Vec<usize> = delims
455 .iter()
456 .filter(|d| d.byte == b'>')
457 .map(|d| d.pos)
458 .collect();
459 assert!(
460 gt_positions.contains(&17),
461 "closing > at 17 should be structural"
462 );
463 }
464
465 #[test]
470 fn empty_input() {
471 let indexer = StructuralIndexer::new();
472 let index = indexer.index(b"");
473
474 assert_eq!(index.input_len(), 0);
475 assert_eq!(index.block_count(), 0);
476 assert_eq!(index.iter_delimiters().count(), 0);
477 assert_eq!(index.estimated_token_count(), 1);
478 }
479
480 #[test]
481 fn text_only_no_delimiters() {
482 let input = b"hello world this is plain text without any tags";
483 let indexer = StructuralIndexer::new();
484 let index = indexer.index(input);
485
486 assert_eq!(index.iter_delimiters().count(), 0);
487 assert_eq!(index.estimated_token_count(), 1);
488 }
489
490 #[test]
491 fn tag_only() {
492 let input = b"<br/>";
493 let indexer = StructuralIndexer::new();
494 let index = indexer.index(input);
495
496 let delims: Vec<_> = index.iter_delimiters().collect();
497 assert_eq!(
498 delims,
499 vec![
500 DelimiterEntry { pos: 0, byte: b'<' },
501 DelimiterEntry { pos: 3, byte: b'/' },
502 DelimiterEntry { pos: 4, byte: b'>' },
503 ]
504 );
505 }
506
507 #[test]
508 fn entity_reference() {
509 let input = b"a & b";
510 let indexer = StructuralIndexer::new();
511 let index = indexer.index(input);
512
513 let amp_entries: Vec<_> = index.iter_delimiters().filter(|d| d.byte == b'&').collect();
514 assert_eq!(amp_entries.len(), 1);
515 assert_eq!(amp_entries[0].pos, 2);
516 }
517
518 #[test]
523 fn long_input_multiple_blocks() {
524 let mut input = Vec::with_capacity(1200);
526 input.extend_from_slice(&[b'x'; 100]);
528 input.extend_from_slice(b"<div>");
530 input.extend_from_slice(&[b'y'; 200]);
532 input.extend_from_slice(b"<span class=\"test\">");
534 input.extend_from_slice(&[b'z'; 700]);
536 input.extend_from_slice(b"</span></div>");
538
539 let indexer = StructuralIndexer::new();
540 let index = indexer.index(&input);
541
542 assert!(input.len() > 1000);
543 assert!(index.block_count() > 15);
544
545 let delims: Vec<_> = index.iter_delimiters().collect();
546
547 assert!(
549 delims.iter().any(|d| d.pos == 100 && d.byte == b'<'),
550 "should find < at offset 100"
551 );
552
553 assert!(
557 delims.iter().any(|d| d.byte == b'='),
558 "should find = in attribute"
559 );
560
561 let last_lt = delims.iter().rev().find(|d| d.byte == b'<');
563 assert!(last_lt.is_some());
564 }
565
566 #[test]
567 fn long_input_with_quotes_spanning_blocks() {
568 let mut input = Vec::new();
570 input.extend_from_slice(b"<div data=\"");
571 while input.len() < 60 {
573 input.push(b'a');
574 }
575 input.push(b'<');
577 while input.len() < 80 {
579 input.push(b'b');
580 }
581 input.extend_from_slice(b"\">end</div>");
583
584 let indexer = StructuralIndexer::new();
585 let index = indexer.index(&input);
586
587 let delims: Vec<_> = index.iter_delimiters().collect();
588 let lt_positions: Vec<usize> = delims
589 .iter()
590 .filter(|d| d.byte == b'<')
591 .map(|d| d.pos)
592 .collect();
593
594 assert!(
596 !lt_positions.contains(&60),
597 "< at offset 60 should be masked (inside quoted string)"
598 );
599
600 assert!(lt_positions.contains(&0), "opening < should be structural");
602 }
603
604 #[test]
609 fn estimated_token_count_basic() {
610 let input = b"<div>hello</div><p>world</p>";
611 let indexer = StructuralIndexer::new();
612 let index = indexer.index(input);
613
614 assert_eq!(index.estimated_token_count(), 5);
616 }
617
618 #[test]
623 fn block_api() {
624 let input = b"<div>text</div>";
625 let indexer = StructuralIndexer::new();
626 let index = indexer.index(input);
627
628 assert_eq!(index.input_len(), 15);
629 assert_eq!(index.block_count(), 1);
630
631 let block = index.block(0);
632 assert_ne!(block.lt, 0, "should have < bits set");
633 assert_ne!(block.gt, 0, "should have > bits set");
634 }
635}