succinctly 0.7.0

High-performance succinct data structures for Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
//! NEON-accelerated string scanning for YAML parsing on ARM64.
//!
//! Uses 128-bit NEON vectors to process 16 bytes at a time.
//!
//! ## Broadword (SWAR) Operations
//!
//! For bulk character classification, we use pure broadword arithmetic instead of
//! NEON intrinsics. This avoids the expensive `neon_movemask` emulation which requires
//! SIMD→scalar lane extractions and multiplication tricks (~10 instructions per call).
//!
//! Broadword operations process 8 bytes at a time using u64 arithmetic:
//! - XOR with broadcast byte to find matches (matching bytes become 0x00)
//! - Use `(x - 0x0101...) & ~x & 0x8080...` trick to detect zero bytes
//! - Extract high bits via shift and mask
//!
//! This approach is ~5x faster than calling `neon_movemask` 8 times for classification.

use core::arch::aarch64::*;

/// Extract a bitmask from the high bit of each byte in a NEON vector.
/// Returns a u16 where bit i is set if byte i has its high bit set.
///
/// Uses the optimized multiplication trick (same as JSON SIMD).
#[inline]
#[target_feature(enable = "neon")]
unsafe fn neon_movemask(v: uint8x16_t) -> u16 {
    // Step 1: Shift right by 7 to get 0 or 1 in each byte
    let high_bits = vshrq_n_u8::<7>(v);

    // Step 2: Extract the 16 bytes as two u64 values
    let low_u64 = vgetq_lane_u64::<0>(vreinterpretq_u64_u8(high_bits));
    let high_u64 = vgetq_lane_u64::<1>(vreinterpretq_u64_u8(high_bits));

    // Step 3: Pack 8 bytes into 8 bits using multiplication trick
    const MAGIC: u64 = 0x0102040810204080;
    let low_packed = (low_u64.wrapping_mul(MAGIC) >> 56) as u8;
    let high_packed = (high_u64.wrapping_mul(MAGIC) >> 56) as u8;

    (low_packed as u16) | ((high_packed as u16) << 8)
}

/// Find the next double-quote or backslash using NEON.
///
/// Returns offset from `start` to the found character, or `None` if not found.
#[inline]
pub fn find_quote_or_escape_neon(input: &[u8], start: usize, end: usize) -> Option<usize> {
    // SAFETY: We check bounds and target_arch = aarch64 guarantees NEON
    unsafe { find_quote_or_escape_neon_impl(input, start, end) }
}

#[target_feature(enable = "neon")]
unsafe fn find_quote_or_escape_neon_impl(input: &[u8], start: usize, end: usize) -> Option<usize> {
    let len = end - start;
    let data = &input[start..end];
    let mut offset = 0;

    // Process 16-byte chunks
    let quote_vec = vdupq_n_u8(b'"');
    let backslash_vec = vdupq_n_u8(b'\\');

    while offset + 16 <= len {
        let chunk = vld1q_u8(data.as_ptr().add(offset));

        // Compare against both targets
        let quotes = vceqq_u8(chunk, quote_vec);
        let backslashes = vceqq_u8(chunk, backslash_vec);

        // OR the results
        let matches = vorrq_u8(quotes, backslashes);

        // Extract bitmask
        let mask = neon_movemask(matches);

        if mask != 0 {
            // Found a match - return position of first match
            return Some(offset + mask.trailing_zeros() as usize);
        }

        offset += 16;
    }

    // Handle remaining bytes with iterator
    data[offset..]
        .iter()
        .position(|&b| b == b'"' || b == b'\\')
        .map(|pos| offset + pos)
}

/// Find the next single-quote using NEON.
///
/// Returns offset from `start` to the found character, or `None` if not found.
#[inline]
pub fn find_single_quote_neon(input: &[u8], start: usize, end: usize) -> Option<usize> {
    // SAFETY: We check bounds and target_arch = aarch64 guarantees NEON
    unsafe { find_single_quote_neon_impl(input, start, end) }
}

#[target_feature(enable = "neon")]
unsafe fn find_single_quote_neon_impl(input: &[u8], start: usize, end: usize) -> Option<usize> {
    let len = end - start;
    let data = &input[start..end];
    let mut offset = 0;

    // Process 16-byte chunks
    let quote_vec = vdupq_n_u8(b'\'');

    while offset + 16 <= len {
        let chunk = vld1q_u8(data.as_ptr().add(offset));

        // Compare against single quote
        let matches = vceqq_u8(chunk, quote_vec);

        // Extract bitmask
        let mask = neon_movemask(matches);

        if mask != 0 {
            // Found a match - return position of first match
            return Some(offset + mask.trailing_zeros() as usize);
        }

        offset += 16;
    }

    // Handle remaining bytes with iterator
    data[offset..]
        .iter()
        .position(|&b| b == b'\'')
        .map(|pos| offset + pos)
}

/// Count leading spaces (indentation) using NEON.
///
/// Returns the number of consecutive space characters starting at `start`.
#[inline]
pub fn count_leading_spaces_neon(input: &[u8], start: usize) -> usize {
    // SAFETY: target_arch = aarch64 guarantees NEON
    unsafe { count_leading_spaces_neon_impl(input, start) }
}

#[target_feature(enable = "neon")]
unsafe fn count_leading_spaces_neon_impl(input: &[u8], start: usize) -> usize {
    let data = &input[start..];
    let len = data.len();
    let mut offset = 0;

    let space_vec = vdupq_n_u8(b' ');

    // Process 16-byte chunks
    while offset + 16 <= len {
        let chunk = vld1q_u8(data.as_ptr().add(offset));

        // Compare against space
        let matches = vceqq_u8(chunk, space_vec);

        // Extract bitmask (1 bit per byte where match occurred)
        let mask = neon_movemask(matches);

        if mask != 0xFFFF {
            // Found a non-space - count trailing ones (consecutive spaces from start)
            // Invert mask: 1s become 0s where spaces were, then count trailing zeros
            return offset + (!mask).trailing_zeros() as usize;
        }

        offset += 16;
    }

    // Handle remaining bytes
    offset + data[offset..].iter().take_while(|&&b| b == b' ').count()
}

// ============================================================================
// Broadword (SWAR) Character Classification
// ============================================================================
//
// These functions use pure u64 arithmetic to classify characters without
// the expensive NEON movemask emulation. Process 8 bytes at a time.

/// Broadcast a byte to all 8 positions in a u64.
#[inline(always)]
const fn broadcast_byte(b: u8) -> u64 {
    0x0101010101010101u64 * (b as u64)
}

/// Constants for broadword zero-byte detection.
const LO_BYTES: u64 = 0x0101010101010101u64;
const HI_BYTES: u64 = 0x8080808080808080u64;

/// Detect which bytes in `x` are zero using the classic broadword trick.
/// Returns a u64 where the high bit of each byte is set if that byte was zero.
///
/// Algorithm: `(x - 0x0101...) & ~x & 0x8080...`
/// - For zero bytes: subtraction causes borrow, setting high bit
/// - For non-zero bytes: either no borrow, or high bit was already set
#[inline(always)]
const fn has_zero_byte(x: u64) -> u64 {
    x.wrapping_sub(LO_BYTES) & !x & HI_BYTES
}

/// Find bytes equal to `target` in `x`.
/// Returns a u64 where the high bit of each byte is set if that byte equals target.
#[inline(always)]
const fn find_byte(x: u64, target: u8) -> u64 {
    has_zero_byte(x ^ broadcast_byte(target))
}

/// Extract a bitmask from the high bits of each byte in a u64.
/// Returns a u8 where bit i is set if byte i has its high bit set.
///
/// Uses multiplication trick: multiply by magic constant to gather bits.
/// After `has_zero_byte`, matching bytes have high bit set at positions 7, 15, 23, ...
/// We shift right by 7 to get bits at positions 0, 8, 16, ...
/// Then multiply by magic to gather them into bits 56-63.
#[inline(always)]
const fn extract_mask_u64(x: u64) -> u8 {
    // Shift high bits to bit 0 of each byte, then pack via multiplication
    // Magic constant: each byte position contributes to a different result bit
    // Bit at pos 0 -> bit 56, pos 8 -> bit 57, ..., pos 56 -> bit 63
    const MAGIC: u64 = 0x0102040810204080u64;
    ((x >> 7).wrapping_mul(MAGIC) >> 56) as u8
}

/// YAML character classification result using broadword operations.
/// Each field is a u8 bitmask for 8 bytes (one bit per byte position).
///
/// NOTE: Currently unused - broadword integration is disabled. See P4 analysis.
#[derive(Debug, Clone, Copy, Default)]
#[allow(dead_code)]
pub struct YamlCharClassBroadword {
    pub newlines: u8,
    pub colons: u8,
    pub hyphens: u8,
    pub spaces: u8,
    pub quotes_double: u8,
    pub quotes_single: u8,
    pub backslashes: u8,
    pub hash: u8,
}

#[allow(dead_code)]
impl YamlCharClassBroadword {
    /// Check if any structural character was found.
    #[inline(always)]
    pub fn has_any(&self) -> bool {
        (self.newlines
            | self.colons
            | self.hyphens
            | self.spaces
            | self.quotes_double
            | self.quotes_single
            | self.backslashes
            | self.hash)
            != 0
    }

    /// Get mask of all value terminators (for unquoted values).
    /// Terminators: newline, colon, space, hash, comma (flow), brackets (flow)
    #[inline(always)]
    pub fn value_terminators(&self) -> u8 {
        self.newlines | self.colons | self.spaces | self.hash
    }
}

/// Classify 8 bytes at once using pure broadword arithmetic.
///
/// This is the core optimization: instead of 8 NEON movemask calls (~80 instructions),
/// we use ~24 arithmetic operations to classify all 8 character types.
///
/// Returns `None` if fewer than 8 bytes remain.
///
/// NOTE: Currently unused - broadword integration is disabled. See P4 analysis.
#[inline]
#[allow(dead_code)]
pub fn classify_yaml_chars_broadword(
    input: &[u8],
    offset: usize,
) -> Option<YamlCharClassBroadword> {
    if offset + 8 > input.len() {
        return None;
    }

    // Load 8 bytes as a u64 (unaligned load is fine on ARM64)
    let chunk = u64::from_le_bytes(input[offset..offset + 8].try_into().unwrap());

    // Find each character type using broadword operations
    let newlines = find_byte(chunk, b'\n');
    let colons = find_byte(chunk, b':');
    let hyphens = find_byte(chunk, b'-');
    let spaces = find_byte(chunk, b' ');
    let quotes_double = find_byte(chunk, b'"');
    let quotes_single = find_byte(chunk, b'\'');
    let backslashes = find_byte(chunk, b'\\');
    let hash = find_byte(chunk, b'#');

    Some(YamlCharClassBroadword {
        newlines: extract_mask_u64(newlines),
        colons: extract_mask_u64(colons),
        hyphens: extract_mask_u64(hyphens),
        spaces: extract_mask_u64(spaces),
        quotes_double: extract_mask_u64(quotes_double),
        quotes_single: extract_mask_u64(quotes_single),
        backslashes: extract_mask_u64(backslashes),
        hash: extract_mask_u64(hash),
    })
}

/// Classify 16 bytes using two broadword operations.
///
/// Returns combined u16 masks (low 8 bits from first chunk, high 8 from second).
/// Returns `None` if fewer than 16 bytes remain.
///
/// NOTE: Currently unused - broadword integration is disabled. See P4 analysis.
#[derive(Debug, Clone, Copy, Default)]
#[allow(dead_code)]
pub struct YamlCharClass16 {
    pub newlines: u16,
    pub colons: u16,
    pub hyphens: u16,
    pub spaces: u16,
    pub quotes_double: u16,
    pub quotes_single: u16,
    pub backslashes: u16,
    pub hash: u16,
}

#[allow(dead_code)]
impl YamlCharClass16 {
    /// Get mask of value terminators.
    #[inline(always)]
    pub fn value_terminators(&self) -> u16 {
        self.newlines | self.colons | self.spaces | self.hash
    }
}

/// Classify 16 bytes at once using two broadword operations.
#[inline]
pub fn classify_yaml_chars_16(input: &[u8], offset: usize) -> Option<YamlCharClass16> {
    if offset + 16 > input.len() {
        return None;
    }

    // Load two 8-byte chunks
    let chunk0 = u64::from_le_bytes(input[offset..offset + 8].try_into().unwrap());
    let chunk1 = u64::from_le_bytes(input[offset + 8..offset + 16].try_into().unwrap());

    // Process both chunks for each character type
    #[inline(always)]
    fn classify_both(c0: u64, c1: u64, target: u8) -> u16 {
        let m0 = extract_mask_u64(find_byte(c0, target)) as u16;
        let m1 = extract_mask_u64(find_byte(c1, target)) as u16;
        m0 | (m1 << 8)
    }

    Some(YamlCharClass16 {
        newlines: classify_both(chunk0, chunk1, b'\n'),
        colons: classify_both(chunk0, chunk1, b':'),
        hyphens: classify_both(chunk0, chunk1, b'-'),
        spaces: classify_both(chunk0, chunk1, b' '),
        quotes_double: classify_both(chunk0, chunk1, b'"'),
        quotes_single: classify_both(chunk0, chunk1, b'\''),
        backslashes: classify_both(chunk0, chunk1, b'\\'),
        hash: classify_both(chunk0, chunk1, b'#'),
    })
}

/// Find the next newline using broadword operations.
///
/// Returns offset from `start` to the newline, or `None` if not found.
#[inline]
pub fn find_newline_broadword(input: &[u8], start: usize) -> Option<usize> {
    let data = &input[start..];
    let len = data.len();
    let mut offset = 0;

    // Process 8 bytes at a time
    while offset + 8 <= len {
        let chunk = u64::from_le_bytes(data[offset..offset + 8].try_into().unwrap());
        let matches = find_byte(chunk, b'\n');

        if matches != 0 {
            // Found a newline - return position of first match
            // Each matching byte has its high bit set, so count trailing zeros / 8
            return Some(offset + (matches.trailing_zeros() / 8) as usize);
        }

        offset += 8;
    }

    // Handle remaining bytes
    data[offset..]
        .iter()
        .position(|&b| b == b'\n')
        .map(|pos| offset + pos)
}

// ============================================================================
// P4 Optimization: Anchor/Alias SIMD Parsing
// ============================================================================

/// Parse anchor/alias name using NEON SIMD to find terminator characters.
///
/// Searches for YAML anchor name terminators:
/// - Whitespace: space, tab, newline, CR
/// - Flow indicators: [ ] { } ,
/// - Colons (terminates anchor names)
///
/// Returns the position of the first terminator, or end of input.
#[inline]
pub fn parse_anchor_name_neon(input: &[u8], start: usize) -> usize {
    if start >= input.len() {
        return start;
    }

    // Use NEON for 16+ bytes
    if start + 16 <= input.len() {
        // SAFETY: NEON is mandatory on aarch64
        unsafe { parse_anchor_name_neon_impl(input, start) }
    } else {
        parse_anchor_name_scalar(input, start)
    }
}

#[target_feature(enable = "neon")]
unsafe fn parse_anchor_name_neon_impl(input: &[u8], start: usize) -> usize {
    let len = input.len();
    let mut pos = start;

    // Create comparison vectors for terminator characters
    // Note: We don't include colon here because it's only a terminator
    // if followed by whitespace. We'll check that in the scalar fallback.
    let space = vdupq_n_u8(b' ');
    let tab = vdupq_n_u8(b'\t');
    let newline = vdupq_n_u8(b'\n');
    let cr = vdupq_n_u8(b'\r');
    let lbracket = vdupq_n_u8(b'[');
    let rbracket = vdupq_n_u8(b']');
    let lbrace = vdupq_n_u8(b'{');
    let rbrace = vdupq_n_u8(b'}');
    let comma = vdupq_n_u8(b',');
    let colon = vdupq_n_u8(b':');

    // Process 16 bytes at a time
    while pos + 16 <= len {
        let chunk = vld1q_u8(input.as_ptr().add(pos));

        // Check for all terminator types (except colon which needs special handling)
        let is_space = vceqq_u8(chunk, space);
        let is_tab = vceqq_u8(chunk, tab);
        let is_newline = vceqq_u8(chunk, newline);
        let is_cr = vceqq_u8(chunk, cr);
        let is_lbracket = vceqq_u8(chunk, lbracket);
        let is_rbracket = vceqq_u8(chunk, rbracket);
        let is_lbrace = vceqq_u8(chunk, lbrace);
        let is_rbrace = vceqq_u8(chunk, rbrace);
        let is_comma = vceqq_u8(chunk, comma);
        let is_colon = vceqq_u8(chunk, colon);

        // Combine all terminator checks (whitespace and flow indicators are definite terminators)
        let ws = vorrq_u8(is_space, is_tab);
        let ws = vorrq_u8(ws, is_newline);
        let ws = vorrq_u8(ws, is_cr);

        let flow = vorrq_u8(is_lbracket, is_rbracket);
        let flow = vorrq_u8(flow, is_lbrace);
        let flow = vorrq_u8(flow, is_rbrace);
        let flow = vorrq_u8(flow, is_comma);

        let definite_terminators = vorrq_u8(ws, flow);

        // Check for definite terminators first
        let definite_mask = neon_movemask(definite_terminators);
        let colon_mask = neon_movemask(is_colon);

        if definite_mask != 0 || colon_mask != 0 {
            // Found potential terminator - need to check each position
            let combined_mask = definite_mask | colon_mask;
            let first_pos = combined_mask.trailing_zeros() as usize;

            // If it's a definite terminator, return immediately
            if (definite_mask >> first_pos) & 1 != 0 {
                return pos + first_pos;
            }

            // It's a colon - check if followed by whitespace
            let colon_pos = pos + first_pos;
            if colon_pos + 1 < len {
                let next = input[colon_pos + 1];
                if next == b' ' || next == b'\t' || next == b'\n' || next == b'\r' {
                    return colon_pos;
                }
            }

            // Colon not followed by whitespace - continue scanning from colon_pos + 1
            // Use scalar to handle the complex colon logic correctly
            return parse_anchor_name_scalar(input, colon_pos + 1);
        }

        pos += 16;
    }

    // Handle remaining bytes with scalar fallback
    parse_anchor_name_scalar(input, pos)
}

/// Scalar fallback for parsing anchor names.
fn parse_anchor_name_scalar(input: &[u8], start: usize) -> usize {
    let mut pos = start;
    while pos < input.len() {
        let b = input[pos];
        match b {
            // Stop at flow indicators, whitespace, and newlines
            b' ' | b'\t' | b'\n' | b'\r' | b'[' | b']' | b'{' | b'}' | b',' => break,
            // Colon is allowed in anchor names if not followed by whitespace
            b':' => {
                if pos + 1 < input.len() {
                    let next = input[pos + 1];
                    if next == b' ' || next == b'\t' || next == b'\n' || next == b'\r' {
                        break;
                    }
                }
                pos += 1;
            }
            _ => pos += 1,
        }
    }
    pos
}

// ============================================================================
// Issue #87: JSON Escape Scanning for Streaming Output
// ============================================================================

/// Find the next JSON escapable character using NEON.
///
/// Searches for characters that need escaping in JSON strings:
/// - Double quote (`"`)
/// - Backslash (`\`)
/// - Control characters (bytes < 0x20)
///
/// Returns the offset from `start` to the found character, or the length
/// of the slice if no escapable character is found.
///
/// This is used for fast-path scanning in `write_json_string` during
/// YAML→JSON streaming output.
#[inline(always)]
pub fn find_json_escape_neon(bytes: &[u8], start: usize) -> usize {
    if start >= bytes.len() {
        return bytes.len();
    }

    // Use NEON for 16+ bytes, otherwise scalar is faster
    if bytes.len() - start >= 16 {
        // SAFETY: NEON is mandatory on aarch64
        unsafe { find_json_escape_neon_impl(bytes, start) }
    } else {
        find_json_escape_scalar(bytes, start)
    }
}

/// Scalar implementation for short strings
#[inline(always)]
fn find_json_escape_scalar(bytes: &[u8], start: usize) -> usize {
    for (i, &b) in bytes[start..].iter().enumerate() {
        if b == b'"' || b == b'\\' || b < 0x20 {
            return start + i;
        }
    }
    bytes.len()
}

#[target_feature(enable = "neon")]
unsafe fn find_json_escape_neon_impl(bytes: &[u8], start: usize) -> usize {
    let len = bytes.len();
    let data = &bytes[start..];
    let data_len = data.len();
    let mut offset = 0;

    // Create comparison vectors
    let quote_vec = vdupq_n_u8(b'"');
    let backslash_vec = vdupq_n_u8(b'\\');
    let control_threshold = vdupq_n_u8(0x20);

    // Process 16-byte chunks
    while offset + 16 <= data_len {
        let chunk = vld1q_u8(data.as_ptr().add(offset));

        // Check for quote and backslash
        let quotes = vceqq_u8(chunk, quote_vec);
        let backslashes = vceqq_u8(chunk, backslash_vec);

        // Check for control characters (bytes < 0x20)
        let controls = vcltq_u8(chunk, control_threshold);

        // Combine all matches
        let matches = vorrq_u8(vorrq_u8(quotes, backslashes), controls);

        // Extract bitmask
        let mask = neon_movemask(matches);

        if mask != 0 {
            // Found a match - return absolute position
            return start + offset + mask.trailing_zeros() as usize;
        }

        offset += 16;
    }

    // Handle remaining bytes with scalar fallback
    for i in offset..data_len {
        let b = data[i];
        if b == b'"' || b == b'\\' || b < 0x20 {
            return start + i;
        }
    }

    // No escapable character found
    len
}

// ============================================================================
// P2.7 Optimization: Block Scalar SIMD Parsing
// ============================================================================

/// Find the end of a block scalar using NEON SIMD.
///
/// Scans for newlines and checks indentation on each line.
/// Returns the position where the block ends (start of line with insufficient indent),
/// or input.len() if EOF is reached.
#[inline]
pub fn find_block_scalar_end_neon(input: &[u8], start: usize, min_indent: usize) -> usize {
    if start >= input.len() {
        return input.len();
    }

    // SAFETY: NEON is mandatory on aarch64
    unsafe { find_block_scalar_end_neon_impl(input, start, min_indent) }
}

#[target_feature(enable = "neon")]
unsafe fn find_block_scalar_end_neon_impl(input: &[u8], start: usize, min_indent: usize) -> usize {
    let newline_vec = vdupq_n_u8(b'\n');
    let space_vec = vdupq_n_u8(b' ');

    let mut pos = start;

    // Process in 16-byte chunks, looking for newlines
    while pos + 16 < input.len() {
        let chunk = vld1q_u8(input.as_ptr().add(pos));
        let nl_matches = vceqq_u8(chunk, newline_vec);
        let mut nl_mask = neon_movemask(nl_matches);

        if nl_mask != 0 {
            // Found newline(s) in this chunk - check indentation after each
            while nl_mask != 0 {
                let offset = nl_mask.trailing_zeros() as usize;
                let line_start = pos + offset + 1; // Position after newline

                if line_start >= input.len() {
                    return input.len(); // EOF
                }

                // Count leading spaces on next line
                let mut indent = 0;
                let remaining = input.len() - line_start;

                // Use SIMD to count spaces if we have 16+ bytes
                if remaining >= 16 {
                    let next_chunk = vld1q_u8(input.as_ptr().add(line_start));
                    let space_matches = vceqq_u8(next_chunk, space_vec);
                    let space_mask = neon_movemask(space_matches);

                    if space_mask != 0xFFFF {
                        indent = (!space_mask).trailing_zeros() as usize;
                    } else {
                        indent = 16;
                        // Continue counting if all 16 were spaces
                        let mut check_pos = line_start + 16;
                        while check_pos < input.len() && input[check_pos] == b' ' {
                            indent += 1;
                            check_pos += 1;
                        }
                    }
                } else {
                    // Less than 16 bytes remaining, count scalar
                    while line_start + indent < input.len() && input[line_start + indent] == b' ' {
                        indent += 1;
                    }
                }

                // Check if this line has sufficient indent
                if line_start + indent < input.len() {
                    let next_char = input[line_start + indent];
                    if next_char != b'\n' && next_char != b'\r' && indent < min_indent {
                        // Content at insufficient indent - block ends here
                        return line_start;
                    }
                }

                // Clear this bit and check next newline
                nl_mask &= nl_mask - 1;
            }
        }

        pos += 16;
    }

    // Handle remainder with scalar code
    find_block_scalar_end_scalar(input, pos, min_indent)
}

/// Scalar fallback for find_block_scalar_end.
fn find_block_scalar_end_scalar(input: &[u8], start: usize, min_indent: usize) -> usize {
    let mut pos = start;

    while pos < input.len() {
        if input[pos] == b'\n' {
            let line_start = pos + 1;

            if line_start >= input.len() {
                return input.len();
            }

            // Count leading spaces
            let mut indent = 0;
            while line_start + indent < input.len() && input[line_start + indent] == b' ' {
                indent += 1;
            }

            // Check if this line has content at insufficient indent
            if line_start + indent < input.len() {
                let next_char = input[line_start + indent];
                if next_char != b'\n' && next_char != b'\r' && indent < min_indent {
                    return line_start;
                }
            }
        }
        pos += 1;
    }

    input.len()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_neon_find_quote_basic() {
        let input = b"hello\"world";
        assert_eq!(find_quote_or_escape_neon(input, 0, input.len()), Some(5));
    }

    #[test]
    fn test_neon_find_backslash() {
        let input = b"hello\\world";
        assert_eq!(find_quote_or_escape_neon(input, 0, input.len()), Some(5));
    }

    #[test]
    fn test_neon_find_single_quote() {
        let input = b"hello'world";
        assert_eq!(find_single_quote_neon(input, 0, input.len()), Some(5));
    }

    #[test]
    fn test_neon_long_string() {
        // Test with > 16 bytes
        let mut input = vec![b'a'; 100];
        input[50] = b'"';
        assert_eq!(find_quote_or_escape_neon(&input, 0, input.len()), Some(50));
    }

    #[test]
    fn test_neon_at_chunk_boundary() {
        // Quote at exactly byte 16 (second chunk)
        let mut input = vec![b'a'; 32];
        input[16] = b'"';
        assert_eq!(find_quote_or_escape_neon(&input, 0, input.len()), Some(16));
    }

    #[test]
    fn test_neon_in_remainder() {
        // Quote in the remainder bytes (< 16)
        let mut input = vec![b'a'; 20];
        input[18] = b'"';
        assert_eq!(find_quote_or_escape_neon(&input, 0, input.len()), Some(18));
    }

    #[test]
    fn test_neon_count_leading_spaces_basic() {
        assert_eq!(count_leading_spaces_neon(b"  hello", 0), 2);
        assert_eq!(count_leading_spaces_neon(b"    world", 0), 4);
        assert_eq!(count_leading_spaces_neon(b"no spaces", 0), 0);
    }

    #[test]
    fn test_neon_count_leading_spaces_long() {
        // Test with > 16 bytes to exercise SIMD path
        let mut input = vec![b' '; 50];
        input.extend_from_slice(b"content");
        assert_eq!(count_leading_spaces_neon(&input, 0), 50);
    }

    #[test]
    fn test_neon_count_leading_spaces_at_boundary() {
        // Spaces ending exactly at 16-byte boundary
        let mut input = vec![b' '; 16];
        input.push(b'x');
        assert_eq!(count_leading_spaces_neon(&input, 0), 16);

        // Spaces ending at 32-byte boundary
        let mut input32 = vec![b' '; 32];
        input32.push(b'x');
        assert_eq!(count_leading_spaces_neon(&input32, 0), 32);
    }

    #[test]
    fn test_neon_count_leading_spaces_in_remainder() {
        // Non-space in remainder bytes (< 16)
        let mut input = vec![b' '; 20];
        input.push(b'x');
        assert_eq!(count_leading_spaces_neon(&input, 0), 20);
    }

    // ========================================================================
    // Broadword tests
    // ========================================================================

    #[test]
    fn test_broadword_find_byte_basic() {
        // Test the core find_byte function
        let data = b"hello:world";
        let chunk = u64::from_le_bytes(data[0..8].try_into().unwrap());
        let colon_mask = find_byte(chunk, b':');
        // Colon is at position 5, so bit 5*8+7 = 47 should be set
        assert_ne!(colon_mask, 0);
        assert_eq!(colon_mask.trailing_zeros() / 8, 5);
    }

    #[test]
    fn test_broadword_classify_basic() {
        let input = b"key: value\n";
        let class = classify_yaml_chars_broadword(input, 0).unwrap();

        // Check specific character positions
        assert_eq!(class.colons, 0b00001000); // colon at position 3
        assert_eq!(class.spaces, 0b00010000); // space at position 4
    }

    #[test]
    fn test_broadword_classify_multiple() {
        let input = b": - # \"\n\\";
        let class = classify_yaml_chars_broadword(input, 0).unwrap();

        assert_ne!(class.colons, 0);
        assert_ne!(class.hyphens, 0);
        assert_ne!(class.hash, 0);
        assert_ne!(class.quotes_double, 0);
        assert_ne!(class.newlines, 0);
    }

    #[test]
    fn test_broadword_classify_16_basic() {
        let input = b"0123456789abcdef";
        let class = classify_yaml_chars_16(input, 0).unwrap();

        // No special characters
        assert_eq!(class.colons, 0);
        assert_eq!(class.newlines, 0);
    }

    #[test]
    fn test_broadword_classify_16_with_matches() {
        let input = b"key: val\nmore: x\n";
        let class = classify_yaml_chars_16(input, 0).unwrap();

        // Colon at position 3 and 13
        assert!(class.colons & (1 << 3) != 0);
        assert!(class.colons & (1 << 13) != 0);

        // Newline at position 8
        assert!(class.newlines & (1 << 8) != 0);
    }

    #[test]
    fn test_broadword_find_newline() {
        let input = b"hello\nworld";
        assert_eq!(find_newline_broadword(input, 0), Some(5));

        let input2 = b"no newline here";
        assert_eq!(find_newline_broadword(input2, 0), None);

        // Long input
        let mut long = vec![b'a'; 100];
        long[50] = b'\n';
        assert_eq!(find_newline_broadword(&long, 0), Some(50));
    }

    #[test]
    fn test_broadword_find_newline_in_remainder() {
        // Newline in bytes after last 8-byte chunk
        let mut input = vec![b'a'; 10];
        input[9] = b'\n';
        assert_eq!(find_newline_broadword(&input, 0), Some(9));
    }

    #[test]
    fn test_broadword_value_terminators() {
        let input = b"value: x";
        let class = classify_yaml_chars_broadword(input, 0).unwrap();

        let terminators = class.value_terminators();
        // Should have colon at 5 and space at 6
        assert!(terminators & (1 << 5) != 0);
        assert!(terminators & (1 << 6) != 0);
    }

    // ========================================================================
    // P4: Anchor/Alias NEON tests
    // ========================================================================

    #[test]
    fn test_parse_anchor_name_basic() {
        // Simple anchor name terminated by space
        assert_eq!(parse_anchor_name_neon(b"anchor_name value", 0), 11);

        // Colon NOT followed by whitespace - NOT a terminator (colon allowed in anchor names)
        assert_eq!(parse_anchor_name_neon(b"anchor:value", 0), 12);

        // Colon followed by space - IS a terminator
        assert_eq!(parse_anchor_name_neon(b"anchor: value", 0), 6);

        // Colon followed by newline - IS a terminator
        assert_eq!(parse_anchor_name_neon(b"anchor:\nvalue", 0), 6);

        // Terminated by newline
        assert_eq!(parse_anchor_name_neon(b"anchor\nvalue", 0), 6);

        // Terminated by tab
        assert_eq!(parse_anchor_name_neon(b"anchor\tvalue", 0), 6);
    }

    #[test]
    fn test_parse_anchor_name_flow_indicators() {
        // Terminated by flow indicators
        assert_eq!(parse_anchor_name_neon(b"anchor[0]", 0), 6);
        assert_eq!(parse_anchor_name_neon(b"anchor]end", 0), 6);
        assert_eq!(parse_anchor_name_neon(b"anchor{key}", 0), 6);
        assert_eq!(parse_anchor_name_neon(b"anchor}end", 0), 6);
        assert_eq!(parse_anchor_name_neon(b"anchor,next", 0), 6);
    }

    #[test]
    fn test_parse_anchor_name_long() {
        // Long anchor name (>16 bytes to exercise SIMD path)
        let mut input = vec![b'a'; 50];
        input.push(b' ');
        input.extend_from_slice(b"value");
        assert_eq!(parse_anchor_name_neon(&input, 0), 50);
    }

    #[test]
    fn test_parse_anchor_name_no_terminator() {
        // No terminator - should return end of input
        assert_eq!(parse_anchor_name_neon(b"anchor_name", 0), 11);
    }

    #[test]
    fn test_parse_anchor_name_with_offset() {
        // Start from offset
        assert_eq!(parse_anchor_name_neon(b"&anchor_name value", 1), 12);
    }

    // ========================================================================
    // P2.7: Block Scalar NEON tests
    // ========================================================================

    #[test]
    fn test_find_block_scalar_end_basic() {
        // Block scalar with proper indentation
        let input = b"|\n  line1\n  line2\nnext_key:";
        // min_indent=2, so "next_key:" (indent=0) should terminate
        let result = find_block_scalar_end_neon(input, 2, 2);
        assert_eq!(result, 18); // Position of 'n' in "next_key"
    }

    #[test]
    fn test_find_block_scalar_end_eof() {
        // Block scalar that ends at EOF
        let input = b"|\n  line1\n  line2";
        let result = find_block_scalar_end_neon(input, 2, 2);
        assert_eq!(result, input.len());
    }

    #[test]
    fn test_find_block_scalar_end_long() {
        // Long block scalar (>16 bytes per line to exercise SIMD path)
        let mut input = b"|\n".to_vec();
        for _ in 0..5 {
            input.extend_from_slice(b"  ");
            input.extend_from_slice(&[b'x'; 20]);
            input.push(b'\n');
        }
        input.extend_from_slice(b"next:");

        let result = find_block_scalar_end_neon(&input, 2, 2);
        // Should find "next:" at the end
        assert_eq!(result, input.len() - 5);
    }

    #[test]
    fn test_find_block_scalar_end_empty_lines() {
        // Empty lines should be ignored
        let input = b"|\n  line1\n\n  line2\nnext:";
        let result = find_block_scalar_end_neon(input, 2, 2);
        assert_eq!(result, 19); // Position of 'n' in "next:"
    }

    #[test]
    fn test_find_block_scalar_matches_scalar() {
        // Compare NEON vs scalar for various inputs
        let test_cases: &[(&[u8], usize)] = &[
            (b"|\n  line1\n  line2\nnext:", 2),
            (b"|\n    deep\n    indent\nshallow:", 4),
            (b"|\n  a\n  b\n  c\n", 2),
        ];

        for &(input, min_indent) in test_cases {
            let neon_result = find_block_scalar_end_neon(input, 2, min_indent);
            let scalar_result = find_block_scalar_end_scalar(input, 2, min_indent);
            assert_eq!(
                neon_result,
                scalar_result,
                "Mismatch for input {:?} with min_indent={}",
                String::from_utf8_lossy(input),
                min_indent
            );
        }
    }

    // ========================================================================
    // Issue #87: JSON Escape Scanning NEON tests
    // ========================================================================

    #[test]
    fn test_find_json_escape_quote() {
        let input = b"hello\"world";
        assert_eq!(find_json_escape_neon(input, 0), 5);
    }

    #[test]
    fn test_find_json_escape_backslash() {
        let input = b"hello\\world";
        assert_eq!(find_json_escape_neon(input, 0), 5);
    }

    #[test]
    fn test_find_json_escape_control_char() {
        // Tab is 0x09 (< 0x20)
        let input = b"hello\tworld";
        assert_eq!(find_json_escape_neon(input, 0), 5);

        // Newline is 0x0A
        let input2 = b"hello\nworld";
        assert_eq!(find_json_escape_neon(input2, 0), 5);

        // Null byte
        let input3 = b"hello\x00world";
        assert_eq!(find_json_escape_neon(input3, 0), 5);
    }

    #[test]
    fn test_find_json_escape_no_escape() {
        let input = b"hello world";
        assert_eq!(find_json_escape_neon(input, 0), input.len());
    }

    #[test]
    fn test_find_json_escape_long_string() {
        // Test with > 16 bytes to exercise SIMD path
        let mut input = vec![b'a'; 100];
        input[50] = b'"';
        assert_eq!(find_json_escape_neon(&input, 0), 50);
    }

    #[test]
    fn test_find_json_escape_at_chunk_boundary() {
        // Escape at exactly byte 16 (second chunk)
        let mut input = vec![b'a'; 32];
        input[16] = b'"';
        assert_eq!(find_json_escape_neon(&input, 0), 16);
    }

    #[test]
    fn test_find_json_escape_in_remainder() {
        // Escape in the remainder bytes (< 16)
        let mut input = vec![b'a'; 20];
        input[18] = b'\\';
        assert_eq!(find_json_escape_neon(&input, 0), 18);
    }

    #[test]
    fn test_find_json_escape_with_offset() {
        let input = b"abc\"def\"ghi";
        assert_eq!(find_json_escape_neon(input, 0), 3);
        assert_eq!(find_json_escape_neon(input, 4), 7);
    }

    #[test]
    fn test_find_json_escape_control_chars_throughout() {
        // Test various control chars at different positions
        for ctrl in 0u8..0x20 {
            let mut input = vec![b'x'; 50];
            input[25] = ctrl;
            assert_eq!(
                find_json_escape_neon(&input, 0),
                25,
                "Failed for control char 0x{:02x}",
                ctrl
            );
        }
    }

    #[test]
    fn test_find_json_escape_empty() {
        assert_eq!(find_json_escape_neon(b"", 0), 0);
    }

    #[test]
    fn test_find_json_escape_start_past_end() {
        let input = b"hello";
        assert_eq!(find_json_escape_neon(input, 10), input.len());
    }

    /// Scalar reference implementation for comparison testing
    fn find_json_escape_scalar(bytes: &[u8], start: usize) -> usize {
        for (i, &b) in bytes[start..].iter().enumerate() {
            if b == b'"' || b == b'\\' || b < 0x20 {
                return start + i;
            }
        }
        bytes.len()
    }

    #[test]
    fn test_find_json_escape_matches_scalar() {
        let test_cases: &[&[u8]] = &[
            b"",
            b"\"",
            b"\\",
            b"\t",
            b"\n",
            b"\r",
            b"\x00",
            b"no escape chars here",
            b"escape at end\"",
            b"\"escape at start",
            b"has\\backslash",
            b"has\ttab",
            b"has\nnewline",
            b"multiple \"escapes\" here\\",
            // Long strings
            &[b'x'; 100],
        ];

        for &input in test_cases {
            let scalar = find_json_escape_scalar(input, 0);
            let neon = find_json_escape_neon(input, 0);
            assert_eq!(
                scalar,
                neon,
                "Mismatch for {:?}: scalar={}, neon={}",
                String::from_utf8_lossy(input),
                scalar,
                neon
            );
        }

        // Test with various offsets
        let input = b"abc\"def\\ghi\tjkl";
        for start in 0..input.len() {
            let scalar = find_json_escape_scalar(input, start);
            let neon = find_json_escape_neon(input, start);
            assert_eq!(
                scalar, neon,
                "Mismatch at offset {}: scalar={}, neon={}",
                start, scalar, neon
            );
        }
    }
}