relon-unicode 0.1.0-rc2

Leaf Unicode tables, case-folding / normalization algorithms, and the glob matcher shared across Relon crates.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// Module-local opt-in: the wasm32 SIMD intrinsics (`v128_load` /
// `v128_store`) used in the `+simd128` arm are `unsafe fn` in
// `core::arch::wasm32`. Every `unsafe` block in this module has a
// SAFETY comment justifying it; the crate-wide policy stays `deny`.
#![allow(unsafe_code)]

//! v3++ item 4 — SIMD ASCII fast-path for case folding.
//!
//! Tree-walk `fold_string` (in `relon-evaluator`) and the IR-level
//! `case_fold_body_inner` body share the same UAX #21 semantics: full
//! multi-cp mappings, Greek final-sigma context, Turkish locale
//! overrides, and combining-mark passthrough. For the pure-ASCII
//! fast majority of real-world configs (`String.upper()` over
//! identifiers, environment names, English filenames, etc.) the
//! per-codepoint UTF-8 decode + per-cp table lookup is wasted work:
//! `b'A'..=b'Z'` (0x41..=0x5A) deterministically maps to `b'a'..=b'z'`
//! via `b | 0x20` and the reverse via `b & 0xDF`, byte-identical with
//! what the full path would produce on those bytes.
//!
//! This module exposes two helpers that the slow path calls
//! before entering its codepoint loop:
//!
//! * [`scan_ascii_prefix`] — finds the first non-ASCII byte (`>= 0x80`)
//!   in the input. Returns `input.len()` if the whole string is ASCII.
//!   The implementation prefers wasm32 v128 lane-mask when compiled
//!   with `+simd128`; otherwise it falls back to an 8-byte chunked
//!   scalar scan that LLVM auto-vectorises on x86_64 + aarch64.
//!
//! * [`fold_ascii_prefix`] — given an ASCII prefix `&[u8]` and a
//!   [`AsciiFoldMode`], writes the case-folded bytes into a caller-
//!   supplied `Vec<u8>`. Upper / Lower run in straight SIMD masks
//!   (no branches per byte). Title walks the prefix byte-by-byte to
//!   track word boundaries — title casing in ASCII reduces to: at
//!   each word start, uppercase the byte; otherwise lowercase.
//!
//! ## Why this lives in `relon-ir`
//!
//! The IR crate is the cross-cutting dependency reached by both the
//! tree-walk evaluator and the cranelift-AOT codegen. Tree-walk calls
//! this helper directly (the only consumer wired up in v3++ item 4).
//! cranelift-AOT keeps emitting its IR body unchanged — embedding
//! SIMD intrinsics into the IR Op surface would require new
//! IR Op variants + a backend-specific emit, which is out of scope
//! for the perf-only item 4. The pure-Rust helper still benefits
//! native consumers via the regular Rust ABI.
//!
//! ## Why no third-party SIMD crate
//!
//! `wide`, `packed_simd`, `simdeez`, etc. would each pull a new
//! dependency through `relon-ir`, which is the dep root for half the
//! workspace. The wasm32 + native shapes we need are tiny (one mask,
//! one compare-and-xor) and the standard library's `core::arch`
//! intrinsics for wasm32 are stable (only the raw `v128_load` /
//! `v128_store` memory accesses are `unsafe`). For x86_64 / aarch64 we
//! rely on the LLVM autovectoriser: the scalar loops written here
//! compile to `pshufb` + `por` style sequences on x86_64-v3 and
//! `tbl` + `orr` on aarch64-neon, which is good enough for the
//! 1 KB / 10 KB ASCII throughput target.
//!
//! ## Byte-identical guarantee
//!
//! For any input string `s` that is fully ASCII and any non-Turkish
//! mode, `fold_ascii_prefix(s.as_bytes(), mode, ...)` produces output
//! bytes byte-identical with what the slow `fold_string` path would
//! emit — Turkish is explicitly excluded by callers because
//! `I -> ı` / `i -> İ` would escape to 2-byte UTF-8 outputs. The
//! unit tests in this module assert byte identity against the
//! tree-walk path for randomised ASCII corpora across all three
//! modes; the workspace `three_way_corpus` `stdlib_case_fold` tier
//! continues to assert byte identity across the full UAX #21
//! corpus (8/8 all_agree before and after this change).

/// Fold mode dispatch for the ASCII fast-path. Mirrors the private
/// `CaseFoldMode` in `relon-evaluator`; lifted into `relon-ir` so the
/// evaluator can call into this module without leaking its internal
/// enum upward.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AsciiFoldMode {
    /// Map `a..=z` to `A..=Z`; every other byte is copied unchanged.
    Upper,
    /// Map `A..=Z` to `a..=z`; every other byte is copied unchanged.
    Lower,
    /// Uppercase the first byte after a whitespace byte (or at the
    /// start, when the incoming word-start flag is set) and lowercase
    /// the following bytes up to the next whitespace byte.
    Title,
}

/// Result of the ASCII fast-path. `consumed` is the number of input
/// bytes (and codepoints — they're equal in ASCII) that the fast path
/// wrote to `out`. `at_word_start` carries the Title-mode word
/// boundary state forward for the slow path to continue.
#[derive(Debug, Clone, Copy)]
pub struct AsciiFastResult {
    /// Number of leading input bytes that were folded and appended
    /// to the output buffer; `0` when nothing was written.
    pub consumed: usize,
    /// Word-boundary flag after the last consumed byte. Updated only
    /// in Title mode; Upper / Lower hand the caller's input flag back
    /// unchanged.
    pub at_word_start: bool,
}

/// Locate the first non-ASCII byte (`>= 0x80`) in `bytes`. Returns
/// `bytes.len()` if the entire slice is ASCII.
///
/// This is a thin wrapper: the work is done by whichever
/// `scan_ascii_prefix_impl` the `cfg` attributes select.
///
/// 1. **wasm32 + simd128** — loads `v128` chunks of 16 bytes and
///    takes `i8x16_bitmask`, which gathers the high bit of every
///    lane into a `u16`. A non-zero mask means the chunk holds a
///    non-ASCII byte, and `trailing_zeros` of the mask is its lane
///    index.
/// 2. **everywhere else** — 8-byte chunks via `u64::from_le_bytes`;
///    `x & 0x8080_8080_8080_8080` is non-zero when some byte has its
///    high bit set, and `trailing_zeros / 8` gives that byte's index
///    within the chunk.
///
/// Both variants finish with a byte-at-a-time loop over the tail
/// that is shorter than one chunk.
#[inline]
pub fn scan_ascii_prefix(bytes: &[u8]) -> usize {
    scan_ascii_prefix_impl(bytes)
}

/// wasm32 `+simd128` variant of the ASCII scan: returns the index of
/// the first byte `>= 0x80` in `bytes`, or `bytes.len()` when every
/// byte is ASCII.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn scan_ascii_prefix_impl(bytes: &[u8]) -> usize {
    use core::arch::wasm32::*;
    let len = bytes.len();
    let mut i = 0usize;
    // v128 load is unaligned-safe via `v128_load`. We process 16-byte
    // chunks; the tail (< 16 bytes) falls into the scalar loop below.
    while i + 16 <= len {
        // SAFETY: `i + 16 <= len` is the loop invariant so the 16
        // bytes starting at `bytes.as_ptr().add(i)` are entirely
        // within the slice. `v128_load` is documented to perform an
        // unaligned read (wasm v128 loads have alignment hint 0 by
        // default), so no alignment requirement applies. The cast
        // from `*const u8` to `*const v128` only changes the load
        // width; lifetime is anchored by `bytes`.
        let chunk = unsafe { v128_load(bytes.as_ptr().add(i) as *const v128) };
        // `i8x16_bitmask` returns the high-bit of each lane as a
        // 16-bit mask. ASCII bytes have the high bit clear, non-ASCII
        // bytes have it set, so the mask is exactly what we want.
        let mask = i8x16_bitmask(chunk);
        if mask != 0 {
            // Bit `k` of the mask corresponds to lane `k`, so the
            // lowest set bit is the first non-ASCII byte of the chunk.
            return i + mask.trailing_zeros() as usize;
        }
        i += 16;
    }
    // Tail: fewer than 16 bytes remain, check them one at a time.
    while i < len {
        if bytes[i] >= 0x80 {
            return i;
        }
        i += 1;
    }
    // No high-bit byte anywhere: the whole slice is ASCII.
    len
}

/// Portable variant of the ASCII scan: returns the index of the first
/// byte `>= 0x80` in `bytes`, or `bytes.len()` when there is none.
///
/// Full 8-byte words are tested with a single AND against a mask of
/// the eight high bits; whatever is left over (fewer than 8 bytes) is
/// checked one byte at a time.
#[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
#[inline]
fn scan_ascii_prefix_impl(bytes: &[u8]) -> usize {
    const WORD: usize = 8;
    const HIGH_BITS: u64 = 0x8080_8080_8080_8080;

    let mut words = bytes.chunks_exact(WORD);
    let mut offset = 0usize;
    for word in words.by_ref() {
        let mut raw = [0u8; WORD];
        raw.copy_from_slice(word);
        let flagged = u64::from_le_bytes(raw) & HIGH_BITS;
        if flagged != 0 {
            // Little-endian load: byte `k` of the word occupies bits
            // `8k..8k+8`, so the lowest set bit divided by 8 is the
            // index of the first high-bit byte inside this word.
            return offset + (flagged.trailing_zeros() / 8) as usize;
        }
        offset += WORD;
    }

    // `offset` now points at the start of the sub-word remainder.
    match words.remainder().iter().position(|&b| b >= 0x80) {
        Some(k) => offset + k,
        None => bytes.len(),
    }
}

/// ASCII case-fold for `Upper` / `Lower` mode. Appends
/// `prefix.len()` bytes to `out`. Caller has already verified the
/// prefix is fully ASCII (every byte `< 0x80`) and selected a
/// non-Turkish mode.
///
/// `upper == true` maps `a..=z` to `A..=Z`; `upper == false` maps
/// `A..=Z` to `a..=z`. The per-byte work is delegated to the
/// `fold_ascii_upper_lower_impl` variant chosen by the `cfg`
/// attributes (wasm32 `+simd128` intrinsics, or a scalar loop).
#[inline]
fn fold_ascii_prefix_upper_lower(prefix: &[u8], upper: bool, out: &mut Vec<u8>) {
    // Make room for the appended bytes before the implementation
    // resizes `out` and writes into the new tail.
    out.reserve(prefix.len());
    fold_ascii_upper_lower_impl(prefix, upper, out);
}

/// wasm32 `+simd128` variant of the Upper / Lower fold. Appends
/// `prefix.len()` bytes to `out`: with `upper == true` bytes in
/// `a..=z` have bit 5 flipped (becoming `A..=Z`), with
/// `upper == false` bytes in `A..=Z` have bit 5 flipped (becoming
/// `a..=z`); all other bytes are copied unchanged.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[inline]
fn fold_ascii_upper_lower_impl(prefix: &[u8], upper: bool, out: &mut Vec<u8>) {
    use core::arch::wasm32::*;
    let len = prefix.len();
    let start = out.len();
    // Grow `out` to its final length up front (zero-filled) and then
    // overwrite the new tail in place. Avoids `push` per byte.
    out.resize(start + len, 0);
    let dst = &mut out[start..start + len];

    // upper(b) = b in [0x61..=0x7A] ? b ^ 0x20 : b
    // lower(b) = b in [0x41..=0x5A] ? b ^ 0x20 : b
    //
    // We compute `in_range = (b >= lo) & (b <= hi)` as a per-lane
    // mask, then `b ^ (in_range & 0x20)` flips bit 5 only inside the
    // target range. Identical instruction sequence for both modes;
    // only the constants differ. Per 16-byte chunk the loop body
    // corresponds to seven wasm SIMD operations:
    //   v128.load, u8x16.ge, u8x16.le, v128.and, v128.and, v128.xor,
    //   v128.store.
    let (lo, hi) = if upper {
        (u8x16_splat(b'a'), u8x16_splat(b'z'))
    } else {
        (u8x16_splat(b'A'), u8x16_splat(b'Z'))
    };
    let bit = u8x16_splat(0x20);

    let mut i = 0usize;
    while i + 16 <= len {
        // SAFETY: loop invariant `i + 16 <= len` ensures the 16-byte
        // load + 16-byte store stay within `prefix` / `dst`. `dst`
        // was just `resize`d to `start + len` so it is fully
        // initialised. The two raw pointers come from non-overlapping
        // slices (`prefix` is `&[u8]`, `dst` is a fresh `&mut [u8]`
        // re-borrowed from `out`); the wasm SIMD load/store
        // intrinsics perform unaligned IO so no alignment guarantee
        // is required.
        let chunk = unsafe { v128_load(prefix.as_ptr().add(i) as *const v128) };
        // Unsigned lane compares: each lane becomes all-ones when the
        // byte is inside `[lo, hi]`, all-zeros otherwise.
        let ge_lo = u8x16_ge(chunk, lo);
        let le_hi = u8x16_le(chunk, hi);
        let in_range = v128_and(ge_lo, le_hi);
        // Keep only bit 5 of the in-range lanes, then flip it.
        let xor_mask = v128_and(in_range, bit);
        let folded = v128_xor(chunk, xor_mask);
        // SAFETY: see comment immediately above; same bounds /
        // aliasing argument applies to the matching store.
        unsafe {
            v128_store(dst.as_mut_ptr().add(i) as *mut v128, folded);
        }
        i += 16;
    }
    // Scalar tail: the final `len % 16` bytes, same mapping as the
    // vector loop.
    while i < len {
        let b = prefix[i];
        dst[i] = if upper {
            if b.is_ascii_lowercase() {
                b ^ 0x20
            } else {
                b
            }
        } else if b.is_ascii_uppercase() {
            b ^ 0x20
        } else {
            b
        };
        i += 1;
    }
}

/// Portable variant of the Upper / Lower fold. Appends
/// `prefix.len()` bytes to `out`: with `upper == true` bytes in
/// `a..=z` become `A..=Z`, with `upper == false` bytes in `A..=Z`
/// become `a..=z`; every other byte is copied unchanged.
#[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
#[inline]
fn fold_ascii_upper_lower_impl(prefix: &[u8], upper: bool, out: &mut Vec<u8>) {
    // First letter of the 26-byte range whose case bit gets flipped.
    let range_base = if upper { b'a' } else { b'A' };

    // Grow `out` once to its final length, then overwrite the new
    // tail in place instead of pushing byte by byte.
    let tail_start = out.len();
    out.resize(tail_start + prefix.len(), 0);

    for (slot, &byte) in out[tail_start..].iter_mut().zip(prefix) {
        // Branch-free range test: bytes below `range_base` wrap around
        // to a large value, so one unsigned `<` picks exactly the 26
        // letters. The resulting 0/1 is shifted into bit 5 (0x20),
        // the ASCII case bit, and XOR-ed in.
        let case_bit = u8::from(byte.wrapping_sub(range_base) < 26) << 5;
        *slot = byte ^ case_bit;
    }
}

/// Title-mode ASCII fold: appends `prefix.len()` bytes to `out`,
/// uppercasing the first byte of every word and lowercasing the rest.
///
/// A word boundary is any of the six bytes space, `\t`, `\n`, U+000B
/// (`\v`), U+000C (`\f`) and `\r` — i.e. `u8::is_ascii_whitespace`
/// plus U+000B, which `char::is_whitespace` also accepts. Whitespace
/// bytes are copied through and arm the word-start flag; any other
/// byte (letter or not) clears it after being written.
///
/// `at_word_start_in` seeds the flag for the first byte. The final
/// flag is returned so the slow path can keep tracking word
/// boundaries once execution moves past the ASCII prefix.
#[inline]
fn fold_ascii_prefix_title(prefix: &[u8], at_word_start_in: bool, out: &mut Vec<u8>) -> bool {
    // Grow `out` once, then fill the new tail in place.
    let tail_start = out.len();
    out.resize(tail_start + prefix.len(), 0);

    let mut word_start = at_word_start_in;
    for (slot, &byte) in out[tail_start..].iter_mut().zip(prefix) {
        *slot = match byte {
            b' ' | b'\t' | b'\n' | 0x0B | 0x0C | b'\r' => {
                word_start = true;
                byte
            }
            _ => {
                // Only `a..=z` / `A..=Z` change; digits, punctuation
                // and everything else pass through but still end the
                // "word start" state.
                let cased = if word_start {
                    byte.to_ascii_uppercase()
                } else {
                    byte.to_ascii_lowercase()
                };
                word_start = false;
                cased
            }
        };
    }
    word_start
}

/// Fold the leading ASCII run of `bytes` and append the result to
/// `out`.
///
/// The run is located with [`scan_ascii_prefix`]; if it is empty,
/// `out` is left untouched. The returned [`AsciiFastResult`] reports
/// the run length as `consumed` together with the word-start flag:
/// updated by the Title fold, handed back unchanged (equal to
/// `at_word_start_in`) for Upper and Lower.
///
/// Callers must pass `!locale_turkish`. Turkish overrides
/// `I <-> ı` / `i <-> İ` would emit 2-byte UTF-8 for ASCII input,
/// which the fast-path's byte-in / byte-out shape cannot express.
#[inline]
pub fn fold_ascii_prefix(
    bytes: &[u8],
    mode: AsciiFoldMode,
    at_word_start_in: bool,
    out: &mut Vec<u8>,
) -> AsciiFastResult {
    let consumed = scan_ascii_prefix(bytes);
    let ascii_run = &bytes[..consumed];

    let at_word_start = if ascii_run.is_empty() {
        // Input starts with a non-ASCII byte (or is empty): nothing
        // for the fast path to do.
        at_word_start_in
    } else {
        match mode {
            AsciiFoldMode::Title => fold_ascii_prefix_title(ascii_run, at_word_start_in, out),
            AsciiFoldMode::Upper | AsciiFoldMode::Lower => {
                let to_upper = matches!(mode, AsciiFoldMode::Upper);
                fold_ascii_prefix_upper_lower(ascii_run, to_upper, out);
                at_word_start_in
            }
        }
    };

    AsciiFastResult {
        consumed,
        at_word_start,
    }
}

/// Tier 2c "pre-classified" fast path: fold `bytes` without first
/// scanning for a non-ASCII byte.
///
/// Intended for callers that already know the input is pure ASCII
/// (e.g. from an ASCII flag recorded when the string was built), so
/// the per-call [`scan_ascii_prefix`] pass that [`fold_ascii_prefix`]
/// performs can be skipped. The whole of `bytes` is folded according
/// to `mode` and appended to `out`; the result reports
/// `consumed == bytes.len()` and the word-start flag (updated in
/// Title mode, equal to `at_word_start_in` otherwise).
///
/// # Safety contract (informal)
///
/// Caller MUST have verified `bytes.iter().all(|b| *b < 0x80)` by
/// some upstream invariant. Breaking this is **not memory-unsafe** —
/// every operation stays inside `bytes` / `out` — but the result is
/// no longer the UAX #21 fold of the input: the helpers called here
/// only rewrite bytes in `a..=z` / `A..=Z`, so any byte `>= 0x80` is
/// copied through as-is rather than being case-mapped as part of its
/// codepoint. Debug builds assert the contract.
#[inline]
pub fn case_fold_ascii_fast(
    bytes: &[u8],
    mode: AsciiFoldMode,
    at_word_start_in: bool,
    out: &mut Vec<u8>,
) -> AsciiFastResult {
    debug_assert!(
        bytes.iter().all(|b| *b < 0x80),
        "case_fold_ascii_fast called with non-ASCII payload — caller must check flag bit first"
    );

    let mut at_word_start = at_word_start_in;
    if !bytes.is_empty() {
        match mode {
            AsciiFoldMode::Title => {
                at_word_start = fold_ascii_prefix_title(bytes, at_word_start_in, out);
            }
            AsciiFoldMode::Upper | AsciiFoldMode::Lower => {
                let to_upper = matches!(mode, AsciiFoldMode::Upper);
                fold_ascii_prefix_upper_lower(bytes, to_upper, out);
            }
        }
    }

    AsciiFastResult {
        consumed: bytes.len(),
        at_word_start,
    }
}

/// Convenience wrapper around [`case_fold_ascii_fast`] that appends
/// directly into a `String` (the common case for `fold_string`-style
/// callers that hold a mutable `String` accumulator). ASCII bytes
/// are valid 1-byte UTF-8 codeunits, so we promote the `Vec<u8>`
/// path through `String::as_mut_vec` without a separate UTF-8 check.
#[inline]
pub fn case_fold_ascii_fast_into_string(
    bytes: &[u8],
    mode: AsciiFoldMode,
    at_word_start_in: bool,
    out: &mut String,
) -> AsciiFastResult {
    let pre_len = out.len();
    let result = {
        // SAFETY: the bytes we append here are all `< 0x80` (the
        // caller proved this via the flag bit; the debug_assert
        // inside `case_fold_ascii_fast` re-asserts in tests). Each
        // such byte is a valid single-byte UTF-8 codepoint so the
        // `String` invariant remains satisfied.
        let buf = unsafe { out.as_mut_vec() };
        case_fold_ascii_fast(bytes, mode, at_word_start_in, buf)
    };
    debug_assert!(
        out.is_char_boundary(pre_len),
        "case_fold_ascii_fast corrupted utf-8 boundary at {pre_len}"
    );
    result
}

/// Convenience wrapper: append the ASCII fast-path output directly to
/// a `String`. ASCII bytes are valid 1-byte UTF-8 codeunits, so we
/// can safely promote the `Vec<u8>` write into the `String`'s inner
/// buffer without a separate UTF-8 validation pass.
///
/// Returns the [`AsciiFastResult`] produced by [`fold_ascii_prefix`]
/// unchanged.
///
/// Callers must still pass `!locale_turkish`; see [`fold_ascii_prefix`]
/// for the rationale.
///
/// # Panics
///
/// Panics (in release builds too) if the fast path appended any byte
/// `>= 0x80`. When this check fails, `out` is truncated back to its
/// original length before the panic is raised, so the caller's
/// `String` is not left holding the offending bytes.
#[inline]
pub fn fold_ascii_prefix_into_string(
    bytes: &[u8],
    mode: AsciiFoldMode,
    at_word_start_in: bool,
    out: &mut String,
) -> AsciiFastResult {
    let pre_len = out.len();
    // Why `as_mut_vec`: there is no safe API to bulk-push ASCII bytes
    // to a `String` (`push_str` requires `&str`, which implies UTF-8
    // validation of the source), so we peel off the underlying
    // `Vec<u8>` for the fast-path write and re-establish the invariant
    // ourselves before handing the `String` back.
    let (result, appended_is_ascii) = {
        // SAFETY: while the raw `Vec<u8>` borrow is live, the buffer
        // holds the original valid UTF-8 (length `pre_len`) followed
        // by whatever `fold_ascii_prefix` appended. Before the borrow
        // ends we either confirm that every appended byte is `< 0x80`
        // (hence a complete single-byte UTF-8 codepoint) or truncate
        // back to `pre_len`. In both cases the buffer is valid UTF-8
        // when control returns to safe code, which is the `String`
        // invariant. This assumes the callee only ever appends to the
        // buffer (it never rewrites the first `pre_len` bytes).
        let buf = unsafe { out.as_mut_vec() };
        let result = fold_ascii_prefix(bytes, mode, at_word_start_in, buf);
        // Release-mode invariant check over *everything* appended, not
        // just the first `consumed` bytes: the fast path is expected
        // never to emit `>= 0x80`, but a future refactor that breaks
        // this would otherwise silently corrupt the `String`.
        let appended_is_ascii = buf[pre_len..].is_ascii();
        if !appended_is_ascii {
            // Restore the original contents *before* panicking so an
            // unwinding panic cannot expose invalid UTF-8 to the
            // caller.
            buf.truncate(pre_len);
        }
        (result, appended_is_ascii)
    };
    assert!(
        appended_is_ascii,
        "ascii fold emitted non-ASCII byte; UTF-8 invariant would be broken"
    );
    // The fast path is documented to write exactly `consumed` bytes;
    // keep that contract checked in debug builds.
    debug_assert_eq!(
        out.len() - pre_len,
        result.consumed,
        "ascii fold wrote a different number of bytes than it consumed"
    );
    debug_assert!(
        out.is_char_boundary(pre_len),
        "ascii fold corrupted utf-8 boundary at {pre_len}"
    );
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Every fold mode, for tests that sweep all of them.
    const ALL_MODES: [AsciiFoldMode; 3] = [
        AsciiFoldMode::Upper,
        AsciiFoldMode::Lower,
        AsciiFoldMode::Title,
    ];

    /// Scalar reference implementation for byte-identical assertions.
    ///
    /// Mirrors what the slow tree-walk path does on the ASCII subset:
    /// `Upper` / `Lower` route every byte through the case mask, while
    /// `Title` upper-cases the first byte of each word and lower-cases
    /// the rest. A word starts at the beginning of the input and after
    /// any ASCII whitespace byte (space, `\t`, `\n`, VT, FF, `\r`).
    fn reference_fold(s: &[u8], mode: AsciiFoldMode) -> Vec<u8> {
        let mut out = Vec::with_capacity(s.len());
        let mut at_word_start = true;
        for &b in s {
            match mode {
                AsciiFoldMode::Upper => {
                    out.push(if b.is_ascii_lowercase() { b & 0xDF } else { b });
                }
                AsciiFoldMode::Lower => {
                    out.push(if b.is_ascii_uppercase() { b | 0x20 } else { b });
                }
                AsciiFoldMode::Title => {
                    let is_ws = matches!(b, b' ' | b'\t' | b'\n' | 0x0B | 0x0C | b'\r');
                    if is_ws {
                        // Whitespace is copied verbatim and re-arms the
                        // word-start flag.
                        out.push(b);
                        at_word_start = true;
                    } else {
                        if at_word_start {
                            out.push(if b.is_ascii_lowercase() { b & 0xDF } else { b });
                        } else {
                            out.push(if b.is_ascii_uppercase() { b | 0x20 } else { b });
                        }
                        at_word_start = false;
                    }
                }
            }
        }
        out
    }

    // ---- scan_ascii_prefix ------------------------------------------

    #[test]
    fn scan_prefix_empty() {
        assert_eq!(scan_ascii_prefix(b""), 0);
    }

    #[test]
    fn scan_prefix_all_ascii() {
        let s = b"Hello, world!";
        assert_eq!(scan_ascii_prefix(s), s.len());
    }

    #[test]
    fn scan_prefix_immediate_non_ascii() {
        let s = "\u{00DF}foo".as_bytes();
        assert_eq!(scan_ascii_prefix(s), 0);
    }

    #[test]
    fn scan_prefix_mixed() {
        let s = "abc\u{00DF}def".as_bytes();
        assert_eq!(scan_ascii_prefix(s), 3);
    }

    #[test]
    fn scan_prefix_15_then_non_ascii() {
        // 15 ASCII bytes (< one v128 chunk), then non-ASCII. Exercises
        // the scalar tail.
        let s = "abcdefghijklmno\u{00DF}".as_bytes();
        assert_eq!(scan_ascii_prefix(s), 15);
    }

    #[test]
    fn scan_prefix_exactly_16() {
        // Boundary case: one full v128 chunk worth of ASCII.
        let s = b"abcdefghijklmnop";
        assert_eq!(scan_ascii_prefix(s), 16);
    }

    #[test]
    fn scan_prefix_17() {
        // One v128 chunk + 1 extra. Exercises chunked + tail path.
        let s = b"abcdefghijklmnopq";
        assert_eq!(scan_ascii_prefix(s), 17);
    }

    #[test]
    fn scan_prefix_32_then_non_ascii() {
        // Two full 16-byte chunks of ASCII followed by a non-ASCII
        // codepoint.
        let s = "abcdefghijklmnopqrstuvwxyz123456\u{00DF}".as_bytes();
        assert_eq!(scan_ascii_prefix(s), 32);
    }

    #[test]
    fn scan_prefix_non_ascii_in_second_chunk() {
        // 20 ASCII bytes then non-ASCII; the first v128 chunk is all
        // ASCII so we must continue into the second chunk / tail.
        let s = "abcdefghijklmnopqrst\u{00DF}".as_bytes();
        assert_eq!(scan_ascii_prefix(s), 20);
    }

    // ---- fold_ascii_prefix vs. scalar reference ---------------------

    #[test]
    fn upper_lower_byte_identical_small() {
        for src in [
            b"".as_ref(),
            b"a",
            b"A",
            b"Hello, World!",
            b"0123456789",
            b"AAAaaa BBBbbb CCCccc",
            b"\x00\x01\x7F",
        ] {
            for mode in [AsciiFoldMode::Upper, AsciiFoldMode::Lower] {
                let mut out = Vec::new();
                let r = fold_ascii_prefix(src, mode, true, &mut out);
                assert_eq!(r.consumed, src.len(), "consumed mismatch for {src:?}");
                assert_eq!(out, reference_fold(src, mode), "{src:?} mode {mode:?}");
            }
        }
    }

    #[test]
    fn title_byte_identical_small() {
        for src in [
            b"hello world".as_ref(),
            b"HELLO WORLD",
            b"the quick brown fox jumps over the lazy dog",
            b"  leading spaces",
            b"\ttab\nnewline\rcr",
            b"a",
            b"",
        ] {
            let mut out = Vec::new();
            let r = fold_ascii_prefix(src, AsciiFoldMode::Title, true, &mut out);
            assert_eq!(r.consumed, src.len());
            assert_eq!(out, reference_fold(src, AsciiFoldMode::Title), "{src:?}");
        }
    }

    #[test]
    fn boundary_lengths_16_17_32_33() {
        // The 16 / 17 / 32 / 33 byte boundaries are the v128 chunk
        // edges. Each must produce byte-identical output for all 3
        // modes against a freshly-built scalar reference.
        for &n in &[1usize, 15, 16, 17, 31, 32, 33, 47, 48, 64] {
            let src: Vec<u8> = (0..n)
                .map(|i| {
                    // Stir uppercase / lowercase / punctuation / digits
                    // so the byte stream exercises both the in-range
                    // and out-of-range halves of the mask.
                    b"AaBbCc0 .Z!9zXyW"[i % 16]
                })
                .collect();
            for mode in ALL_MODES {
                let mut out = Vec::new();
                let r = fold_ascii_prefix(&src, mode, true, &mut out);
                assert_eq!(r.consumed, src.len());
                assert_eq!(
                    out,
                    reference_fold(&src, mode),
                    "n={n} mode={mode:?} src={src:?}"
                );
            }
        }
    }

    #[test]
    fn pseudo_random_ascii_corpora() {
        // 16 deterministic ASCII corpora driven by a tiny xorshift; we
        // want byte-identical output across all 3 modes against the
        // scalar reference. Driven by a fixed seed for reproducibility.
        let mut state: u32 = 0x6d61_6e69;
        let mut next = || {
            state ^= state << 13;
            state ^= state >> 17;
            state ^= state << 5;
            state
        };
        for trial in 0..16 {
            let len = 50 + (next() as usize % 1000);
            let src: Vec<u8> = (0..len)
                .map(|_| {
                    // Printable ASCII only: `r` is in 0..96, so `b` is
                    // in 0x20..=0x7F, and the lone DEL (0x7F) is
                    // remapped to a space. Space is therefore the only
                    // whitespace byte this corpus contains.
                    let r = next() % 96;
                    let b = 0x20 + r as u8;
                    if b == 0x7F {
                        b' '
                    } else {
                        b
                    }
                })
                .collect();
            for mode in ALL_MODES {
                let mut out = Vec::new();
                let r = fold_ascii_prefix(&src, mode, true, &mut out);
                assert_eq!(r.consumed, src.len(), "trial={trial} mode={mode:?}");
                assert_eq!(
                    out,
                    reference_fold(&src, mode),
                    "trial={trial} mode={mode:?} len={len}"
                );
            }
        }
    }

    #[test]
    fn title_at_word_start_carry() {
        // Caller starts mid-word: the first ASCII byte must be
        // lowercased even if cased.
        let mut out = Vec::new();
        let r = fold_ascii_prefix(b"WORLD", AsciiFoldMode::Title, false, &mut out);
        assert_eq!(out, b"world");
        assert!(!r.at_word_start);
    }

    #[test]
    fn title_word_start_ends_on_whitespace() {
        // If the ASCII prefix ends with whitespace, the slow path
        // should continue with at_word_start = true.
        let mut out = Vec::new();
        let r = fold_ascii_prefix(b"hello ", AsciiFoldMode::Title, true, &mut out);
        assert_eq!(out, b"Hello ");
        assert!(r.at_word_start);
    }

    #[test]
    fn ascii_prefix_with_nonascii_tail_consumes_only_prefix() {
        // The fast-path stops at the first byte >= 0x80; the caller
        // is responsible for resuming the slow loop after that.
        let s = "Hello, \u{00DF}world".as_bytes(); // "Hello, " is 7 bytes
        let mut out = Vec::new();
        let r = fold_ascii_prefix(s, AsciiFoldMode::Upper, true, &mut out);
        assert_eq!(r.consumed, 7);
        assert_eq!(out, b"HELLO, ");
    }

    #[test]
    fn ascii_prefix_consumed_zero_when_first_byte_nonascii() {
        let s = "\u{00DF}foo".as_bytes();
        let mut out = Vec::new();
        let r = fold_ascii_prefix(s, AsciiFoldMode::Upper, true, &mut out);
        assert_eq!(r.consumed, 0);
        assert!(out.is_empty());
    }

    // ---- Tier 2c: pre-classified ASCII fast path -------------------

    #[test]
    fn case_fold_ascii_fast_matches_scan_path_byte_identical() {
        // For any pure-ASCII input, the new pre-classified fast path
        // must produce output byte-identical with the scan-based
        // `fold_ascii_prefix`. The contract is that the flag-bit-aware
        // dispatch can substitute either implementation without
        // observable differences.
        for src in [
            b"".as_ref(),
            b"a",
            b"A",
            b"Hello, World!",
            b"0123456789",
            b"AAAaaa BBBbbb CCCccc",
            b"\x00\x01\x7F",
            b"the quick brown fox",
        ] {
            for mode in ALL_MODES {
                let mut via_scan = Vec::new();
                let r_scan = fold_ascii_prefix(src, mode, true, &mut via_scan);
                let mut via_fast = Vec::new();
                let r_fast = case_fold_ascii_fast(src, mode, true, &mut via_fast);
                assert_eq!(
                    via_scan, via_fast,
                    "src={src:?} mode={mode:?}: scan vs fast path divergence"
                );
                assert_eq!(r_scan.consumed, r_fast.consumed);
                assert_eq!(r_scan.at_word_start, r_fast.at_word_start);
            }
        }
    }

    #[test]
    fn case_fold_ascii_fast_into_string_appends_correctly() {
        let mut acc = String::from("prefix:");
        let r = case_fold_ascii_fast_into_string(b"Hello", AsciiFoldMode::Lower, true, &mut acc);
        assert_eq!(acc, "prefix:hello");
        assert_eq!(r.consumed, 5);
    }

    #[test]
    fn case_fold_ascii_fast_empty_input_is_noop() {
        let mut out = Vec::new();
        let r = case_fold_ascii_fast(b"", AsciiFoldMode::Upper, true, &mut out);
        assert_eq!(r.consumed, 0);
        assert!(r.at_word_start);
        assert!(out.is_empty());
    }

    #[test]
    fn case_fold_ascii_fast_title_carries_word_state() {
        let mut out = Vec::new();
        let r = case_fold_ascii_fast(b"hello world", AsciiFoldMode::Title, true, &mut out);
        assert_eq!(out, b"Hello World");
        assert_eq!(r.consumed, 11);
        // Ends mid-word (the 'd') so the boundary flag must be false.
        assert!(!r.at_word_start);
    }

    // ---- fold_ascii_prefix_into_string ------------------------------

    #[test]
    fn fold_ascii_prefix_into_string_appends_correctly() {
        // The wrapper must append after the existing contents rather
        // than overwrite them.
        let mut acc = String::from("prefix:");
        let r = fold_ascii_prefix_into_string(b"Hello", AsciiFoldMode::Upper, true, &mut acc);
        assert_eq!(acc, "prefix:HELLO");
        assert_eq!(r.consumed, 5);
    }

    #[test]
    fn fold_ascii_prefix_into_string_stops_at_non_ascii() {
        // The accumulator already ends in a multi-byte codepoint and
        // the input has a non-ASCII tail: only the 7-byte ASCII prefix
        // may be appended, and the result must still be valid UTF-8
        // (the `String` comparison below would expose a broken one).
        let mut acc = String::from("\u{00DF}:");
        let s = "Hello, \u{00DF}world".as_bytes();
        let r = fold_ascii_prefix_into_string(s, AsciiFoldMode::Upper, true, &mut acc);
        assert_eq!(r.consumed, 7);
        assert_eq!(acc, "\u{00DF}:HELLO, ");
    }
}