
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Stream buffer: 32MB — tr streaming operations (translate, delete, squeeze)
10/// are compute-light (single table lookup or bitset check per byte), so the
11/// bottleneck is I/O syscalls, not cache pressure. 32MB buffer ensures even
12/// large piped inputs require minimal syscall pairs, and the kernel can
13/// transfer the maximum pipe buffer size per read/write call.
14/// For piped input, read_once processes data immediately for pipelining.
15/// This applies to ALL streaming modes (delete, squeeze, translate).
16const STREAM_BUF: usize = 32 * 1024 * 1024;
17
18/// Minimum data size to engage rayon parallel processing for mmap paths.
19/// AVX2 translation runs at ~10 GB/s per core. For 10MB benchmarks,
20/// rayon overhead (~100-200us for spawn+join) dominates the ~1ms
21/// single-core translate time. Only use parallel for genuinely large files
22/// where the parallel speedup outweighs rayon overhead.
23const PARALLEL_THRESHOLD: usize = 32 * 1024 * 1024;
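
// Sketch of how PARALLEL_THRESHOLD is meant to gate rayon (the real mmap path
// lives elsewhere in this file; `translate_parallel_sketch` and the 8MB chunk
// size are illustrative assumptions, not the shipped dispatch):
#[allow(dead_code)]
fn translate_parallel_sketch(data: &mut [u8], table: &[u8; 256]) {
    if data.len() >= PARALLEL_THRESHOLD {
        // Large chunks so each rayon task amortizes the spawn/join overhead.
        data.par_chunks_mut(8 * 1024 * 1024)
            .for_each(|chunk| translate_inplace(chunk, table));
    } else {
        translate_inplace(data, table);
    }
}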
24
25/// 256-entry lookup table for byte compaction: for each 8-bit keep mask,
26/// stores the bit positions of set bits (indices of bytes to keep).
27/// Used by compact_8bytes to replace the serial trailing_zeros loop with
28/// unconditional indexed stores, eliminating the tzcnt→blsr dependency chain.
29/// Total size: 256 * 8 = 2KB — fits entirely in L1 cache.
30#[cfg(target_arch = "x86_64")]
31static COMPACT_LUT: [[u8; 8]; 256] = {
32    let mut lut = [[0u8; 8]; 256];
33    let mut mask: u16 = 0;
34    while mask < 256 {
35        let mut idx: usize = 0;
36        let mut bit: u8 = 0;
37        while bit < 8 {
38            if (mask >> bit) & 1 != 0 {
39                lut[mask as usize][idx] = bit;
40                idx += 1;
41            }
42            bit += 1;
43        }
44        mask += 1;
45    }
46    lut
47};
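
// How COMPACT_LUT is consumed (scalar model): for a keep `mask`, entry j holds
// the position of the j-th set bit, so kept bytes can be gathered with
// unconditional indexed loads. The real compact_8bytes lives elsewhere in this
// file and is SIMD-based; `compact_8bytes_scalar_model` below is only an
// illustrative sketch of the table's contract.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
fn compact_8bytes_scalar_model(src: &[u8; 8], keep_mask: u8, dst: &mut [u8; 8]) -> usize {
    let positions = &COMPACT_LUT[keep_mask as usize];
    let kept = keep_mask.count_ones() as usize;
    for j in 0..kept {
        // positions[j] is the index of the j-th byte to keep.
        dst[j] = src[positions[j] as usize];
    }
    kept
}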
48
49/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
50/// Falls back to write_all per slice for partial writes.
51#[inline]
52fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
53    if slices.is_empty() {
54        return Ok(());
55    }
56    for batch in slices.chunks(MAX_IOV) {
57        let total: usize = batch.iter().map(|s| s.len()).sum();
58        match writer.write_vectored(batch) {
59            Ok(n) if n >= total => continue,
60            Ok(mut written) => {
61                // Partial write: fall back to write_all per remaining slice
62                for slice in batch {
63                    let slen = slice.len();
64                    if written >= slen {
65                        written -= slen;
66                        continue;
67                    }
68                    if written > 0 {
69                        writer.write_all(&slice[written..])?;
70                        written = 0;
71                    } else {
72                        writer.write_all(slice)?;
73                    }
74                }
75            }
76            Err(e) => return Err(e),
77        }
78    }
79    Ok(())
80}
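
// Usage sketch (hypothetical caller, not part of the streaming paths below):
// gather several output fragments into IoSlice entries and flush them with a
// bounded number of writev() calls via write_ioslices.
#[allow(dead_code)]
fn write_fragments_example(out: &mut impl Write, fragments: &[Vec<u8>]) -> io::Result<()> {
    let slices: Vec<std::io::IoSlice> = fragments
        .iter()
        .map(|f| std::io::IoSlice::new(f))
        .collect();
    write_ioslices(out, &slices)
}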
81
82/// Allocate a Vec<u8> of given length without zero-initialization.
83/// Uses MADV_HUGEPAGE on Linux for buffers >= 2MB to reduce TLB misses.
84/// SAFETY: Caller must write all bytes before reading them.
85#[inline]
86#[allow(clippy::uninit_vec)]
87fn alloc_uninit_vec(len: usize) -> Vec<u8> {
88    let mut v = Vec::with_capacity(len);
89    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
90    unsafe {
91        v.set_len(len);
92    }
93    #[cfg(target_os = "linux")]
94    if len >= 2 * 1024 * 1024 {
95        unsafe {
96            libc::madvise(
97                v.as_mut_ptr() as *mut libc::c_void,
98                len,
99                libc::MADV_HUGEPAGE,
100            );
101        }
102    }
103    v
104}
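
// Sanity-check sketch of the contract above: the buffer is only read after
// every byte has been overwritten.
#[cfg(test)]
mod alloc_uninit_vec_tests {
    use super::*;

    #[test]
    fn fully_overwritten_before_read() {
        let src = vec![0xABu8; 4096];
        let mut buf = alloc_uninit_vec(src.len());
        buf.copy_from_slice(&src); // write everything before reading
        assert_eq!(buf, src);
    }
}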
105
106/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
107#[inline]
108fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
109    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
110    let last = set2.last().copied();
111    for (i, &from) in set1.iter().enumerate() {
112        table[from as usize] = if i < set2.len() {
113            set2[i]
114        } else {
115            last.unwrap_or(from)
116        };
117    }
118    table
119}
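
// Sketch of the padding rule as implemented above: when set2 is shorter than
// set1, the last byte of set2 is reused for the remaining set1 bytes
// (GNU-tr-style extension); bytes outside set1 stay identity.
#[cfg(test)]
mod translate_table_tests {
    use super::*;

    #[test]
    fn shorter_set2_reuses_last_byte() {
        let table = build_translate_table(b"abc", b"xy");
        assert_eq!(table[b'a' as usize], b'x');
        assert_eq!(table[b'b' as usize], b'y');
        assert_eq!(table[b'c' as usize], b'y'); // padded with last byte of set2
        assert_eq!(table[b'd' as usize], b'd'); // identity for unmapped bytes
    }
}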
120
121/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
122#[inline]
123fn build_member_set(chars: &[u8]) -> [u8; 32] {
124    let mut set = [0u8; 32];
125    for &ch in chars {
126        set[ch as usize >> 3] |= 1 << (ch & 7);
127    }
128    set
129}
130
131#[inline(always)]
132fn is_member(set: &[u8; 32], ch: u8) -> bool {
133    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
134}
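
// The 32-byte set is a 256-bit bitmap: byte index ch >> 3, bit index ch & 7.
// Small illustrative check of build_member_set / is_member together.
#[cfg(test)]
mod member_set_tests {
    use super::*;

    #[test]
    fn membership_matches_input_set() {
        let set = build_member_set(b"aeiou");
        assert!(is_member(&set, b'a'));
        assert!(is_member(&set, b'u'));
        assert!(!is_member(&set, b'b'));
        assert!(!is_member(&set, 0));
    }
}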
135
136/// Cached SIMD capability level for x86_64.
137/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
138#[cfg(target_arch = "x86_64")]
139static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
140
141#[cfg(target_arch = "x86_64")]
142#[inline(always)]
143fn get_simd_level() -> u8 {
144    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
145    if level != 0 {
146        return level;
147    }
148    let detected = if is_x86_feature_detected!("avx2") {
149        3
150    } else if is_x86_feature_detected!("ssse3") {
151        2
152    } else {
153        1
154    };
155    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
156    detected
157}
158
159/// Count how many entries in the translate table are non-identity.
160#[cfg(target_arch = "x86_64")]
161#[inline]
162fn count_non_identity(table: &[u8; 256]) -> usize {
163    table
164        .iter()
165        .enumerate()
166        .filter(|&(i, &v)| v != i as u8)
167        .count()
168}
169
170/// Translate bytes in-place using a 256-byte lookup table.
171/// For sparse translations (few bytes change), uses SIMD skip-ahead:
172/// compare 32 bytes at a time against identity, skip unchanged chunks.
173/// For dense translations, uses full SIMD nibble decomposition.
174/// Falls back to 8x-unrolled scalar on non-x86_64 platforms.
175#[inline(always)]
176fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
177    #[cfg(target_arch = "x86_64")]
178    {
179        let level = get_simd_level();
180        if level >= 3 {
181            // For sparse translations (<=16 non-identity entries), the skip-ahead
182            // approach is faster: load 32 bytes, do a full nibble lookup, compare
183            // against input, skip store if identical. This avoids writing to pages
184            // that don't change (important for MAP_PRIVATE COW mmap).
185            let non_id = count_non_identity(table);
186            if non_id > 0 && non_id <= 16 {
187                unsafe { translate_inplace_avx2_sparse(data, table) };
188                return;
189            }
190            unsafe { translate_inplace_avx2_table(data, table) };
191            return;
192        }
193        if level >= 2 {
194            unsafe { translate_inplace_ssse3_table(data, table) };
195            return;
196        }
197    }
198    translate_inplace_scalar(data, table);
199}
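
// Cross-check sketch: whichever path translate_inplace dispatches to (sparse
// AVX2, dense AVX2/SSSE3, NEON, or scalar on this host) must agree with a
// plain per-byte table lookup.
#[cfg(test)]
mod translate_inplace_tests {
    use super::*;

    #[test]
    fn matches_plain_table_lookup() {
        let table = build_translate_table(b"abc", b"XYZ");
        let mut data: Vec<u8> = (0u16..=255).map(|b| b as u8).cycle().take(1000).collect();
        let expected: Vec<u8> = data.iter().map(|&b| table[b as usize]).collect();
        translate_inplace(&mut data, &table);
        assert_eq!(data, expected);
    }
}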
200
201/// Sparse AVX2 translate: skip unchanged 32-byte chunks.
202/// For each chunk: perform full nibble lookup, compare result vs input.
203/// If identical (no bytes changed), skip the store entirely.
204/// This reduces memory bandwidth and avoids COW page faults for
205/// MAP_PRIVATE mmaps when most bytes are unchanged.
206#[cfg(target_arch = "x86_64")]
207#[target_feature(enable = "avx2")]
208unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
209    use std::arch::x86_64::*;
210
211    unsafe {
212        let len = data.len();
213        let ptr = data.as_mut_ptr();
214
215        // Pre-build 16 lookup vectors (same as full nibble decomposition)
216        let mut lut = [_mm256_setzero_si256(); 16];
217        for h in 0u8..16 {
218            let base = (h as usize) * 16;
219            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
220            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
221            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
222        }
223
224        let lo_mask = _mm256_set1_epi8(0x0F);
225
226        let mut i = 0;
227        while i + 32 <= len {
228            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
229            let lo_nibble = _mm256_and_si256(input, lo_mask);
230            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
231
232            let mut result = _mm256_setzero_si256();
233            macro_rules! do_nibble {
234                ($h:expr) => {
235                    let h_val = _mm256_set1_epi8($h as i8);
236                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
237                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
238                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
239                };
240            }
241            do_nibble!(0);
242            do_nibble!(1);
243            do_nibble!(2);
244            do_nibble!(3);
245            do_nibble!(4);
246            do_nibble!(5);
247            do_nibble!(6);
248            do_nibble!(7);
249            do_nibble!(8);
250            do_nibble!(9);
251            do_nibble!(10);
252            do_nibble!(11);
253            do_nibble!(12);
254            do_nibble!(13);
255            do_nibble!(14);
256            do_nibble!(15);
257
258            // Only store if result differs from input (skip unchanged chunks)
259            let diff = _mm256_xor_si256(input, result);
260            if _mm256_testz_si256(diff, diff) == 0 {
261                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
262            }
263            i += 32;
264        }
265
266        // Scalar tail
267        while i < len {
268            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
269            i += 1;
270        }
271    }
272}
273
274/// Scalar fallback: 8x-unrolled table lookup.
275#[cfg(not(target_arch = "aarch64"))]
276#[inline(always)]
277fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
278    let len = data.len();
279    let ptr = data.as_mut_ptr();
280    let mut i = 0;
281    unsafe {
282        while i + 8 <= len {
283            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
284            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
285            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
286            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
287            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
288            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
289            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
290            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
291            i += 8;
292        }
293        while i < len {
294            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
295            i += 1;
296        }
297    }
298}
299
300/// ARM64 NEON table lookup using nibble decomposition (same algorithm as x86 pshufb).
301/// Uses vqtbl1q_u8 for 16-byte table lookups, processes 16 bytes per iteration.
302#[cfg(target_arch = "aarch64")]
303#[inline(always)]
304fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
305    unsafe { translate_inplace_neon_table(data, table) };
306}
307
308#[cfg(target_arch = "aarch64")]
309#[target_feature(enable = "neon")]
310unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
311    use std::arch::aarch64::*;
312
313    unsafe {
314        let len = data.len();
315        let ptr = data.as_mut_ptr();
316
317        // Pre-build 16 NEON lookup vectors (one per high nibble)
318        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
319        for h in 0u8..16 {
320            let base = (h as usize) * 16;
321            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
322        }
323
324        let lo_mask = vdupq_n_u8(0x0F);
325        let mut i = 0;
326
327        while i + 16 <= len {
328            let input = vld1q_u8(ptr.add(i));
329            let lo_nibble = vandq_u8(input, lo_mask);
330            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
331
332            let mut result = vdupq_n_u8(0);
333            macro_rules! do_nibble {
334                ($h:expr) => {
335                    let h_val = vdupq_n_u8($h);
336                    let mask = vceqq_u8(hi_nibble, h_val);
337                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
338                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
339                };
340            }
341            do_nibble!(0);
342            do_nibble!(1);
343            do_nibble!(2);
344            do_nibble!(3);
345            do_nibble!(4);
346            do_nibble!(5);
347            do_nibble!(6);
348            do_nibble!(7);
349            do_nibble!(8);
350            do_nibble!(9);
351            do_nibble!(10);
352            do_nibble!(11);
353            do_nibble!(12);
354            do_nibble!(13);
355            do_nibble!(14);
356            do_nibble!(15);
357
358            vst1q_u8(ptr.add(i), result);
359            i += 16;
360        }
361
362        // Scalar tail
363        while i < len {
364            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
365            i += 1;
366        }
367    }
368}
369
370// ============================================================================
371// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
372// ============================================================================
373//
374// For an arbitrary 256-byte lookup table, we decompose each byte into
375// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
376// SIMD vectors, one for each high nibble value h (0..15), containing
377// the 16 table entries table[h*16+0..h*16+15]. Then for each input
378// vector we:
379//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
380//   2. Extract high nibble (shift right 4) -> used to select which table
381//   3. For each of the 16 high nibble values, create a mask where
382//      the high nibble equals that value, pshufb the corresponding
383//      table, and accumulate results
384//
385// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
386// With instruction-level parallelism, this achieves much higher throughput
387// than scalar table lookups which have serial data dependencies.
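
// Scalar model of the decomposition above for a single byte (illustration only;
// the SIMD routines below do this 16/32 bytes at a time):
//   table[b] == row_for_high_nibble(b >> 4)[b & 0x0F]
#[allow(dead_code)]
fn nibble_lookup_scalar_model(table: &[u8; 256], b: u8) -> u8 {
    let hi = (b >> 4) as usize; // selects which 16-entry row
    let lo = (b & 0x0F) as usize; // index within the row (the pshufb index)
    table[hi * 16 + lo]
}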
388
389#[cfg(target_arch = "x86_64")]
390#[target_feature(enable = "avx2")]
391unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
392    use std::arch::x86_64::*;
393
394    unsafe {
395        let len = data.len();
396        let ptr = data.as_mut_ptr();
397
398        // Pre-build 16 lookup vectors, one per high nibble value.
399        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
400        // 16 table entries for pshufb indexing by low nibble.
401        let mut lut = [_mm256_setzero_si256(); 16];
402        for h in 0u8..16 {
403            let base = (h as usize) * 16;
404            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
405            // Broadcast the 128-bit row to both lanes of the 256-bit vector
406            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
407            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
408        }
409
410        let lo_mask = _mm256_set1_epi8(0x0F);
411
412        let mut i = 0;
413
414        // 2x unrolled: process 64 bytes (2x32) per iteration for better ILP.
415        // The CPU can overlap load/compute of the second vector while the first
416        // is in the nibble decomposition pipeline.
417        while i + 64 <= len {
418            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
419            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
420
421            let lo0 = _mm256_and_si256(input0, lo_mask);
422            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
423            let lo1 = _mm256_and_si256(input1, lo_mask);
424            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
425
426            let mut r0 = _mm256_setzero_si256();
427            let mut r1 = _mm256_setzero_si256();
428
429            macro_rules! do_nibble2 {
430                ($h:expr) => {
431                    let h_val = _mm256_set1_epi8($h as i8);
432                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
433                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
434                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
435                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
436                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
437                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
438                };
439            }
440            do_nibble2!(0);
441            do_nibble2!(1);
442            do_nibble2!(2);
443            do_nibble2!(3);
444            do_nibble2!(4);
445            do_nibble2!(5);
446            do_nibble2!(6);
447            do_nibble2!(7);
448            do_nibble2!(8);
449            do_nibble2!(9);
450            do_nibble2!(10);
451            do_nibble2!(11);
452            do_nibble2!(12);
453            do_nibble2!(13);
454            do_nibble2!(14);
455            do_nibble2!(15);
456
457            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
458            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
459            i += 64;
460        }
461
462        // Remaining 32-byte chunk
463        if i + 32 <= len {
464            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
465            let lo_nibble = _mm256_and_si256(input, lo_mask);
466            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
467
468            let mut result = _mm256_setzero_si256();
469
470            macro_rules! do_nibble {
471                ($h:expr) => {
472                    let h_val = _mm256_set1_epi8($h as i8);
473                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
474                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
475                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
476                };
477            }
478            do_nibble!(0);
479            do_nibble!(1);
480            do_nibble!(2);
481            do_nibble!(3);
482            do_nibble!(4);
483            do_nibble!(5);
484            do_nibble!(6);
485            do_nibble!(7);
486            do_nibble!(8);
487            do_nibble!(9);
488            do_nibble!(10);
489            do_nibble!(11);
490            do_nibble!(12);
491            do_nibble!(13);
492            do_nibble!(14);
493            do_nibble!(15);
494
495            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
496            i += 32;
497        }
498
499        // SSE/SSSE3 tail for remaining 16-byte chunk
500        if i + 16 <= len {
501            let lo_mask128 = _mm_set1_epi8(0x0F);
502
503            let mut lut128 = [_mm_setzero_si128(); 16];
504            for h in 0u8..16 {
505                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
506            }
507
508            let input = _mm_loadu_si128(ptr.add(i) as *const _);
509            let lo_nib = _mm_and_si128(input, lo_mask128);
510            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
511
512            let mut res = _mm_setzero_si128();
513            macro_rules! do_nibble128 {
514                ($h:expr) => {
515                    let h_val = _mm_set1_epi8($h as i8);
516                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
517                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
518                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
519                };
520            }
521            do_nibble128!(0);
522            do_nibble128!(1);
523            do_nibble128!(2);
524            do_nibble128!(3);
525            do_nibble128!(4);
526            do_nibble128!(5);
527            do_nibble128!(6);
528            do_nibble128!(7);
529            do_nibble128!(8);
530            do_nibble128!(9);
531            do_nibble128!(10);
532            do_nibble128!(11);
533            do_nibble128!(12);
534            do_nibble128!(13);
535            do_nibble128!(14);
536            do_nibble128!(15);
537
538            _mm_storeu_si128(ptr.add(i) as *mut _, res);
539            i += 16;
540        }
541
542        // Scalar tail
543        while i < len {
544            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
545            i += 1;
546        }
547    }
548}
549
550#[cfg(target_arch = "x86_64")]
551#[target_feature(enable = "ssse3")]
552unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
553    use std::arch::x86_64::*;
554
555    unsafe {
556        let len = data.len();
557        let ptr = data.as_mut_ptr();
558
559        // Pre-build 16 lookup vectors for pshufb
560        let mut lut = [_mm_setzero_si128(); 16];
561        for h in 0u8..16 {
562            let base = (h as usize) * 16;
563            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
564            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
565        }
566
567        let lo_mask = _mm_set1_epi8(0x0F);
568
569        let mut i = 0;
570        while i + 16 <= len {
571            let input = _mm_loadu_si128(ptr.add(i) as *const _);
572            let lo_nibble = _mm_and_si128(input, lo_mask);
573            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
574
575            let mut result = _mm_setzero_si128();
576
577            macro_rules! do_nibble {
578                ($h:expr) => {
579                    let h_val = _mm_set1_epi8($h as i8);
580                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
581                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
582                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
583                };
584            }
585            do_nibble!(0);
586            do_nibble!(1);
587            do_nibble!(2);
588            do_nibble!(3);
589            do_nibble!(4);
590            do_nibble!(5);
591            do_nibble!(6);
592            do_nibble!(7);
593            do_nibble!(8);
594            do_nibble!(9);
595            do_nibble!(10);
596            do_nibble!(11);
597            do_nibble!(12);
598            do_nibble!(13);
599            do_nibble!(14);
600            do_nibble!(15);
601
602            _mm_storeu_si128(ptr.add(i) as *mut _, result);
603            i += 16;
604        }
605
606        // Scalar tail
607        while i < len {
608            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
609            i += 1;
610        }
611    }
612}
613
614/// Translate bytes from source to destination using a 256-byte lookup table.
615/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
616#[inline(always)]
617fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
618    debug_assert!(dst.len() >= src.len());
619    #[cfg(target_arch = "x86_64")]
620    {
621        let level = get_simd_level();
622        if level >= 3 {
623            // Use nontemporal stores when dst is 32-byte aligned (large Vec allocations)
624            if dst.as_ptr() as usize & 31 == 0 {
625                unsafe { translate_to_avx2_table_nt(src, dst, table) };
626            } else {
627                unsafe { translate_to_avx2_table(src, dst, table) };
628            }
629            return;
630        }
631        if level >= 2 {
632            unsafe { translate_to_ssse3_table(src, dst, table) };
633            return;
634        }
635    }
636    translate_to_scalar(src, dst, table);
637}
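
// Cross-check sketch: translate_to must leave src untouched and fill dst with
// exactly the bytes a direct table index would produce, regardless of which
// SIMD path runs on this host.
#[cfg(test)]
mod translate_to_tests {
    use super::*;

    #[test]
    fn copies_translated_bytes_into_dst() {
        let table = build_translate_table(b"0123456789", b"##########");
        let src: Vec<u8> = b"tr 0-9 '#': 42 becomes ##".to_vec();
        let mut dst = vec![0u8; src.len()];
        translate_to(&src, &mut dst, &table);
        let expected: Vec<u8> = src.iter().map(|&b| table[b as usize]).collect();
        assert_eq!(dst, expected);
    }
}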
638
639/// Scalar fallback for translate_to.
640#[cfg(not(target_arch = "aarch64"))]
641#[inline(always)]
642fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
643    unsafe {
644        let sp = src.as_ptr();
645        let dp = dst.as_mut_ptr();
646        let len = src.len();
647        let mut i = 0;
648        while i + 8 <= len {
649            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
650            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
651            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
652            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
653            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
654            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
655            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
656            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
657            i += 8;
658        }
659        while i < len {
660            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
661            i += 1;
662        }
663    }
664}
665
666/// ARM64 NEON table-lookup translate_to using nibble decomposition.
667#[cfg(target_arch = "aarch64")]
668#[inline(always)]
669fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
670    unsafe { translate_to_neon_table(src, dst, table) };
671}
672
673#[cfg(target_arch = "aarch64")]
674#[target_feature(enable = "neon")]
675unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
676    use std::arch::aarch64::*;
677
678    unsafe {
679        let len = src.len();
680        let sp = src.as_ptr();
681        let dp = dst.as_mut_ptr();
682
683        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
684        for h in 0u8..16 {
685            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
686        }
687
688        let lo_mask = vdupq_n_u8(0x0F);
689        let mut i = 0;
690
691        while i + 16 <= len {
692            let input = vld1q_u8(sp.add(i));
693            let lo_nibble = vandq_u8(input, lo_mask);
694            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
695
696            let mut result = vdupq_n_u8(0);
697            macro_rules! do_nibble {
698                ($h:expr) => {
699                    let h_val = vdupq_n_u8($h);
700                    let mask = vceqq_u8(hi_nibble, h_val);
701                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
702                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
703                };
704            }
705            do_nibble!(0);
706            do_nibble!(1);
707            do_nibble!(2);
708            do_nibble!(3);
709            do_nibble!(4);
710            do_nibble!(5);
711            do_nibble!(6);
712            do_nibble!(7);
713            do_nibble!(8);
714            do_nibble!(9);
715            do_nibble!(10);
716            do_nibble!(11);
717            do_nibble!(12);
718            do_nibble!(13);
719            do_nibble!(14);
720            do_nibble!(15);
721
722            vst1q_u8(dp.add(i), result);
723            i += 16;
724        }
725
726        while i < len {
727            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
728            i += 1;
729        }
730    }
731}
732
733#[cfg(target_arch = "x86_64")]
734#[target_feature(enable = "avx2")]
735unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
736    use std::arch::x86_64::*;
737
738    unsafe {
739        let len = src.len();
740        let sp = src.as_ptr();
741        let dp = dst.as_mut_ptr();
742
743        // Pre-build 16 lookup vectors
744        let mut lut = [_mm256_setzero_si256(); 16];
745        for h in 0u8..16 {
746            let base = (h as usize) * 16;
747            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
748            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
749            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
750        }
751
752        let lo_mask = _mm256_set1_epi8(0x0F);
753
754        let mut i = 0;
755
756        // 2x unrolled: process 64 bytes per iteration for better ILP
757        while i + 64 <= len {
758            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
759            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
760
761            let lo0 = _mm256_and_si256(input0, lo_mask);
762            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
763            let lo1 = _mm256_and_si256(input1, lo_mask);
764            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
765
766            let mut r0 = _mm256_setzero_si256();
767            let mut r1 = _mm256_setzero_si256();
768
769            macro_rules! do_nibble2 {
770                ($h:expr) => {
771                    let h_val = _mm256_set1_epi8($h as i8);
772                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
773                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
774                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
775                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
776                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
777                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
778                };
779            }
780            do_nibble2!(0);
781            do_nibble2!(1);
782            do_nibble2!(2);
783            do_nibble2!(3);
784            do_nibble2!(4);
785            do_nibble2!(5);
786            do_nibble2!(6);
787            do_nibble2!(7);
788            do_nibble2!(8);
789            do_nibble2!(9);
790            do_nibble2!(10);
791            do_nibble2!(11);
792            do_nibble2!(12);
793            do_nibble2!(13);
794            do_nibble2!(14);
795            do_nibble2!(15);
796
797            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
798            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
799            i += 64;
800        }
801
802        // Remaining 32-byte chunk
803        if i + 32 <= len {
804            let input = _mm256_loadu_si256(sp.add(i) as *const _);
805            let lo_nibble = _mm256_and_si256(input, lo_mask);
806            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
807
808            let mut result = _mm256_setzero_si256();
809
810            macro_rules! do_nibble {
811                ($h:expr) => {
812                    let h_val = _mm256_set1_epi8($h as i8);
813                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
814                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
815                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
816                };
817            }
818            do_nibble!(0);
819            do_nibble!(1);
820            do_nibble!(2);
821            do_nibble!(3);
822            do_nibble!(4);
823            do_nibble!(5);
824            do_nibble!(6);
825            do_nibble!(7);
826            do_nibble!(8);
827            do_nibble!(9);
828            do_nibble!(10);
829            do_nibble!(11);
830            do_nibble!(12);
831            do_nibble!(13);
832            do_nibble!(14);
833            do_nibble!(15);
834
835            _mm256_storeu_si256(dp.add(i) as *mut _, result);
836            i += 32;
837        }
838
839        // SSSE3 tail for remaining 16-byte chunk
840        if i + 16 <= len {
841            let lo_mask128 = _mm_set1_epi8(0x0F);
842            let mut lut128 = [_mm_setzero_si128(); 16];
843            for h in 0u8..16 {
844                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
845            }
846
847            let input = _mm_loadu_si128(sp.add(i) as *const _);
848            let lo_nib = _mm_and_si128(input, lo_mask128);
849            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
850
851            let mut res = _mm_setzero_si128();
852            macro_rules! do_nibble128 {
853                ($h:expr) => {
854                    let h_val = _mm_set1_epi8($h as i8);
855                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
856                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
857                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
858                };
859            }
860            do_nibble128!(0);
861            do_nibble128!(1);
862            do_nibble128!(2);
863            do_nibble128!(3);
864            do_nibble128!(4);
865            do_nibble128!(5);
866            do_nibble128!(6);
867            do_nibble128!(7);
868            do_nibble128!(8);
869            do_nibble128!(9);
870            do_nibble128!(10);
871            do_nibble128!(11);
872            do_nibble128!(12);
873            do_nibble128!(13);
874            do_nibble128!(14);
875            do_nibble128!(15);
876
877            _mm_storeu_si128(dp.add(i) as *mut _, res);
878            i += 16;
879        }
880
881        // Scalar tail
882        while i < len {
883            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
884            i += 1;
885        }
886    }
887}
888
889/// Nontemporal variant of translate_to_avx2_table: uses _mm256_stream_si256 for stores.
890/// Avoids RFO cache traffic for the destination buffer in streaming translate operations.
891#[cfg(target_arch = "x86_64")]
892#[target_feature(enable = "avx2")]
893unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
894    use std::arch::x86_64::*;
895
896    unsafe {
897        let len = src.len();
898        let sp = src.as_ptr();
899        let dp = dst.as_mut_ptr();
900
901        // Pre-build 16 lookup vectors
902        let mut lut = [_mm256_setzero_si256(); 16];
903        for h in 0u8..16 {
904            let base = (h as usize) * 16;
905            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
906            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
907            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
908        }
909
910        let lo_mask = _mm256_set1_epi8(0x0F);
911        let mut i = 0;
912
913        // 2x unrolled with nontemporal stores
914        while i + 64 <= len {
915            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
916            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
917
918            let lo0 = _mm256_and_si256(input0, lo_mask);
919            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
920            let lo1 = _mm256_and_si256(input1, lo_mask);
921            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
922
923            let mut r0 = _mm256_setzero_si256();
924            let mut r1 = _mm256_setzero_si256();
925
926            macro_rules! do_nibble2 {
927                ($h:expr) => {
928                    let h_val = _mm256_set1_epi8($h as i8);
929                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
930                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
931                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
932                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
933                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
934                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
935                };
936            }
937            do_nibble2!(0);
938            do_nibble2!(1);
939            do_nibble2!(2);
940            do_nibble2!(3);
941            do_nibble2!(4);
942            do_nibble2!(5);
943            do_nibble2!(6);
944            do_nibble2!(7);
945            do_nibble2!(8);
946            do_nibble2!(9);
947            do_nibble2!(10);
948            do_nibble2!(11);
949            do_nibble2!(12);
950            do_nibble2!(13);
951            do_nibble2!(14);
952            do_nibble2!(15);
953
954            _mm256_stream_si256(dp.add(i) as *mut _, r0);
955            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
956            i += 64;
957        }
958
959        // Remaining 32-byte chunk
960        if i + 32 <= len {
961            let input = _mm256_loadu_si256(sp.add(i) as *const _);
962            let lo_nibble = _mm256_and_si256(input, lo_mask);
963            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
964
965            let mut result = _mm256_setzero_si256();
966            macro_rules! do_nibble {
967                ($h:expr) => {
968                    let h_val = _mm256_set1_epi8($h as i8);
969                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
970                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
971                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
972                };
973            }
974            do_nibble!(0);
975            do_nibble!(1);
976            do_nibble!(2);
977            do_nibble!(3);
978            do_nibble!(4);
979            do_nibble!(5);
980            do_nibble!(6);
981            do_nibble!(7);
982            do_nibble!(8);
983            do_nibble!(9);
984            do_nibble!(10);
985            do_nibble!(11);
986            do_nibble!(12);
987            do_nibble!(13);
988            do_nibble!(14);
989            do_nibble!(15);
990
991            _mm256_stream_si256(dp.add(i) as *mut _, result);
992            i += 32;
993        }
994
995        // SSSE3 tail for remaining 16-byte chunk (regular store)
996        if i + 16 <= len {
997            let lo_mask128 = _mm_set1_epi8(0x0F);
998            let mut lut128 = [_mm_setzero_si128(); 16];
999            for h in 0u8..16 {
1000                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
1001            }
1002
1003            let input = _mm_loadu_si128(sp.add(i) as *const _);
1004            let lo_nib = _mm_and_si128(input, lo_mask128);
1005            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1006
1007            let mut res = _mm_setzero_si128();
1008            macro_rules! do_nibble128 {
1009                ($h:expr) => {
1010                    let h_val = _mm_set1_epi8($h as i8);
1011                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1012                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1013                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1014                };
1015            }
1016            do_nibble128!(0);
1017            do_nibble128!(1);
1018            do_nibble128!(2);
1019            do_nibble128!(3);
1020            do_nibble128!(4);
1021            do_nibble128!(5);
1022            do_nibble128!(6);
1023            do_nibble128!(7);
1024            do_nibble128!(8);
1025            do_nibble128!(9);
1026            do_nibble128!(10);
1027            do_nibble128!(11);
1028            do_nibble128!(12);
1029            do_nibble128!(13);
1030            do_nibble128!(14);
1031            do_nibble128!(15);
1032
1033            _mm_storeu_si128(dp.add(i) as *mut _, res);
1034            i += 16;
1035        }
1036
1037        // Scalar tail
1038        while i < len {
1039            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1040            i += 1;
1041        }
1042
1043        // Fence: ensure nontemporal stores are visible before write() syscall
1044        _mm_sfence();
1045    }
1046}
1047
1048#[cfg(target_arch = "x86_64")]
1049#[target_feature(enable = "ssse3")]
1050unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1051    use std::arch::x86_64::*;
1052
1053    unsafe {
1054        let len = src.len();
1055        let sp = src.as_ptr();
1056        let dp = dst.as_mut_ptr();
1057
1058        let mut lut = [_mm_setzero_si128(); 16];
1059        for h in 0u8..16 {
1060            let base = (h as usize) * 16;
1061            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1062            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1063        }
1064
1065        let lo_mask = _mm_set1_epi8(0x0F);
1066
1067        let mut i = 0;
1068        while i + 16 <= len {
1069            let input = _mm_loadu_si128(sp.add(i) as *const _);
1070            let lo_nibble = _mm_and_si128(input, lo_mask);
1071            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1072
1073            let mut result = _mm_setzero_si128();
1074
1075            macro_rules! do_nibble {
1076                ($h:expr) => {
1077                    let h_val = _mm_set1_epi8($h as i8);
1078                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1079                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1080                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1081                };
1082            }
1083            do_nibble!(0);
1084            do_nibble!(1);
1085            do_nibble!(2);
1086            do_nibble!(3);
1087            do_nibble!(4);
1088            do_nibble!(5);
1089            do_nibble!(6);
1090            do_nibble!(7);
1091            do_nibble!(8);
1092            do_nibble!(9);
1093            do_nibble!(10);
1094            do_nibble!(11);
1095            do_nibble!(12);
1096            do_nibble!(13);
1097            do_nibble!(14);
1098            do_nibble!(15);
1099
1100            _mm_storeu_si128(dp.add(i) as *mut _, result);
1101            i += 16;
1102        }
1103
1104        // Scalar tail
1105        while i < len {
1106            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1107            i += 1;
1108        }
1109    }
1110}
1111
1112// ============================================================================
1113// SIMD range translation (x86_64)
1114// ============================================================================
1115
1116/// Detect if the translate table is a single contiguous range with constant offset.
1117/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
1118/// table[i] = i + offset for all i in [lo, hi].
1119#[inline]
1120fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1121    let mut lo: Option<u8> = None;
1122    let mut hi = 0u8;
1123    let mut offset = 0i16;
1124
1125    for i in 0..256 {
1126        if table[i] != i as u8 {
1127            let diff = table[i] as i16 - i as i16;
1128            match lo {
1129                None => {
1130                    lo = Some(i as u8);
1131                    hi = i as u8;
1132                    offset = diff;
1133                }
1134                Some(_) => {
1135                    if diff != offset || i as u8 != hi.wrapping_add(1) {
1136                        return None;
1137                    }
1138                    hi = i as u8;
1139                }
1140            }
1141        }
1142    }
1143
1144    lo.map(|l| (l, hi, offset as i8))
1145}
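
// Illustrative checks of the detector: `tr a-z A-Z` builds a table where
// [b'a'..=b'z'] is shifted by a constant -32 and everything else is identity,
// while a non-contiguous set1 must be rejected.
#[cfg(test)]
mod range_offset_tests {
    use super::*;

    #[test]
    fn detects_lowercase_to_uppercase() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }

    #[test]
    fn rejects_non_contiguous_set1() {
        let table = build_translate_table(b"ac", b"xz"); // gap at b'b'
        assert_eq!(detect_range_offset(&table), None);
    }
}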
1146
1147/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
1148/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
1149/// a range maps to one replacement character.
1150/// Returns Some((lo, hi, replacement)) if the pattern matches.
1151#[inline]
1152fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1153    let mut lo: Option<u8> = None;
1154    let mut hi = 0u8;
1155    let mut replacement = 0u8;
1156
1157    for i in 0..256 {
1158        if table[i] != i as u8 {
1159            match lo {
1160                None => {
1161                    lo = Some(i as u8);
1162                    hi = i as u8;
1163                    replacement = table[i];
1164                }
1165                Some(_) => {
1166                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1167                        return None;
1168                    }
1169                    hi = i as u8;
1170                }
1171            }
1172        }
1173    }
1174
1175    lo.map(|l| (l, hi, replacement))
1176}
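
// Illustrative check of the `tr '\000-\037' 'X'` case described above: the
// whole control range maps to one replacement byte.
#[cfg(test)]
mod range_to_constant_tests {
    use super::*;

    #[test]
    fn detects_control_range_to_single_byte() {
        let set1: Vec<u8> = (0u8..=0x1F).collect();
        let table = build_translate_table(&set1, b"X");
        assert_eq!(detect_range_to_constant(&table), Some((0x00, 0x1F, b'X')));
    }
}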
1177
1178/// SIMD-accelerated range-to-constant translation.
1179/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
1180/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
1181/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
1182#[cfg(target_arch = "x86_64")]
1183fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1184    if get_simd_level() >= 3 {
1185        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1186    } else {
1187        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1188    }
1189}
1190
1191#[cfg(target_arch = "x86_64")]
1192#[target_feature(enable = "avx2")]
1193unsafe fn translate_range_to_constant_avx2_inplace(
1194    data: &mut [u8],
1195    lo: u8,
1196    hi: u8,
1197    replacement: u8,
1198) {
1199    use std::arch::x86_64::*;
1200
1201    unsafe {
1202        let range = hi - lo;
1203        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1204        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1205        let repl_v = _mm256_set1_epi8(replacement as i8);
1206        let zero = _mm256_setzero_si256();
1207
1208        let len = data.len();
1209        let ptr = data.as_mut_ptr();
1210        let mut i = 0;
1211
1212        // 2x unrolled: process 64 bytes per iteration for better ILP
1213        while i + 64 <= len {
1214            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1215            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1216            let bi0 = _mm256_add_epi8(in0, bias_v);
1217            let bi1 = _mm256_add_epi8(in1, bias_v);
1218            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1219            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1220            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1221            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1222            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1223            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1224            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1225            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1226            i += 64;
1227        }
1228
1229        // Remaining 32-byte chunk
1230        if i + 32 <= len {
1231            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1232            let biased = _mm256_add_epi8(input, bias_v);
1233            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1234            let in_range = _mm256_cmpeq_epi8(gt, zero);
1235            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1236            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1237            i += 32;
1238        }
1239
1240        if i + 16 <= len {
1241            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1242            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1243            let repl_v128 = _mm_set1_epi8(replacement as i8);
1244            let zero128 = _mm_setzero_si128();
1245
1246            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1247            let biased = _mm_add_epi8(input, bias_v128);
1248            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1249            let in_range = _mm_cmpeq_epi8(gt, zero128);
1250            let result = _mm_blendv_epi8(input, repl_v128, in_range);
1251            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1252            i += 16;
1253        }
1254
1255        while i < len {
1256            let b = *ptr.add(i);
1257            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1258            i += 1;
1259        }
1260    }
1261}
1262
1263#[cfg(target_arch = "x86_64")]
1264#[target_feature(enable = "sse2")]
1265unsafe fn translate_range_to_constant_sse2_inplace(
1266    data: &mut [u8],
1267    lo: u8,
1268    hi: u8,
1269    replacement: u8,
1270) {
1271    use std::arch::x86_64::*;
1272
1273    unsafe {
1274        let range = hi - lo;
1275        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1276        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1277        let repl_v = _mm_set1_epi8(replacement as i8);
1278        let zero = _mm_setzero_si128();
1279
1280        let len = data.len();
1281        let ptr = data.as_mut_ptr();
1282        let mut i = 0;
1283
1284        while i + 16 <= len {
1285            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1286            let biased = _mm_add_epi8(input, bias_v);
1287            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1288            // in_range mask: 0xFF where in range, 0x00 where not
1289            let in_range = _mm_cmpeq_epi8(gt, zero);
1290            // SSE2 blendv: (repl & mask) | (input & ~mask)
1291            let result = _mm_or_si128(
1292                _mm_and_si128(in_range, repl_v),
1293                _mm_andnot_si128(in_range, input),
1294            );
1295            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1296            i += 16;
1297        }
1298
1299        while i < len {
1300            let b = *ptr.add(i);
1301            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1302            i += 1;
1303        }
1304    }
1305}
1306
1307#[cfg(target_arch = "aarch64")]
1308fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1309    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1310}
1311
1312#[cfg(target_arch = "aarch64")]
1313#[target_feature(enable = "neon")]
1314unsafe fn translate_range_to_constant_neon_inplace(
1315    data: &mut [u8],
1316    lo: u8,
1317    hi: u8,
1318    replacement: u8,
1319) {
1320    use std::arch::aarch64::*;
1321
1322    unsafe {
1323        let len = data.len();
1324        let ptr = data.as_mut_ptr();
1325        let lo_v = vdupq_n_u8(lo);
1326        let hi_v = vdupq_n_u8(hi);
1327        let repl_v = vdupq_n_u8(replacement);
1328        let mut i = 0;
1329
1330        while i + 32 <= len {
1331            let in0 = vld1q_u8(ptr.add(i));
1332            let in1 = vld1q_u8(ptr.add(i + 16));
1333            let ge0 = vcgeq_u8(in0, lo_v);
1334            let le0 = vcleq_u8(in0, hi_v);
1335            let mask0 = vandq_u8(ge0, le0);
1336            let ge1 = vcgeq_u8(in1, lo_v);
1337            let le1 = vcleq_u8(in1, hi_v);
1338            let mask1 = vandq_u8(ge1, le1);
1339            // bsl: select repl where mask, keep input where not
1340            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1341            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1342            i += 32;
1343        }
1344
1345        if i + 16 <= len {
1346            let input = vld1q_u8(ptr.add(i));
1347            let ge = vcgeq_u8(input, lo_v);
1348            let le = vcleq_u8(input, hi_v);
1349            let mask = vandq_u8(ge, le);
1350            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1351            i += 16;
1352        }
1353
1354        while i < len {
1355            let b = *ptr.add(i);
1356            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1357            i += 1;
1358        }
1359    }
1360}
1361
1362#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1363fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1364    for b in data.iter_mut() {
1365        if *b >= lo && *b <= hi {
1366            *b = replacement;
1367        }
1368    }
1369}
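
// Cross-check sketch: whichever range-to-constant path is compiled for this
// host (AVX2/SSE2 biased compare, NEON, or the portable loop) must agree with
// the obvious per-byte range check, including bytes just outside [lo, hi].
#[cfg(test)]
mod range_to_constant_inplace_tests {
    use super::*;

    #[test]
    fn matches_scalar_range_check() {
        let (lo, hi, repl) = (0x20u8, 0x7Eu8, b'.');
        let mut data: Vec<u8> = (0u16..=255).map(|b| b as u8).cycle().take(300).collect();
        let expected: Vec<u8> = data
            .iter()
            .map(|&b| if b >= lo && b <= hi { repl } else { b })
            .collect();
        translate_range_to_constant_simd_inplace(&mut data, lo, hi, repl);
        assert_eq!(data, expected);
    }
}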
1370
1371/// SIMD range-to-constant translation from src to dst (no intermediate copy needed).
1372/// Reads from src, writes translated result to dst in a single pass.
1373#[cfg(target_arch = "x86_64")]
1374fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1375    if get_simd_level() >= 3 {
1376        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1377    } else {
1378        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1379    }
1380}
1381
1382#[cfg(target_arch = "aarch64")]
1383fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1384    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1385}
1386
1387#[cfg(target_arch = "aarch64")]
1388#[target_feature(enable = "neon")]
1389unsafe fn translate_range_to_constant_neon(
1390    src: &[u8],
1391    dst: &mut [u8],
1392    lo: u8,
1393    hi: u8,
1394    replacement: u8,
1395) {
1396    use std::arch::aarch64::*;
1397
1398    unsafe {
1399        let len = src.len();
1400        let sp = src.as_ptr();
1401        let dp = dst.as_mut_ptr();
1402        let lo_v = vdupq_n_u8(lo);
1403        let hi_v = vdupq_n_u8(hi);
1404        let repl_v = vdupq_n_u8(replacement);
1405        let mut i = 0;
1406
1407        while i + 32 <= len {
1408            let in0 = vld1q_u8(sp.add(i));
1409            let in1 = vld1q_u8(sp.add(i + 16));
1410            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1411            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1412            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1413            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1414            i += 32;
1415        }
1416
1417        if i + 16 <= len {
1418            let input = vld1q_u8(sp.add(i));
1419            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1420            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1421            i += 16;
1422        }
1423
1424        while i < len {
1425            let b = *sp.add(i);
1426            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1427            i += 1;
1428        }
1429    }
1430}
1431
1432#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1433fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1434    for (i, &b) in src.iter().enumerate() {
1435        unsafe {
1436            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1437        }
1438    }
1439}
1440
1441#[cfg(target_arch = "x86_64")]
1442#[target_feature(enable = "avx2")]
1443unsafe fn translate_range_to_constant_avx2(
1444    src: &[u8],
1445    dst: &mut [u8],
1446    lo: u8,
1447    hi: u8,
1448    replacement: u8,
1449) {
1450    use std::arch::x86_64::*;
1451    unsafe {
1452        let range = hi - lo;
1453        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1454        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1455        let repl_v = _mm256_set1_epi8(replacement as i8);
1456        let zero = _mm256_setzero_si256();
1457        let len = src.len();
1458        let sp = src.as_ptr();
1459        let dp = dst.as_mut_ptr();
1460        let mut i = 0;
1461        while i + 64 <= len {
1462            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1463            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1464            let bi0 = _mm256_add_epi8(in0, bias_v);
1465            let bi1 = _mm256_add_epi8(in1, bias_v);
1466            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1467            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1468            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1469            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1470            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1471            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1472            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1473            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1474            i += 64;
1475        }
1476        if i + 32 <= len {
1477            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1478            let biased = _mm256_add_epi8(input, bias_v);
1479            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1480            let in_range = _mm256_cmpeq_epi8(gt, zero);
1481            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1482            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1483            i += 32;
1484        }
1485        while i < len {
1486            let b = *sp.add(i);
1487            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1488            i += 1;
1489        }
1490    }
1491}
1492
1493#[cfg(target_arch = "x86_64")]
1494#[target_feature(enable = "sse2")]
1495unsafe fn translate_range_to_constant_sse2(
1496    src: &[u8],
1497    dst: &mut [u8],
1498    lo: u8,
1499    hi: u8,
1500    replacement: u8,
1501) {
1502    use std::arch::x86_64::*;
1503    unsafe {
1504        let range = hi - lo;
1505        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1506        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1507        let repl_v = _mm_set1_epi8(replacement as i8);
1508        let zero = _mm_setzero_si128();
1509        let len = src.len();
1510        let sp = src.as_ptr();
1511        let dp = dst.as_mut_ptr();
1512        let mut i = 0;
1513        while i + 16 <= len {
1514            let input = _mm_loadu_si128(sp.add(i) as *const _);
1515            let biased = _mm_add_epi8(input, bias_v);
1516            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1517            let in_range = _mm_cmpeq_epi8(gt, zero);
1518            let result = _mm_or_si128(
1519                _mm_and_si128(in_range, repl_v),
1520                _mm_andnot_si128(in_range, input),
1521            );
1522            _mm_storeu_si128(dp.add(i) as *mut _, result);
1523            i += 16;
1524        }
1525        while i < len {
1526            let b = *sp.add(i);
1527            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1528            i += 1;
1529        }
1530    }
1531}
1532
1533/// SIMD-accelerated range translation for mmap'd data.
1534/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
1535/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
1536/// When dst is 32-byte aligned (typically true for large Vec allocations, which the
1537/// allocator serves via mmap), uses nontemporal stores to bypass the cache, avoiding
1538/// read-for-ownership overhead and reducing memory traffic by ~33% for streaming writes.
1539#[cfg(target_arch = "x86_64")]
1540fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1541    if get_simd_level() >= 3 {
1542        // Use nontemporal stores when dst is 32-byte aligned (typical for large allocs)
1543        if dst.as_ptr() as usize & 31 == 0 {
1544            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1545        } else {
1546            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1547        }
1548    } else {
1549        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1550    }
1551}
1552
1553#[cfg(target_arch = "x86_64")]
1554#[target_feature(enable = "avx2")]
1555unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1556    use std::arch::x86_64::*;
1557
1558    unsafe {
1559        let range = hi - lo;
1560        // Bias: shift range so lo maps to -128 (signed min).
1561        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
1562        // For input < lo: biased wraps to large positive (signed), > threshold.
1563        // For input > hi: biased > -128+range, > threshold.
1564        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1565        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1566        let offset_v = _mm256_set1_epi8(offset);
1567        let zero = _mm256_setzero_si256();
1568
1569        let len = src.len();
1570        let sp = src.as_ptr();
1571        let dp = dst.as_mut_ptr();
1572        let mut i = 0;
1573
1574        // 2x unrolled: process 64 bytes per iteration for better ILP.
1575        // Load/compute on the second vector while the first is in-flight.
1576        while i + 64 <= len {
1577            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1578            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1579            let bi0 = _mm256_add_epi8(in0, bias_v);
1580            let bi1 = _mm256_add_epi8(in1, bias_v);
1581            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1582            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1583            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1584            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1585            let om0 = _mm256_and_si256(m0, offset_v);
1586            let om1 = _mm256_and_si256(m1, offset_v);
1587            let r0 = _mm256_add_epi8(in0, om0);
1588            let r1 = _mm256_add_epi8(in1, om1);
1589            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1590            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1591            i += 64;
1592        }
1593
1594        // Remaining 32-byte chunk
1595        if i + 32 <= len {
1596            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1597            let biased = _mm256_add_epi8(input, bias_v);
1598            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1599            let mask = _mm256_cmpeq_epi8(gt, zero);
1600            let offset_masked = _mm256_and_si256(mask, offset_v);
1601            let result = _mm256_add_epi8(input, offset_masked);
1602            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1603            i += 32;
1604        }
1605
1606        // SSE2 tail for 16-byte remainder
1607        if i + 16 <= len {
1608            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1609            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1610            let offset_v128 = _mm_set1_epi8(offset);
1611            let zero128 = _mm_setzero_si128();
1612
1613            let input = _mm_loadu_si128(sp.add(i) as *const _);
1614            let biased = _mm_add_epi8(input, bias_v128);
1615            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1616            let mask = _mm_cmpeq_epi8(gt, zero128);
1617            let offset_masked = _mm_and_si128(mask, offset_v128);
1618            let result = _mm_add_epi8(input, offset_masked);
1619            _mm_storeu_si128(dp.add(i) as *mut _, result);
1620            i += 16;
1621        }
1622
1623        // Scalar tail
1624        while i < len {
1625            let b = *sp.add(i);
1626            *dp.add(i) = if b >= lo && b <= hi {
1627                b.wrapping_add(offset as u8)
1628            } else {
1629                b
1630            };
1631            i += 1;
1632        }
1633    }
1634}
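
// A minimal scalar model of the bias+threshold range test used by the vector
// paths above, kept as a unit test. It mirrors one SIMD lane (wrapping add,
// signed compare) and checks it against the plain `lo <= b && b <= hi`
// predicate for every byte value; lo <= hi is assumed, as in the callers.
#[cfg(test)]
mod bias_threshold_model_tests {
    #[test]
    fn biased_signed_compare_matches_plain_range_check() {
        // Bias so that `lo` maps to i8::MIN; a byte is then in [lo, hi] exactly
        // when the biased value, viewed as signed, is <= -128 + (hi - lo).
        fn in_range_biased(b: u8, lo: u8, hi: u8) -> bool {
            let range = hi - lo;
            let biased = b.wrapping_add(0x80u8.wrapping_sub(lo)) as i8;
            let threshold = 0x80u8.wrapping_add(range) as i8;
            biased <= threshold
        }
        for &(lo, hi) in &[(b'0', b'9'), (b'a', b'z'), (0u8, 0x1f), (0x20, 0xff)] {
            for b in 0u8..=255 {
                assert_eq!(in_range_biased(b, lo, hi), b >= lo && b <= hi);
            }
        }
    }
}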
1635
1636/// Nontemporal variant of translate_range_avx2: uses _mm256_stream_si256 for stores.
1637/// This bypasses the cache for writes, avoiding read-for-ownership (RFO) traffic on
1638/// the destination buffer. For streaming translate (src → dst, dst not read again),
1639/// this reduces memory traffic by ~33% (10MB input: 20MB vs 30MB total traffic).
1640/// Requires dst to be 32-byte aligned; the dispatcher checks alignment and only selects this variant when that holds.
1641#[cfg(target_arch = "x86_64")]
1642#[target_feature(enable = "avx2")]
1643unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1644    use std::arch::x86_64::*;
1645
1646    unsafe {
1647        let range = hi - lo;
1648        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1649        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1650        let offset_v = _mm256_set1_epi8(offset);
1651        let zero = _mm256_setzero_si256();
1652
1653        let len = src.len();
1654        let sp = src.as_ptr();
1655        let dp = dst.as_mut_ptr();
1656        let mut i = 0;
1657
1658        // 2x unrolled with nontemporal stores
1659        while i + 64 <= len {
1660            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1661            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1662            let bi0 = _mm256_add_epi8(in0, bias_v);
1663            let bi1 = _mm256_add_epi8(in1, bias_v);
1664            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1665            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1666            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1667            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1668            let om0 = _mm256_and_si256(m0, offset_v);
1669            let om1 = _mm256_and_si256(m1, offset_v);
1670            let r0 = _mm256_add_epi8(in0, om0);
1671            let r1 = _mm256_add_epi8(in1, om1);
1672            _mm256_stream_si256(dp.add(i) as *mut _, r0);
1673            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1674            i += 64;
1675        }
1676
1677        // Remaining 32-byte chunk (still nontemporal if aligned)
1678        if i + 32 <= len {
1679            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1680            let biased = _mm256_add_epi8(input, bias_v);
1681            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1682            let mask = _mm256_cmpeq_epi8(gt, zero);
1683            let offset_masked = _mm256_and_si256(mask, offset_v);
1684            let result = _mm256_add_epi8(input, offset_masked);
1685            _mm256_stream_si256(dp.add(i) as *mut _, result);
1686            i += 32;
1687        }
1688
1689        // SSE2 tail for 16-byte remainder (regular store — only 16 bytes)
1690        if i + 16 <= len {
1691            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1692            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1693            let offset_v128 = _mm_set1_epi8(offset);
1694            let zero128 = _mm_setzero_si128();
1695
1696            let input = _mm_loadu_si128(sp.add(i) as *const _);
1697            let biased = _mm_add_epi8(input, bias_v128);
1698            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1699            let mask = _mm_cmpeq_epi8(gt, zero128);
1700            let offset_masked = _mm_and_si128(mask, offset_v128);
1701            let result = _mm_add_epi8(input, offset_masked);
1702            _mm_storeu_si128(dp.add(i) as *mut _, result);
1703            i += 16;
1704        }
1705
1706        // Scalar tail
1707        while i < len {
1708            let b = *sp.add(i);
1709            *dp.add(i) = if b >= lo && b <= hi {
1710                b.wrapping_add(offset as u8)
1711            } else {
1712                b
1713            };
1714            i += 1;
1715        }
1716
1717        // Fence: ensure nontemporal stores are visible before write() syscall
1718        _mm_sfence();
1719    }
1720}
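
// Equivalence check for the nontemporal path: translating into a 32-byte-aligned
// destination (which selects the streaming-store variant on AVX2 machines) must
// produce the same bytes as translating into a deliberately misaligned one.
// The aligned window is carved out of an oversized Vec via align_offset.
#[cfg(all(test, target_arch = "x86_64"))]
mod translate_range_nt_tests {
    use super::translate_range_simd;

    #[test]
    fn aligned_and_misaligned_destinations_agree() {
        let src: Vec<u8> = (0u8..=255).cycle().take(4096).collect();

        // 32-byte-aligned destination window: takes the nontemporal store path.
        let mut raw = vec![0u8; 4096 + 64];
        let start = raw.as_ptr().align_offset(32);
        translate_range_simd(&src, &mut raw[start..start + 4096], b'A', b'Z', 32);

        // Deliberately misaligned destination window: takes the regular store path.
        let mut raw2 = vec![0u8; 4096 + 64];
        let start2 = raw2.as_ptr().align_offset(32) + 1;
        translate_range_simd(&src, &mut raw2[start2..start2 + 4096], b'A', b'Z', 32);

        assert_eq!(&raw[start..start + 4096], &raw2[start2..start2 + 4096]);
    }
}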
1721
1722#[cfg(target_arch = "x86_64")]
1723#[target_feature(enable = "sse2")]
1724unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1725    use std::arch::x86_64::*;
1726
1727    unsafe {
1728        let range = hi - lo;
1729        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1730        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1731        let offset_v = _mm_set1_epi8(offset);
1732        let zero = _mm_setzero_si128();
1733
1734        let len = src.len();
1735        let mut i = 0;
1736
1737        while i + 16 <= len {
1738            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1739            let biased = _mm_add_epi8(input, bias_v);
1740            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1741            let mask = _mm_cmpeq_epi8(gt, zero);
1742            let offset_masked = _mm_and_si128(mask, offset_v);
1743            let result = _mm_add_epi8(input, offset_masked);
1744            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1745            i += 16;
1746        }
1747
1748        while i < len {
1749            let b = *src.get_unchecked(i);
1750            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1751                b.wrapping_add(offset as u8)
1752            } else {
1753                b
1754            };
1755            i += 1;
1756        }
1757    }
1758}
1759
1760/// ARM64 NEON-accelerated range translation.
1761/// Processes 16 bytes per iteration using vectorized range check + conditional add.
1762#[cfg(target_arch = "aarch64")]
1763fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1764    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1765}
1766
1767#[cfg(target_arch = "aarch64")]
1768#[target_feature(enable = "neon")]
1769unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1770    use std::arch::aarch64::*;
1771
1772    unsafe {
1773        let len = src.len();
1774        let sp = src.as_ptr();
1775        let dp = dst.as_mut_ptr();
1776        let lo_v = vdupq_n_u8(lo);
1777        let hi_v = vdupq_n_u8(hi);
1778        let offset_v = vdupq_n_s8(offset);
1779        let mut i = 0;
1780
1781        // 2x unrolled: process 32 bytes per iteration
1782        while i + 32 <= len {
1783            let in0 = vld1q_u8(sp.add(i));
1784            let in1 = vld1q_u8(sp.add(i + 16));
1785            // Range check: (b >= lo) & (b <= hi)
1786            let ge0 = vcgeq_u8(in0, lo_v);
1787            let le0 = vcleq_u8(in0, hi_v);
1788            let mask0 = vandq_u8(ge0, le0);
1789            let ge1 = vcgeq_u8(in1, lo_v);
1790            let le1 = vcleq_u8(in1, hi_v);
1791            let mask1 = vandq_u8(ge1, le1);
1792            // Conditional add: in + (offset & mask)
1793            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1794            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1795            let r0 = vaddq_u8(in0, off0);
1796            let r1 = vaddq_u8(in1, off1);
1797            vst1q_u8(dp.add(i), r0);
1798            vst1q_u8(dp.add(i + 16), r1);
1799            i += 32;
1800        }
1801
1802        if i + 16 <= len {
1803            let input = vld1q_u8(sp.add(i));
1804            let ge = vcgeq_u8(input, lo_v);
1805            let le = vcleq_u8(input, hi_v);
1806            let mask = vandq_u8(ge, le);
1807            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1808            vst1q_u8(dp.add(i), vaddq_u8(input, off));
1809            i += 16;
1810        }
1811
1812        while i < len {
1813            let b = *sp.add(i);
1814            *dp.add(i) = if b >= lo && b <= hi {
1815                b.wrapping_add(offset as u8)
1816            } else {
1817                b
1818            };
1819            i += 1;
1820        }
1821    }
1822}
1823
1824/// Scalar range translation fallback for non-x86_64, non-aarch64 platforms.
1825#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1826fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1827    let offset_u8 = offset as u8;
1828    let range = hi.wrapping_sub(lo);
1829    unsafe {
1830        let sp = src.as_ptr();
1831        let dp = dst.as_mut_ptr();
1832        let len = src.len();
1833        let mut i = 0;
1834        while i + 8 <= len {
1835            macro_rules! do_byte {
1836                ($off:expr) => {{
1837                    let b = *sp.add(i + $off);
1838                    let in_range = b.wrapping_sub(lo) <= range;
1839                    *dp.add(i + $off) = if in_range {
1840                        b.wrapping_add(offset_u8)
1841                    } else {
1842                        b
1843                    };
1844                }};
1845            }
1846            do_byte!(0);
1847            do_byte!(1);
1848            do_byte!(2);
1849            do_byte!(3);
1850            do_byte!(4);
1851            do_byte!(5);
1852            do_byte!(6);
1853            do_byte!(7);
1854            i += 8;
1855        }
1856        while i < len {
1857            let b = *sp.add(i);
1858            let in_range = b.wrapping_sub(lo) <= range;
1859            *dp.add(i) = if in_range {
1860                b.wrapping_add(offset_u8)
1861            } else {
1862                b
1863            };
1864            i += 1;
1865        }
1866    }
1867}
1868
1869// ============================================================================
1870// In-place SIMD range translation (saves one buffer allocation in streaming)
1871// ============================================================================
1872
1873/// In-place SIMD-accelerated range translation.
1874/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
1875/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
1876#[cfg(target_arch = "x86_64")]
1877fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1878    if get_simd_level() >= 3 {
1879        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1880    } else {
1881        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1882    }
1883}
1884
1885#[cfg(target_arch = "x86_64")]
1886#[target_feature(enable = "avx2")]
1887unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1888    use std::arch::x86_64::*;
1889
1890    unsafe {
1891        let range = hi - lo;
1892        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1893        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1894        let offset_v = _mm256_set1_epi8(offset);
1895        let zero = _mm256_setzero_si256();
1896
1897        let len = data.len();
1898        let ptr = data.as_mut_ptr();
1899        let mut i = 0;
1900
1901        // 2x unrolled: process 64 bytes per iteration for better ILP
1902        while i + 64 <= len {
1903            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1904            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1905            let bi0 = _mm256_add_epi8(in0, bias_v);
1906            let bi1 = _mm256_add_epi8(in1, bias_v);
1907            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1908            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1909            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1910            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1911            let om0 = _mm256_and_si256(m0, offset_v);
1912            let om1 = _mm256_and_si256(m1, offset_v);
1913            let r0 = _mm256_add_epi8(in0, om0);
1914            let r1 = _mm256_add_epi8(in1, om1);
1915            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1916            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1917            i += 64;
1918        }
1919
1920        // Remaining 32-byte chunk
1921        if i + 32 <= len {
1922            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1923            let biased = _mm256_add_epi8(input, bias_v);
1924            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1925            let mask = _mm256_cmpeq_epi8(gt, zero);
1926            let offset_masked = _mm256_and_si256(mask, offset_v);
1927            let result = _mm256_add_epi8(input, offset_masked);
1928            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1929            i += 32;
1930        }
1931
1932        if i + 16 <= len {
1933            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1934            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1935            let offset_v128 = _mm_set1_epi8(offset);
1936            let zero128 = _mm_setzero_si128();
1937
1938            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1939            let biased = _mm_add_epi8(input, bias_v128);
1940            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1941            let mask = _mm_cmpeq_epi8(gt, zero128);
1942            let offset_masked = _mm_and_si128(mask, offset_v128);
1943            let result = _mm_add_epi8(input, offset_masked);
1944            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1945            i += 16;
1946        }
1947
1948        while i < len {
1949            let b = *ptr.add(i);
1950            *ptr.add(i) = if b >= lo && b <= hi {
1951                b.wrapping_add(offset as u8)
1952            } else {
1953                b
1954            };
1955            i += 1;
1956        }
1957    }
1958}
1959
1960#[cfg(target_arch = "x86_64")]
1961#[target_feature(enable = "sse2")]
1962unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1963    use std::arch::x86_64::*;
1964
1965    unsafe {
1966        let range = hi - lo;
1967        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1968        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1969        let offset_v = _mm_set1_epi8(offset);
1970        let zero = _mm_setzero_si128();
1971
1972        let len = data.len();
1973        let ptr = data.as_mut_ptr();
1974        let mut i = 0;
1975
1976        while i + 16 <= len {
1977            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1978            let biased = _mm_add_epi8(input, bias_v);
1979            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1980            let mask = _mm_cmpeq_epi8(gt, zero);
1981            let offset_masked = _mm_and_si128(mask, offset_v);
1982            let result = _mm_add_epi8(input, offset_masked);
1983            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1984            i += 16;
1985        }
1986
1987        while i < len {
1988            let b = *ptr.add(i);
1989            *ptr.add(i) = if b >= lo && b <= hi {
1990                b.wrapping_add(offset as u8)
1991            } else {
1992                b
1993            };
1994            i += 1;
1995        }
1996    }
1997}
1998
1999#[cfg(target_arch = "aarch64")]
2000fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2001    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
2002}
2003
2004#[cfg(target_arch = "aarch64")]
2005#[target_feature(enable = "neon")]
2006unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2007    use std::arch::aarch64::*;
2008
2009    unsafe {
2010        let len = data.len();
2011        let ptr = data.as_mut_ptr();
2012        let lo_v = vdupq_n_u8(lo);
2013        let hi_v = vdupq_n_u8(hi);
2014        let offset_v = vdupq_n_s8(offset);
2015        let mut i = 0;
2016
2017        while i + 32 <= len {
2018            let in0 = vld1q_u8(ptr.add(i));
2019            let in1 = vld1q_u8(ptr.add(i + 16));
2020            let ge0 = vcgeq_u8(in0, lo_v);
2021            let le0 = vcleq_u8(in0, hi_v);
2022            let mask0 = vandq_u8(ge0, le0);
2023            let ge1 = vcgeq_u8(in1, lo_v);
2024            let le1 = vcleq_u8(in1, hi_v);
2025            let mask1 = vandq_u8(ge1, le1);
2026            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2027            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2028            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2029            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2030            i += 32;
2031        }
2032
2033        if i + 16 <= len {
2034            let input = vld1q_u8(ptr.add(i));
2035            let ge = vcgeq_u8(input, lo_v);
2036            let le = vcleq_u8(input, hi_v);
2037            let mask = vandq_u8(ge, le);
2038            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2039            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2040            i += 16;
2041        }
2042
2043        while i < len {
2044            let b = *ptr.add(i);
2045            if b >= lo && b <= hi {
2046                *ptr.add(i) = b.wrapping_add(offset as u8);
2047            }
2048            i += 1;
2049        }
2050    }
2051}
2052
2053#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2054fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2055    let offset_u8 = offset as u8;
2056    let range = hi.wrapping_sub(lo);
2057    for b in data.iter_mut() {
2058        if b.wrapping_sub(lo) <= range {
2059            *b = b.wrapping_add(offset_u8);
2060        }
2061    }
2062}
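
// Sanity check for the in-place range translate: shifting `a-z` by -32 must
// behave like ASCII uppercasing, and bytes outside the range must not move.
// The 1000-byte input exercises the unrolled loops as well as the tails.
#[cfg(test)]
mod translate_range_inplace_tests {
    use super::translate_range_simd_inplace;

    #[test]
    fn uppercases_only_the_lowercase_range() {
        let mut data: Vec<u8> = (0u8..=255).cycle().take(1000).collect();
        let expected: Vec<u8> = data
            .iter()
            .map(|&b| if b.is_ascii_lowercase() { b - 32 } else { b })
            .collect();
        translate_range_simd_inplace(&mut data, b'a', b'z', -32);
        assert_eq!(data, expected);
    }
}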
2063
2064// ============================================================================
2065// SIMD range deletion (x86_64)
2066// ============================================================================
2067
2068/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
2069/// Returns Some((lo, hi)) if so. This is true for common classes:
2070/// - `[:digit:]` = 0x30..=0x39
2071/// - `a-z` = 0x61..=0x7A
2072/// - `A-Z` = 0x41..=0x5A
2073#[inline]
2074fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2075    if chars.is_empty() {
2076        return None;
2077    }
2078    // Track which byte values occur so duplicates and gaps are both detected.
2079    let mut seen = [false; 256];
2080    let mut lo = chars[0];
2081    let mut hi = chars[0];
2082    for &c in chars {
2083        seen[c as usize] = true;
2084        if c < lo {
2085            lo = c;
2086        }
2087        if c > hi {
2088            hi = c;
2089        }
2090    }
2091    // Every byte in [lo..=hi] must be present; duplicates plus a gap (e.g.
2092    // [0x30, 0x30, 0x32]) must not be reported as a contiguous range.
2093    (lo..=hi).all(|b| seen[b as usize]).then_some((lo, hi))
2094}
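
// Small checks for the range detection above: a contiguous class is recognized,
// a gapped set is rejected, and duplicates must not fake a contiguous range.
#[cfg(test)]
mod detect_delete_range_tests {
    use super::detect_delete_range;

    #[test]
    fn contiguous_gapped_and_duplicated_sets() {
        assert_eq!(detect_delete_range(b"0123456789"), Some((b'0', b'9')));
        assert_eq!(detect_delete_range(b"az"), None); // gap between 'a' and 'z'
        assert_eq!(detect_delete_range(&[0x30, 0x30, 0x32]), None); // duplicate + gap
        assert_eq!(detect_delete_range(&[]), None);
    }
}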
2095
2096/// SIMD-accelerated delete for contiguous byte ranges.
2097/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
2098/// then compacts output by skipping matched bytes.
2099#[cfg(target_arch = "x86_64")]
2100fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2101    if get_simd_level() >= 3 {
2102        unsafe { delete_range_avx2(src, dst, lo, hi) }
2103    } else {
2104        unsafe { delete_range_sse2(src, dst, lo, hi) }
2105    }
2106}
2107
2108#[cfg(target_arch = "x86_64")]
2109#[target_feature(enable = "avx2")]
2110unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2111    use std::arch::x86_64::*;
2112
2113    unsafe {
2114        let range = hi - lo;
2115        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2116        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2117        let zero = _mm256_setzero_si256();
2118
2119        let len = src.len();
2120        let sp = src.as_ptr();
2121        let dp = dst.as_mut_ptr();
2122        let mut ri = 0;
2123        let mut wp = 0;
2124
2125        while ri + 32 <= len {
2126            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2127            let biased = _mm256_add_epi8(input, bias_v);
2128            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
2129            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2130            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
2131            let in_range = _mm256_cmpeq_epi8(gt, zero);
2132            // keep_mask bits: 1 = keep (NOT in range)
2133            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2134
2135            if keep_mask == 0xFFFFFFFF {
2136                // All 32 bytes are kept — bulk copy
2137                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2138                wp += 32;
2139            } else if keep_mask != 0 {
2140                // Partial keep — per-lane processing with all-keep fast paths.
2141                // For 4% delete rate, ~72% of 8-byte lanes are all-keep even
2142                // within partial 32-byte blocks. The per-lane check avoids
2143                // the LUT compact overhead for these clean lanes.
2144                let m0 = keep_mask as u8;
2145                let m1 = (keep_mask >> 8) as u8;
2146                let m2 = (keep_mask >> 16) as u8;
2147                let m3 = (keep_mask >> 24) as u8;
2148
2149                if m0 == 0xFF {
2150                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2151                } else if m0 != 0 {
2152                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2153                }
2154                let c0 = m0.count_ones() as usize;
2155
2156                if m1 == 0xFF {
2157                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2158                } else if m1 != 0 {
2159                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2160                }
2161                let c1 = m1.count_ones() as usize;
2162
2163                if m2 == 0xFF {
2164                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2165                } else if m2 != 0 {
2166                    compact_8bytes(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2167                }
2168                let c2 = m2.count_ones() as usize;
2169
2170                if m3 == 0xFF {
2171                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2172                } else if m3 != 0 {
2173                    compact_8bytes(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2174                }
2175                let c3 = m3.count_ones() as usize;
2176                wp += c0 + c1 + c2 + c3;
2177            }
2178            // else: keep_mask == 0 means all bytes deleted, skip entirely
2179            ri += 32;
2180        }
2181
2182        // SSE2 tail for 16-byte remainder
2183        if ri + 16 <= len {
2184            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2185            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2186            let zero128 = _mm_setzero_si128();
2187
2188            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2189            let biased = _mm_add_epi8(input, bias_v128);
2190            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2191            let in_range = _mm_cmpeq_epi8(gt, zero128);
2192            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2193
2194            if keep_mask == 0xFFFF {
2195                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2196                wp += 16;
2197            } else if keep_mask != 0 {
2198                let m0 = keep_mask as u8;
2199                let m1 = (keep_mask >> 8) as u8;
2200                if m0 == 0xFF {
2201                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2202                } else if m0 != 0 {
2203                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2204                }
2205                let c0 = m0.count_ones() as usize;
2206                if m1 == 0xFF {
2207                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2208                } else if m1 != 0 {
2209                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2210                }
2211                wp += c0 + m1.count_ones() as usize;
2212            }
2213            ri += 16;
2214        }
2215
2216        // Scalar tail — branchless: always store, advance wp only for kept bytes
2217        while ri < len {
2218            let b = *sp.add(ri);
2219            *dp.add(wp) = b;
2220            wp += (b < lo || b > hi) as usize;
2221            ri += 1;
2222        }
2223
2224        wp
2225    }
2226}
2227
2228/// Compact 8 source bytes into contiguous output bytes using a keep mask.
2229/// Each bit in `mask` indicates whether the corresponding byte should be kept.
2230/// Uses a precomputed LUT: for each 8-bit mask, the LUT stores indices of set bits.
2231/// Always performs 8 unconditional stores (extra stores past popcount are harmless
2232/// since the write pointer only advances by popcount, and subsequent lanes overwrite).
2233/// This eliminates the serial tzcnt→blsr dependency chain (~28 cycles) in favor of
2234/// independent indexed loads and stores (~15 cycles).
2235#[cfg(target_arch = "x86_64")]
2236#[inline(always)]
2237unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2238    unsafe {
2239        let idx = COMPACT_LUT.get_unchecked(mask as usize);
2240        *dst = *src.add(*idx.get_unchecked(0) as usize);
2241        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2242        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2243        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2244        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2245        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2246        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2247        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2248    }
2249}
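
// Worked example of the LUT compaction: mask 0b1010_0101 keeps indices 0, 2, 5
// and 7, so the first four output bytes are exactly those inputs in order; the
// remaining unconditional stores are padding that the caller never counts.
#[cfg(all(test, target_arch = "x86_64"))]
mod compact_8bytes_tests {
    use super::compact_8bytes;

    #[test]
    fn keeps_exactly_the_masked_bytes_in_order() {
        let src = [10u8, 11, 12, 13, 14, 15, 16, 17];
        let mut dst = [0u8; 8];
        let mask: u8 = 0b1010_0101; // keep indices 0, 2, 5, 7
        unsafe { compact_8bytes(src.as_ptr(), dst.as_mut_ptr(), mask) };
        let kept = mask.count_ones() as usize;
        assert_eq!(&dst[..kept], &[10u8, 12, 15, 17]);
    }
}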
2250
2251#[cfg(target_arch = "x86_64")]
2252#[target_feature(enable = "sse2")]
2253unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2254    use std::arch::x86_64::*;
2255
2256    unsafe {
2257        let range = hi - lo;
2258        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2259        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2260        let zero = _mm_setzero_si128();
2261
2262        let len = src.len();
2263        let sp = src.as_ptr();
2264        let dp = dst.as_mut_ptr();
2265        let mut ri = 0;
2266        let mut wp = 0;
2267
2268        while ri + 16 <= len {
2269            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2270            let biased = _mm_add_epi8(input, bias_v);
2271            let gt = _mm_cmpgt_epi8(biased, threshold_v);
2272            let in_range = _mm_cmpeq_epi8(gt, zero);
2273            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2274
2275            if keep_mask == 0xFFFF {
2276                // All 16 bytes kept — bulk copy
2277                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2278                wp += 16;
2279            } else if keep_mask != 0 {
2280                let m0 = keep_mask as u8;
2281                let m1 = (keep_mask >> 8) as u8;
2282                if m0 == 0xFF {
2283                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2284                } else if m0 != 0 {
2285                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2286                }
2287                let c0 = m0.count_ones() as usize;
2288                if m1 == 0xFF {
2289                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2290                } else if m1 != 0 {
2291                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2292                }
2293                wp += c0 + m1.count_ones() as usize;
2294            }
2295            ri += 16;
2296        }
2297
2298        // Scalar tail — branchless
2299        while ri < len {
2300            let b = *sp.add(ri);
2301            *dp.add(wp) = b;
2302            wp += (b < lo || b > hi) as usize;
2303            ri += 1;
2304        }
2305
2306        wp
2307    }
2308}
2309
2310/// Branchless range delete fallback for non-x86_64 (ARM64, etc.).
2311/// Unconditional store + conditional pointer advance eliminates branch
2312/// mispredictions. Unrolled 8x for better ILP on out-of-order cores.
2313#[cfg(not(target_arch = "x86_64"))]
2314fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2315    let len = src.len();
2316    let sp = src.as_ptr();
2317    let dp = dst.as_mut_ptr();
2318    let mut wp: usize = 0;
2319    let mut i: usize = 0;
2320
2321    // Unrolled branchless loop — 8 bytes per iteration
2322    while i + 8 <= len {
2323        unsafe {
2324            let b0 = *sp.add(i);
2325            *dp.add(wp) = b0;
2326            wp += (b0 < lo || b0 > hi) as usize;
2327            let b1 = *sp.add(i + 1);
2328            *dp.add(wp) = b1;
2329            wp += (b1 < lo || b1 > hi) as usize;
2330            let b2 = *sp.add(i + 2);
2331            *dp.add(wp) = b2;
2332            wp += (b2 < lo || b2 > hi) as usize;
2333            let b3 = *sp.add(i + 3);
2334            *dp.add(wp) = b3;
2335            wp += (b3 < lo || b3 > hi) as usize;
2336            let b4 = *sp.add(i + 4);
2337            *dp.add(wp) = b4;
2338            wp += (b4 < lo || b4 > hi) as usize;
2339            let b5 = *sp.add(i + 5);
2340            *dp.add(wp) = b5;
2341            wp += (b5 < lo || b5 > hi) as usize;
2342            let b6 = *sp.add(i + 6);
2343            *dp.add(wp) = b6;
2344            wp += (b6 < lo || b6 > hi) as usize;
2345            let b7 = *sp.add(i + 7);
2346            *dp.add(wp) = b7;
2347            wp += (b7 < lo || b7 > hi) as usize;
2348        }
2349        i += 8;
2350    }
2351
2352    // Scalar tail
2353    while i < len {
2354        unsafe {
2355            let b = *sp.add(i);
2356            *dp.add(wp) = b;
2357            wp += (b < lo || b > hi) as usize;
2358        }
2359        i += 1;
2360    }
2361
2362    wp
2363}
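
// Quick check that the range delete keeps byte order and drops exactly [lo, hi]:
// deleting the digits from a mixed buffer must leave the other bytes untouched,
// on both the SIMD (x86_64) and the branchless fallback implementation.
#[cfg(test)]
mod delete_range_chunk_tests {
    use super::delete_range_chunk;

    #[test]
    fn deletes_digits_and_preserves_order() {
        let src: Vec<u8> = b"a0b1c2d3e4f5g6h7i8j9".repeat(8); // 160 bytes, mixed content
        let mut dst = vec![0u8; src.len()];
        let n = delete_range_chunk(&src, &mut dst, b'0', b'9');
        let expected: Vec<u8> = src
            .iter()
            .copied()
            .filter(|&b| !(b'0'..=b'9').contains(&b))
            .collect();
        assert_eq!(&dst[..n], &expected[..]);
    }
}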
2364
2365/// Streaming delete for contiguous byte ranges using SIMD range detection.
2366/// Uses the STREAM_BUF-sized (32MB) buffer to reduce syscalls (delete is compute-light, I/O bound).
2367/// When no bytes are deleted from a chunk (common for data with few matches),
2368/// writes directly from the source buffer to avoid the copy overhead.
2369fn delete_range_streaming(
2370    lo: u8,
2371    hi: u8,
2372    reader: &mut impl Read,
2373    writer: &mut impl Write,
2374) -> io::Result<()> {
2375    // Single-buffer in-place delete: eliminates the second STREAM_BUF-sized dst
2376    // allocation and its page-fault cost. For 10MB piped input, saves ~1.2ms.
2377    let mut buf = alloc_uninit_vec(STREAM_BUF);
2378    loop {
2379        let n = read_once(reader, &mut buf)?;
2380        if n == 0 {
2381            break;
2382        }
2383        let wp = delete_range_inplace(&mut buf, n, lo, hi);
2384        if wp > 0 {
2385            writer.write_all(&buf[..wp])?;
2386        }
2387    }
2388    Ok(())
2389}
2390
2391/// In-place range delete: SIMD scan for all-keep blocks + branchless scalar compaction.
2392/// Uses a single buffer — reads at position ri, writes at position wp (wp <= ri always).
2393#[inline]
2394fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2395    #[cfg(target_arch = "x86_64")]
2396    {
2397        let level = get_simd_level();
2398        if level >= 3 {
2399            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2400        }
2401    }
2402    // Scalar fallback: branchless in-place delete
2403    let ptr = buf.as_mut_ptr();
2404    let mut ri = 0;
2405    let mut wp = 0;
2406    unsafe {
2407        while ri + 8 <= n {
2408            let b0 = *ptr.add(ri);
2409            let b1 = *ptr.add(ri + 1);
2410            let b2 = *ptr.add(ri + 2);
2411            let b3 = *ptr.add(ri + 3);
2412            let b4 = *ptr.add(ri + 4);
2413            let b5 = *ptr.add(ri + 5);
2414            let b6 = *ptr.add(ri + 6);
2415            let b7 = *ptr.add(ri + 7);
2416            *ptr.add(wp) = b0;
2417            wp += (b0 < lo || b0 > hi) as usize;
2418            *ptr.add(wp) = b1;
2419            wp += (b1 < lo || b1 > hi) as usize;
2420            *ptr.add(wp) = b2;
2421            wp += (b2 < lo || b2 > hi) as usize;
2422            *ptr.add(wp) = b3;
2423            wp += (b3 < lo || b3 > hi) as usize;
2424            *ptr.add(wp) = b4;
2425            wp += (b4 < lo || b4 > hi) as usize;
2426            *ptr.add(wp) = b5;
2427            wp += (b5 < lo || b5 > hi) as usize;
2428            *ptr.add(wp) = b6;
2429            wp += (b6 < lo || b6 > hi) as usize;
2430            *ptr.add(wp) = b7;
2431            wp += (b7 < lo || b7 > hi) as usize;
2432            ri += 8;
2433        }
2434        while ri < n {
2435            let b = *ptr.add(ri);
2436            *ptr.add(wp) = b;
2437            wp += (b < lo || b > hi) as usize;
2438            ri += 1;
2439        }
2440    }
2441    wp
2442}
2443
2444/// AVX2 in-place range delete: scan 32 bytes at a time, skip all-keep blocks,
2445/// branchless scalar compaction for mixed blocks.
2446#[cfg(target_arch = "x86_64")]
2447#[target_feature(enable = "avx2")]
2448unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2449    use std::arch::x86_64::*;
2450
2451    unsafe {
2452        let range = hi - lo;
2453        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2454        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2455        let zero = _mm256_setzero_si256();
2456
2457        let ptr = buf.as_mut_ptr();
2458        let mut ri = 0;
2459        let mut wp = 0;
2460
2461        while ri + 32 <= n {
2462            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2463            let biased = _mm256_add_epi8(input, bias_v);
2464            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2465            let in_range = _mm256_cmpeq_epi8(gt, zero);
2466            let del_mask = _mm256_movemask_epi8(in_range) as u32;
2467
2468            if del_mask == 0 {
2469                // All 32 bytes kept
2470                if wp != ri {
2471                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2472                }
2473                wp += 32;
2474            } else if del_mask != 0xFFFFFFFF {
2475                // Mixed block: pshufb-based 8-byte compaction.
2476                // Process 4 × 8-byte sub-chunks using COMPACT_LUT + pshufb.
2477                // Each sub-chunk: load 8 bytes into register (safe for overlap),
2478                // shuffle kept bytes to front, store. 4 SIMD ops vs 32 scalar.
2479                let keep_mask = !del_mask;
2480                let m0 = keep_mask as u8;
2481                let m1 = (keep_mask >> 8) as u8;
2482                let m2 = (keep_mask >> 16) as u8;
2483                let m3 = (keep_mask >> 24) as u8;
2484
2485                let c0 = m0.count_ones() as usize;
2486                let c1 = m1.count_ones() as usize;
2487                let c2 = m2.count_ones() as usize;
2488                let c3 = m3.count_ones() as usize;
2489
2490                // Sub-chunk 0: bytes 0-7
2491                if m0 == 0xFF {
2492                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2493                } else if m0 != 0 {
2494                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2495                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2496                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2497                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2498                }
2499
2500                // Sub-chunk 1: bytes 8-15
2501                if m1 == 0xFF {
2502                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2503                } else if m1 != 0 {
2504                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2505                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2506                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2507                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2508                }
2509
2510                // Sub-chunk 2: bytes 16-23
2511                if m2 == 0xFF {
2512                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2513                } else if m2 != 0 {
2514                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2515                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2516                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2517                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2518                }
2519
2520                // Sub-chunk 3: bytes 24-31
2521                if m3 == 0xFF {
2522                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2523                } else if m3 != 0 {
2524                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2525                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2526                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2527                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2528                }
2529
2530                wp += c0 + c1 + c2 + c3;
2531            }
2532            // del_mask == 0xFFFFFFFF: all deleted, skip entirely
2533            ri += 32;
2534        }
2535
2536        // Scalar tail
2537        while ri < n {
2538            let b = *ptr.add(ri);
2539            *ptr.add(wp) = b;
2540            wp += (b < lo || b > hi) as usize;
2541            ri += 1;
2542        }
2543
2544        wp
2545    }
2546}
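
// Check of the single-buffer in-place delete: because wp <= ri always holds, the
// kept bytes are compacted toward the front of the same buffer, so the prefix of
// length `wp` must equal a plain filtered copy of the original data.
#[cfg(test)]
mod delete_range_inplace_tests {
    use super::delete_range_inplace;

    #[test]
    fn compacts_kept_bytes_to_the_front() {
        let mut buf: Vec<u8> = b"abc123def456ghi789".repeat(37); // 666 bytes
        let expected: Vec<u8> = buf.iter().copied().filter(|b| !b.is_ascii_digit()).collect();
        let n = buf.len();
        let wp = delete_range_inplace(&mut buf, n, b'0', b'9');
        assert_eq!(&buf[..wp], &expected[..]);
    }
}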
2547
2548// ============================================================================
2549// Streaming functions (Read + Write)
2550// ============================================================================
2551
2552pub fn translate(
2553    set1: &[u8],
2554    set2: &[u8],
2555    reader: &mut impl Read,
2556    writer: &mut impl Write,
2557) -> io::Result<()> {
2558    let table = build_translate_table(set1, set2);
2559
2560    // Check for identity table — pure passthrough (no transformation needed)
2561    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2562    if is_identity {
2563        return passthrough_stream(reader, writer);
2564    }
2565
2566    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
2567    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2568        return translate_range_stream(lo, hi, offset, reader, writer);
2569    }
2570
2571    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
2572    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
2573    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2574        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2575    }
2576
2577    // General case: IN-PLACE translation on a SINGLE STREAM_BUF-sized (32MB) buffer.
2578    // This cuts memory traffic vs the old separate src/dst approach:
2579    // - Old: read into src, translate src→dst (one load from src + one store to dst per byte), write dst.
2580    // - New: read into buf, translate in place (load and store hit the same cache line), write buf.
2581    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
2582    // because consecutive reads are 8 bytes apart (sequential), not aliased.
2583    // The large buffer also keeps the read/write syscall count minimal.
2584    // SAFETY: all bytes are written by read_once before being translated.
2585    let mut buf = alloc_uninit_vec(STREAM_BUF);
2586    loop {
2587        let n = read_once(reader, &mut buf)?;
2588        if n == 0 {
2589            break;
2590        }
2591        translate_inplace(&mut buf[..n], &table);
2592        writer.write_all(&buf[..n])?;
2593    }
2594    Ok(())
2595}
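
// End-to-end example of the streaming translate: mapping `a-z` to `A-Z` over an
// in-memory reader/writer pair, which resolves to the constant-offset range path.
#[cfg(test)]
mod translate_stream_tests {
    use super::translate;

    #[test]
    fn uppercases_through_the_stream_api() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input: &[u8] = b"hello, World! 123";
        let mut output = Vec::new();
        translate(&set1, &set2, &mut input, &mut output).unwrap();
        assert_eq!(output, b"HELLO, WORLD! 123".to_vec());
    }
}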
2596
2597/// Streaming SIMD range translation — single buffer, in-place transform.
2598/// Uses a STREAM_BUF-sized (32MB) uninit buffer for fewer syscalls (translate is compute-light).
2599fn translate_range_stream(
2600    lo: u8,
2601    hi: u8,
2602    offset: i8,
2603    reader: &mut impl Read,
2604    writer: &mut impl Write,
2605) -> io::Result<()> {
2606    let mut buf = alloc_uninit_vec(STREAM_BUF);
2607    loop {
2608        let n = read_once(reader, &mut buf)?;
2609        if n == 0 {
2610            break;
2611        }
2612        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2613        writer.write_all(&buf[..n])?;
2614    }
2615    Ok(())
2616}
2617
2618/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
2619/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
2620fn translate_range_to_constant_stream(
2621    lo: u8,
2622    hi: u8,
2623    replacement: u8,
2624    reader: &mut impl Read,
2625    writer: &mut impl Write,
2626) -> io::Result<()> {
2627    let mut buf = alloc_uninit_vec(STREAM_BUF);
2628    loop {
2629        let n = read_once(reader, &mut buf)?;
2630        if n == 0 {
2631            break;
2632        }
2633        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2634        writer.write_all(&buf[..n])?;
2635    }
2636    Ok(())
2637}
2638
2639/// Pure passthrough: copy the input stream to the output without transformation.
2640/// Uses a single STREAM_BUF-sized (32MB) uninit buffer with direct read/write, no processing overhead.
2641fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2642    let mut buf = alloc_uninit_vec(STREAM_BUF);
2643    loop {
2644        let n = read_once(reader, &mut buf)?;
2645        if n == 0 {
2646            break;
2647        }
2648        writer.write_all(&buf[..n])?;
2649    }
2650    Ok(())
2651}
2652
2653/// Single-read for pipelining: process data immediately after first read()
2654/// instead of blocking to fill the entire buffer. This enables cat|ftr
2655/// pipelining: while ftr processes the first chunk, cat continues writing
2656/// to the pipe. For 10MB piped input with 8MB pipe buffer, this saves
2657/// ~0.5-1ms by overlapping cat's final writes with ftr's processing.
2658#[inline]
2659fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2660    loop {
2661        match reader.read(buf) {
2662            Ok(n) => return Ok(n),
2663            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2664            Err(e) => return Err(e),
2665        }
2666    }
2667}
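
// Behavioral check for read_once: it returns after the first successful read()
// instead of filling the buffer, which is what lets the caller start processing
// while the upstream writer is still producing. The toy reader below serves data
// in fixed-size slices, roughly the way a pipe would.
#[cfg(test)]
mod read_once_tests {
    use super::read_once;
    use std::io::Read;

    struct Chunked<'a> {
        data: &'a [u8],
        chunk: usize,
    }

    impl Read for Chunked<'_> {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            let n = self.data.len().min(self.chunk).min(buf.len());
            buf[..n].copy_from_slice(&self.data[..n]);
            self.data = &self.data[n..];
            Ok(n)
        }
    }

    #[test]
    fn returns_after_first_chunk() {
        let mut r = Chunked { data: &[7u8; 100], chunk: 16 };
        let mut buf = [0u8; 64];
        // 64 bytes were requested and 100 are available, but only the first
        // 16-byte chunk comes back; the rest stays queued upstream.
        assert_eq!(read_once(&mut r, &mut buf).unwrap(), 16);
    }
}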
2668
2669pub fn translate_squeeze(
2670    set1: &[u8],
2671    set2: &[u8],
2672    reader: &mut impl Read,
2673    writer: &mut impl Write,
2674) -> io::Result<()> {
2675    let table = build_translate_table(set1, set2);
2676    let squeeze_set = build_member_set(set2);
2677
2678    // When the squeeze set collapses to a single distinct character, use the fused
2679    // approach: translate (SIMD where the table allows), then memmem to find squeeze runs.
2680    if !set2.is_empty() && set2.iter().all(|&b| b == set2[0]) {
2681        let squeeze_ch = set2.last().copied().unwrap_or(0);
2682        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2683    }
2684
2685    // Two-pass optimization for range translations:
2686    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
2687    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
2688    let range_info = detect_range_offset(&table);
2689    let range_const_info = if range_info.is_none() {
2690        detect_range_to_constant(&table)
2691    } else {
2692        None
2693    };
2694
2695    let mut buf = alloc_uninit_vec(STREAM_BUF);
2696    let mut last_squeezed: u16 = 256;
2697
2698    loop {
2699        let n = read_once(reader, &mut buf)?;
2700        if n == 0 {
2701            break;
2702        }
2703        // Pass 1: translate
2704        if let Some((lo, hi, offset)) = range_info {
2705            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2706        } else if let Some((lo, hi, replacement)) = range_const_info {
2707            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2708        } else {
2709            translate_inplace(&mut buf[..n], &table);
2710        }
2711        // Pass 2: squeeze in-place using 8x-unrolled loop
2712        let mut wp = 0;
2713        unsafe {
2714            let ptr = buf.as_mut_ptr();
2715            let mut i = 0;
2716            while i + 8 <= n {
2717                macro_rules! squeeze_byte {
2718                    ($off:expr) => {
2719                        let b = *ptr.add(i + $off);
2720                        if is_member(&squeeze_set, b) {
2721                            if last_squeezed != b as u16 {
2722                                last_squeezed = b as u16;
2723                                *ptr.add(wp) = b;
2724                                wp += 1;
2725                            }
2726                        } else {
2727                            last_squeezed = 256;
2728                            *ptr.add(wp) = b;
2729                            wp += 1;
2730                        }
2731                    };
2732                }
2733                squeeze_byte!(0);
2734                squeeze_byte!(1);
2735                squeeze_byte!(2);
2736                squeeze_byte!(3);
2737                squeeze_byte!(4);
2738                squeeze_byte!(5);
2739                squeeze_byte!(6);
2740                squeeze_byte!(7);
2741                i += 8;
2742            }
2743            while i < n {
2744                let b = *ptr.add(i);
2745                if is_member(&squeeze_set, b) {
2746                    if last_squeezed == b as u16 {
2747                        i += 1;
2748                        continue;
2749                    }
2750                    last_squeezed = b as u16;
2751                } else {
2752                    last_squeezed = 256;
2753                }
2754                *ptr.add(wp) = b;
2755                wp += 1;
2756                i += 1;
2757            }
2758        }
2759        writer.write_all(&buf[..wp])?;
2760    }
2761    Ok(())
2762}
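
// End-to-end example of translate+squeeze on the two-pass path: lowercase is
// uppercased via the SIMD range translate, then repeats of bytes in the
// (translated) squeeze set are collapsed, so "hello" becomes "HELO" while the
// run of spaces, which is not in the squeeze set, is left alone.
#[cfg(test)]
mod translate_squeeze_tests {
    use super::translate_squeeze;

    #[test]
    fn uppercases_then_squeezes_repeats() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input: &[u8] = b"hello   world";
        let mut output = Vec::new();
        translate_squeeze(&set1, &set2, &mut input, &mut output).unwrap();
        assert_eq!(output, b"HELO   WORLD".to_vec());
    }
}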
2763
2764/// Optimized translate+squeeze for single squeeze character.
2765/// After SIMD translation, uses memmem to find consecutive pairs
2766/// and compacts in-place with a single write_all per chunk.
2767fn translate_squeeze_single_ch(
2768    table: &[u8; 256],
2769    squeeze_ch: u8,
2770    _squeeze_set: &[u8; 32],
2771    reader: &mut impl Read,
2772    writer: &mut impl Write,
2773) -> io::Result<()> {
2774    let range_info = detect_range_offset(table);
2775    let range_const_info = if range_info.is_none() {
2776        detect_range_to_constant(table)
2777    } else {
2778        None
2779    };
2780
2781    let pair = [squeeze_ch, squeeze_ch];
2782    let finder = memchr::memmem::Finder::new(&pair);
2783    let mut buf = alloc_uninit_vec(STREAM_BUF);
2784    let mut was_squeeze_char = false;
2785
2786    loop {
2787        let n = read_once(reader, &mut buf)?;
2788        if n == 0 {
2789            break;
2790        }
2791        // Pass 1: SIMD translate in-place
2792        if let Some((lo, hi, offset)) = range_info {
2793            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2794        } else if let Some((lo, hi, replacement)) = range_const_info {
2795            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2796        } else {
2797            translate_inplace(&mut buf[..n], table);
2798        }
2799
2800        // Pass 2: in-place squeeze compaction
2801        let mut i = 0;
2802
2803        // Handle carry-over from previous chunk
2804        if was_squeeze_char {
2805            while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2806                i += 1;
2807            }
2808            if i >= n {
2809                continue;
2810            }
2811        }
2812
2813        let ptr = buf.as_mut_ptr();
2814        let mut wp = 0usize;
2815
2816        loop {
2817            match finder.find(&buf[i..n]) {
2818                Some(offset) => {
2819                    let seg_end = i + offset + 1;
2820                    let gap = seg_end - i;
2821                    if gap > 0 {
2822                        if wp != i {
2823                            unsafe {
2824                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2825                            }
2826                        }
2827                        wp += gap;
2828                    }
2829                    i = seg_end;
2830                    while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2831                        i += 1;
2832                    }
2833                    if i >= n {
2834                        was_squeeze_char = true;
2835                        break;
2836                    }
2837                }
2838                None => {
2839                    let rem = n - i;
2840                    if rem > 0 {
2841                        if wp != i {
2842                            unsafe {
2843                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2844                            }
2845                        }
2846                        wp += rem;
2847                    }
2848                    was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2849                    break;
2850                }
2851            }
2852        }
2853
2854        if wp > 0 {
2855            writer.write_all(&buf[..wp])?;
2856        }
2857    }
2858    Ok(())
2859}
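
// Example that lands on the single-character fast path above: every byte of the
// translated set collapses to 'x', so the squeeze set has one distinct character
// and the memmem-based compaction applies. "aabbcc" translates to "xxxxxx" and
// squeezes down to a single "x".
#[cfg(test)]
mod translate_squeeze_single_char_tests {
    use super::translate_squeeze;

    #[test]
    fn squeezes_runs_of_the_single_translated_char() {
        let mut input: &[u8] = b"aabbcc done";
        let mut output = Vec::new();
        translate_squeeze(b"abc", b"xxx", &mut input, &mut output).unwrap();
        assert_eq!(output, b"x done".to_vec());
    }
}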
2860
2861pub fn delete(
2862    delete_chars: &[u8],
2863    reader: &mut impl Read,
2864    writer: &mut impl Write,
2865) -> io::Result<()> {
2866    if delete_chars.len() == 1 {
2867        return delete_single_streaming(delete_chars[0], reader, writer);
2868    }
2869    if delete_chars.len() <= 3 {
2870        return delete_multi_streaming(delete_chars, reader, writer);
2871    }
2872
2873    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
2874    // use vectorized range comparison instead of scalar bitset lookup.
2875    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
2876    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2877        return delete_range_streaming(lo, hi, reader, writer);
2878    }
2879
2880    let member = build_member_set(delete_chars);
2881    let mut buf = alloc_uninit_vec(STREAM_BUF);
2882
2883    loop {
2884        let n = read_once(reader, &mut buf)?;
2885        if n == 0 {
2886            break;
2887        }
2888        let mut wp = 0;
2889        unsafe {
2890            let ptr = buf.as_mut_ptr();
2891            let mut i = 0;
2892            while i + 8 <= n {
2893                let b0 = *ptr.add(i);
2894                let b1 = *ptr.add(i + 1);
2895                let b2 = *ptr.add(i + 2);
2896                let b3 = *ptr.add(i + 3);
2897                let b4 = *ptr.add(i + 4);
2898                let b5 = *ptr.add(i + 5);
2899                let b6 = *ptr.add(i + 6);
2900                let b7 = *ptr.add(i + 7);
2901
2902                // Branchless: write byte then conditionally advance pointer.
2903                // Avoids branch mispredictions when most bytes are kept.
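                // Illustrative trace (hypothetical bytes): if b0 is kept, it is stored
                // at wp and wp advances by 1; if b0 is a member of the delete set, the
                // store still happens but wp stays put, so the next kept byte simply
                // overwrites it. No branch is taken on the keep/delete decision itself.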
2904                *ptr.add(wp) = b0;
2905                wp += !is_member(&member, b0) as usize;
2906                *ptr.add(wp) = b1;
2907                wp += !is_member(&member, b1) as usize;
2908                *ptr.add(wp) = b2;
2909                wp += !is_member(&member, b2) as usize;
2910                *ptr.add(wp) = b3;
2911                wp += !is_member(&member, b3) as usize;
2912                *ptr.add(wp) = b4;
2913                wp += !is_member(&member, b4) as usize;
2914                *ptr.add(wp) = b5;
2915                wp += !is_member(&member, b5) as usize;
2916                *ptr.add(wp) = b6;
2917                wp += !is_member(&member, b6) as usize;
2918                *ptr.add(wp) = b7;
2919                wp += !is_member(&member, b7) as usize;
2920                i += 8;
2921            }
2922            while i < n {
2923                let b = *ptr.add(i);
2924                *ptr.add(wp) = b;
2925                wp += !is_member(&member, b) as usize;
2926                i += 1;
2927            }
2928        }
2929        writer.write_all(&buf[..wp])?;
2930    }
2931    Ok(())
2932}
2933
2934fn delete_single_streaming(
2935    ch: u8,
2936    reader: &mut impl Read,
2937    writer: &mut impl Write,
2938) -> io::Result<()> {
2939    // Single-buffer in-place delete: memchr finds delete positions,
2940    // gap-copy backward in the same buffer, avoiding a second STREAM_BUF-sized dst allocation.
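    // Behavior sketch (hypothetical input): deleting 'x' from "axxbxc" finds hits at
    // indices 1, 2 and 4; the kept gaps "a", "b", "c" are compacted to the front of
    // the buffer and written out as "abc".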
2941    let mut buf = alloc_uninit_vec(STREAM_BUF);
2942    loop {
2943        let n = read_once(reader, &mut buf)?;
2944        if n == 0 {
2945            break;
2946        }
2947        let mut wp = 0;
2948        let mut i = 0;
2949        while i < n {
2950            match memchr::memchr(ch, &buf[i..n]) {
2951                Some(offset) => {
2952                    if offset > 0 {
2953                        if wp != i {
2954                            unsafe {
2955                                std::ptr::copy(
2956                                    buf.as_ptr().add(i),
2957                                    buf.as_mut_ptr().add(wp),
2958                                    offset,
2959                                );
2960                            }
2961                        }
2962                        wp += offset;
2963                    }
2964                    i += offset + 1;
2965                }
2966                None => {
2967                    let run_len = n - i;
2968                    if run_len > 0 {
2969                        if wp != i {
2970                            unsafe {
2971                                std::ptr::copy(
2972                                    buf.as_ptr().add(i),
2973                                    buf.as_mut_ptr().add(wp),
2974                                    run_len,
2975                                );
2976                            }
2977                        }
2978                        wp += run_len;
2979                    }
2980                    break;
2981                }
2982            }
2983        }
2984        if wp > 0 {
2985            writer.write_all(&buf[..wp])?;
2986        }
2987    }
2988    Ok(())
2989}
2990
2991fn delete_multi_streaming(
2992    chars: &[u8],
2993    reader: &mut impl Read,
2994    writer: &mut impl Write,
2995) -> io::Result<()> {
2996    // Single-buffer in-place delete: memchr2/memchr3 finds delete positions,
2997    // gap-copy backward in the same buffer, avoiding a second STREAM_BUF-sized dst allocation.
2998    let mut buf = alloc_uninit_vec(STREAM_BUF);
2999    loop {
3000        let n = read_once(reader, &mut buf)?;
3001        if n == 0 {
3002            break;
3003        }
3004        let mut wp = 0;
3005        let mut i = 0;
3006        while i < n {
3007            let found = if chars.len() == 2 {
3008                memchr::memchr2(chars[0], chars[1], &buf[i..n])
3009            } else {
3010                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3011            };
3012            match found {
3013                Some(offset) => {
3014                    if offset > 0 {
3015                        if wp != i {
3016                            unsafe {
3017                                std::ptr::copy(
3018                                    buf.as_ptr().add(i),
3019                                    buf.as_mut_ptr().add(wp),
3020                                    offset,
3021                                );
3022                            }
3023                        }
3024                        wp += offset;
3025                    }
3026                    i += offset + 1;
3027                }
3028                None => {
3029                    let run_len = n - i;
3030                    if run_len > 0 {
3031                        if wp != i {
3032                            unsafe {
3033                                std::ptr::copy(
3034                                    buf.as_ptr().add(i),
3035                                    buf.as_mut_ptr().add(wp),
3036                                    run_len,
3037                                );
3038                            }
3039                        }
3040                        wp += run_len;
3041                    }
3042                    break;
3043                }
3044            }
3045        }
3046        if wp > 0 {
3047            writer.write_all(&buf[..wp])?;
3048        }
3049    }
3050    Ok(())
3051}
3052
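/// Delete `delete_chars`, then squeeze consecutive runs of `squeeze_chars`,
/// in a single fused pass over each chunk.
///
/// A behavior sketch (byte sets assumed pre-expanded; `ignore` because module paths
/// are omitted):
/// ```ignore
/// let mut input = std::io::Cursor::new(b"a--a  b".to_vec());
/// let mut out: Vec<u8> = Vec::new();
/// delete_squeeze(b"-", b" ", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"aa b");
/// ```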
3053pub fn delete_squeeze(
3054    delete_chars: &[u8],
3055    squeeze_chars: &[u8],
3056    reader: &mut impl Read,
3057    writer: &mut impl Write,
3058) -> io::Result<()> {
3059    let delete_set = build_member_set(delete_chars);
3060    let squeeze_set = build_member_set(squeeze_chars);
3061    let mut buf = alloc_uninit_vec(STREAM_BUF);
3062    let mut last_squeezed: u16 = 256;
3063
3064    loop {
3065        let n = read_once(reader, &mut buf)?;
3066        if n == 0 {
3067            break;
3068        }
3069        // Fused delete+squeeze: 8x-unrolled inner loop for better ILP.
3070        // Each byte is checked against delete set first (skip if member),
3071        // then squeeze set (deduplicate consecutive members).
3072        let mut wp = 0;
3073        unsafe {
3074            let ptr = buf.as_mut_ptr();
3075            let mut i = 0;
3076            while i + 8 <= n {
3077                macro_rules! process_byte {
3078                    ($off:expr) => {
3079                        let b = *ptr.add(i + $off);
3080                        if !is_member(&delete_set, b) {
3081                            if is_member(&squeeze_set, b) {
3082                                if last_squeezed != b as u16 {
3083                                    last_squeezed = b as u16;
3084                                    *ptr.add(wp) = b;
3085                                    wp += 1;
3086                                }
3087                            } else {
3088                                last_squeezed = 256;
3089                                *ptr.add(wp) = b;
3090                                wp += 1;
3091                            }
3092                        }
3093                    };
3094                }
3095                process_byte!(0);
3096                process_byte!(1);
3097                process_byte!(2);
3098                process_byte!(3);
3099                process_byte!(4);
3100                process_byte!(5);
3101                process_byte!(6);
3102                process_byte!(7);
3103                i += 8;
3104            }
3105            while i < n {
3106                let b = *ptr.add(i);
3107                if !is_member(&delete_set, b) {
3108                    if is_member(&squeeze_set, b) {
3109                        if last_squeezed != b as u16 {
3110                            last_squeezed = b as u16;
3111                            *ptr.add(wp) = b;
3112                            wp += 1;
3113                        }
3114                    } else {
3115                        last_squeezed = 256;
3116                        *ptr.add(wp) = b;
3117                        wp += 1;
3118                    }
3119                }
3120                i += 1;
3121            }
3122        }
3123        writer.write_all(&buf[..wp])?;
3124    }
3125    Ok(())
3126}
3127
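/// Squeeze (deduplicate) consecutive runs of each byte in `squeeze_chars`.
///
/// A minimal usage sketch (assumes the byte set is already expanded; `ignore` because
/// module paths are omitted):
/// ```ignore
/// let mut input = std::io::Cursor::new(b"aabbccdd".to_vec());
/// let mut out: Vec<u8> = Vec::new();
/// squeeze(b"abc", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"abcdd"); // 'd' is not in the set, so "dd" is kept
/// ```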
3128pub fn squeeze(
3129    squeeze_chars: &[u8],
3130    reader: &mut impl Read,
3131    writer: &mut impl Write,
3132) -> io::Result<()> {
3133    if squeeze_chars.len() == 1 {
3134        return squeeze_single_stream(squeeze_chars[0], reader, writer);
3135    }
3136
3137    // For 2-3 squeeze chars, use memchr2/memchr3-based gap-copy
3138    // which gives SIMD-accelerated scanning instead of byte-at-a-time.
3139    if squeeze_chars.len() <= 3 {
3140        return squeeze_multi_stream(squeeze_chars, reader, writer);
3141    }
3142
3143    let member = build_member_set(squeeze_chars);
3144    let mut buf = alloc_uninit_vec(STREAM_BUF);
3145    let mut last_squeezed: u16 = 256;
3146
3147    loop {
3148        let n = read_once(reader, &mut buf)?;
3149        if n == 0 {
3150            break;
3151        }
3152        let mut wp = 0;
3153        unsafe {
3154            let ptr = buf.as_mut_ptr();
3155            for i in 0..n {
3156                let b = *ptr.add(i);
3157                if is_member(&member, b) {
3158                    if last_squeezed == b as u16 {
3159                        continue;
3160                    }
3161                    last_squeezed = b as u16;
3162                } else {
3163                    last_squeezed = 256;
3164                }
3165                *ptr.add(wp) = b;
3166                wp += 1;
3167            }
3168        }
3169        writer.write_all(&buf[..wp])?;
3170    }
3171    Ok(())
3172}
3173
3174/// Streaming squeeze for 2-3 chars using memchr2/memchr3 SIMD scanning.
3175/// Compacts the read buffer in place: copies the gap before each squeeze point,
3176/// emits one byte per run of a squeezable character, and issues one write_all per chunk.
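/// A behavior sketch for this 2-3 char path (hypothetical bytes; `ignore` because the
/// function is private):
/// ```ignore
/// let mut input = std::io::Cursor::new(b"aa--__bb".to_vec());
/// let mut out: Vec<u8> = Vec::new();
/// squeeze_multi_stream(b"-_", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"aa-_bb");
/// ```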
3177fn squeeze_multi_stream(
3178    chars: &[u8],
3179    reader: &mut impl Read,
3180    writer: &mut impl Write,
3181) -> io::Result<()> {
3182    let c0 = chars[0];
3183    let c1 = chars[1];
3184    let c2 = if chars.len() >= 3 {
3185        Some(chars[2])
3186    } else {
3187        None
3188    };
3191
3192    let mut buf = alloc_uninit_vec(STREAM_BUF);
3193    let mut last_squeezed: u16 = 256;
3194
3195    loop {
3196        let n = read_once(reader, &mut buf)?;
3197        if n == 0 {
3198            break;
3199        }
3200
3201        // In-place compaction using memchr2/memchr3 gap-copy.
3202        // For each squeezable char found, copy the gap before it,
3203        // then emit one byte (if not a squeeze duplicate) and skip the run.
3204        let ptr = buf.as_mut_ptr();
3205        let mut wp = 0usize;
3206        let mut cursor = 0usize;
3207
3208        macro_rules! find_next {
3209            ($start:expr) => {
3210                if let Some(c) = c2 {
3211                    memchr::memchr3(c0, c1, c, &buf[$start..n])
3212                } else {
3213                    memchr::memchr2(c0, c1, &buf[$start..n])
3214                }
3215            };
3216        }
3217
3218        while cursor < n {
3219            match find_next!(cursor) {
3220                Some(offset) => {
3221                    let pos = cursor + offset;
3222                    let b = unsafe { *ptr.add(pos) };
3223
3224                    // Copy gap before squeeze point
3225                    let gap = pos - cursor;
3226                    if gap > 0 {
3227                        if wp != cursor {
3228                            unsafe {
3229                                std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3230                            }
3231                        }
3232                        wp += gap;
3233                        last_squeezed = 256;
3234                    }
3235
3236                    // Emit single byte if not duplicate
3237                    if last_squeezed != b as u16 {
3238                        unsafe { *ptr.add(wp) = b };
3239                        wp += 1;
3240                        last_squeezed = b as u16;
3241                    }
3242
3243                    // Skip the run of same byte
3244                    cursor = pos + 1;
3245                    while cursor < n && unsafe { *ptr.add(cursor) } == b {
3246                        cursor += 1;
3247                    }
3248                }
3249                None => {
3250                    // No more squeeze chars — copy remainder
3251                    let rem = n - cursor;
3252                    if rem > 0 {
3253                        if wp != cursor {
3254                            unsafe {
3255                                std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3256                            }
3257                        }
3258                        wp += rem;
3259                        last_squeezed = 256;
3260                    }
3261                    break;
3262                }
3263            }
3264        }
3265
3266        writer.write_all(&buf[..wp])?;
3267    }
3268    Ok(())
3269}
3270
3271fn squeeze_single_stream(
3272    ch: u8,
3273    reader: &mut impl Read,
3274    writer: &mut impl Write,
3275) -> io::Result<()> {
3276    // In-place compaction: memmem finds consecutive pairs, then gap-copy
3277    // in the same buffer to remove duplicates. Single write_all per chunk
3278    // eliminates writev overhead (saves ~5-10 syscalls for 10MB input).
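    // Behavior sketch (hypothetical input): squeezing ' ' turns "a  b   c" into "a b c";
    // the finder only fires on "  " pairs, so isolated single spaces are copied untouched.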
3279    let pair = [ch, ch];
3280    let finder = memchr::memmem::Finder::new(&pair);
3281    let mut buf = alloc_uninit_vec(STREAM_BUF);
3282    let mut was_squeeze_char = false;
3283
3284    loop {
3285        let n = read_once(reader, &mut buf)?;
3286        if n == 0 {
3287            break;
3288        }
3289
3290        let mut i = 0;
3291
3292        // Handle carry-over: if previous chunk ended with squeeze char,
3293        // skip leading occurrences of that char in this chunk.
3294        if was_squeeze_char {
3295            while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3296                i += 1;
3297            }
3298            if i >= n {
3299                continue;
3300            }
3301        }
3302
3303        // In-place compaction: scan for consecutive pairs and remove duplicates.
3304        let ptr = buf.as_mut_ptr();
3305        let mut wp = 0usize;
3306
3307        loop {
3308            match finder.find(&buf[i..n]) {
3309                Some(offset) => {
3310                    // Copy everything up to and including the first char of the pair
3311                    let seg_end = i + offset + 1;
3312                    let gap = seg_end - i;
3313                    if gap > 0 {
3314                        if wp != i {
3315                            unsafe {
3316                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3317                            }
3318                        }
3319                        wp += gap;
3320                    }
3321                    i = seg_end;
3322                    // Skip all remaining consecutive ch bytes (the run)
3323                    while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3324                        i += 1;
3325                    }
3326                    if i >= n {
3327                        was_squeeze_char = true;
3328                        break;
3329                    }
3330                }
3331                None => {
3332                    // No more consecutive pairs — copy remainder
3333                    let rem = n - i;
3334                    if rem > 0 {
3335                        if wp != i {
3336                            unsafe {
3337                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3338                            }
3339                        }
3340                        wp += rem;
3341                    }
3342                    was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3343                    break;
3344                }
3345            }
3346        }
3347
3348        if wp > 0 {
3349            writer.write_all(&buf[..wp])?;
3350        }
3351    }
3352    Ok(())
3353}
3354
3355// ============================================================================
3356// Batch in-place functions (owned data from piped stdin)
3357// ============================================================================
3358
3359/// Translate bytes in-place on an owned buffer, then write.
3360/// For piped stdin where we own the data, this avoids the separate output buffer
3361/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
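/// A minimal usage sketch (assumes set1/set2 are already-expanded, equal-length byte
/// lists; `ignore` because module paths are omitted):
/// ```ignore
/// let mut data = b"aabbcc".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// translate_owned(b"abc", b"xyz", &mut data, &mut out).unwrap();
/// assert_eq!(out, b"xxyyzz");
/// ```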
3362pub fn translate_owned(
3363    set1: &[u8],
3364    set2: &[u8],
3365    data: &mut [u8],
3366    writer: &mut impl Write,
3367) -> io::Result<()> {
3368    let table = build_translate_table(set1, set2);
3369
3370    // Identity table — pure passthrough
3371    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3372    if is_identity {
3373        return writer.write_all(data);
3374    }
3375
3376    // SIMD range fast path (in-place)
3377    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3378        if data.len() >= PARALLEL_THRESHOLD {
3379            let n_threads = rayon::current_num_threads().max(1);
3380            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3381            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3382                translate_range_simd_inplace(chunk, lo, hi, offset);
3383            });
3384        } else {
3385            translate_range_simd_inplace(data, lo, hi, offset);
3386        }
3387        return writer.write_all(data);
3388    }
3389
3390    // SIMD range-to-constant fast path (in-place)
3391    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3392        if data.len() >= PARALLEL_THRESHOLD {
3393            let n_threads = rayon::current_num_threads().max(1);
3394            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3395            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3396                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3397            });
3398        } else {
3399            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3400        }
3401        return writer.write_all(data);
3402    }
3403
3404    // General table lookup (in-place)
3405    if data.len() >= PARALLEL_THRESHOLD {
3406        let n_threads = rayon::current_num_threads().max(1);
3407        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3408        data.par_chunks_mut(chunk_size).for_each(|chunk| {
3409            translate_inplace(chunk, &table);
3410        });
3411    } else {
3412        translate_inplace(data, &table);
3413    }
3414    writer.write_all(data)
3415}
3416
3417// ============================================================================
3418// Mmap-based functions (zero-copy input from byte slice)
3419// ============================================================================
3420
3422/// Translate bytes from an mmap'd byte slice.
3423/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
3424/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
3425/// Falls back to scalar 256-byte table lookup for general translations.
3426///
3427/// For data >= PARALLEL_THRESHOLD: rayon parallel translate across cores, then one write_all.
3428/// Otherwise: chunked translate through a 256KB L2-resident buffer, with one
3429/// write_all per chunk.
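/// A minimal usage sketch (byte sets assumed pre-expanded; `ignore` because module
/// paths are omitted):
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// translate_mmap(b"abc", b"xyz", b"aabbcc", &mut out).unwrap();
/// assert_eq!(out, b"xxyyzz");
/// ```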
3430pub fn translate_mmap(
3431    set1: &[u8],
3432    set2: &[u8],
3433    data: &[u8],
3434    writer: &mut impl Write,
3435) -> io::Result<()> {
3436    let table = build_translate_table(set1, set2);
3437
3438    // Check if table is identity — pure passthrough
3439    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3440    if is_identity {
3441        return writer.write_all(data);
3442    }
3443
3444    // Try SIMD fast path for single-range constant-offset translations
3445    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3446        return translate_mmap_range(data, writer, lo, hi, offset);
3447    }
3448
3449    // Try SIMD fast path for range-to-constant translations
3450    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3451        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3452    }
3453
3454    // General case: table lookup (with parallel processing for large data)
3455    translate_mmap_table(data, writer, &table)
3456}
3457
3458/// SIMD range translate for mmap data, with rayon parallel processing.
3459fn translate_mmap_range(
3460    data: &[u8],
3461    writer: &mut impl Write,
3462    lo: u8,
3463    hi: u8,
3464    offset: i8,
3465) -> io::Result<()> {
3466    // Parallel path: split data into chunks, translate each in parallel
3467    if data.len() >= PARALLEL_THRESHOLD {
3468        let mut buf = alloc_uninit_vec(data.len());
3469        let n_threads = rayon::current_num_threads().max(1);
3470        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3471
3472        // Process chunks in parallel: each thread writes to its slice of buf
3473        data.par_chunks(chunk_size)
3474            .zip(buf.par_chunks_mut(chunk_size))
3475            .for_each(|(src_chunk, dst_chunk)| {
3476                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3477            });
3478
3479        return writer.write_all(&buf);
3480    }
3481
3482    // Chunked SIMD translate: 256KB buffer fits in L2 cache.
3483    const CHUNK: usize = 256 * 1024;
3484    let buf_size = data.len().min(CHUNK);
3485    let mut buf = alloc_uninit_vec(buf_size);
3486    for chunk in data.chunks(CHUNK) {
3487        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3488        writer.write_all(&buf[..chunk.len()])?;
3489    }
3490    Ok(())
3491}
3492
3493/// SIMD range-to-constant translate for mmap data.
3494/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
3495fn translate_mmap_range_to_constant(
3496    data: &[u8],
3497    writer: &mut impl Write,
3498    lo: u8,
3499    hi: u8,
3500    replacement: u8,
3501) -> io::Result<()> {
3502    // For mmap data (read-only), copy to buffer and translate in-place
3503    if data.len() >= PARALLEL_THRESHOLD {
3504        let mut buf = alloc_uninit_vec(data.len());
3505        let n_threads = rayon::current_num_threads().max(1);
3506        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3507
3508        // Copy + translate in parallel
3509        data.par_chunks(chunk_size)
3510            .zip(buf.par_chunks_mut(chunk_size))
3511            .for_each(|(src_chunk, dst_chunk)| {
3512                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3513                translate_range_to_constant_simd_inplace(
3514                    &mut dst_chunk[..src_chunk.len()],
3515                    lo,
3516                    hi,
3517                    replacement,
3518                );
3519            });
3520
3521        return writer.write_all(&buf);
3522    }
3523
3524    // Chunked translate: 256KB buffer fits in L2 cache.
3525    const CHUNK: usize = 256 * 1024;
3526    let buf_size = data.len().min(CHUNK);
3527    let mut buf = alloc_uninit_vec(buf_size);
3528    for chunk in data.chunks(CHUNK) {
3529        buf[..chunk.len()].copy_from_slice(chunk);
3530        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3531        writer.write_all(&buf[..chunk.len()])?;
3532    }
3533    Ok(())
3534}
3535
3536/// General table-lookup translate for mmap data, with rayon parallel processing.
3537fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3538    // Parallel path: split data into chunks, translate each in parallel
3539    if data.len() >= PARALLEL_THRESHOLD {
3540        let mut buf = alloc_uninit_vec(data.len());
3541        let n_threads = rayon::current_num_threads().max(1);
3542        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3543
3544        data.par_chunks(chunk_size)
3545            .zip(buf.par_chunks_mut(chunk_size))
3546            .for_each(|(src_chunk, dst_chunk)| {
3547                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3548            });
3549
3550        return writer.write_all(&buf);
3551    }
3552
3553    // Chunked translate: 256KB buffer fits in L2 cache.
3554    const CHUNK: usize = 256 * 1024;
3555    let buf_size = data.len().min(CHUNK);
3556    let mut buf = alloc_uninit_vec(buf_size);
3557    for chunk in data.chunks(CHUNK) {
3558        translate_to(chunk, &mut buf[..chunk.len()], table);
3559        writer.write_all(&buf[..chunk.len()])?;
3560    }
3561    Ok(())
3562}
3563
3564/// Translate bytes in-place on a mutable buffer (e.g., MAP_PRIVATE mmap).
3565/// For large data this avoids a separate output allocation: the kernel's COW
3566/// semantics mean only the pages actually modified are physically copied.
3567///
3568/// For data < 64MB: delegates to translate_to_separate_buf to avoid COW faults.
3569/// For data >= PARALLEL_THRESHOLD: rayon parallel in-place translate; otherwise single-threaded.
3570pub fn translate_mmap_inplace(
3571    set1: &[u8],
3572    set2: &[u8],
3573    data: &mut [u8],
3574    writer: &mut impl Write,
3575) -> io::Result<()> {
3576    let table = build_translate_table(set1, set2);
3577
3578    // Check if table is identity — pure passthrough
3579    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3580    if is_identity {
3581        return writer.write_all(data);
3582    }
3583
3584    // For data that's being translated in a MAP_PRIVATE mmap, every modified page
3585    // triggers a COW fault. For small-to-medium files where most bytes change,
3586    // reading from mmap (read-only) + writing to a separate heap buffer is faster
3587    // because it avoids COW faults entirely. The output buffer is fresh memory
3588    // (no COW), and the input mmap stays read-only (MADV_SEQUENTIAL).
3589    // Threshold: 64MB. For benchmark-sized files (10MB), avoid COW entirely.
3590    const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;
3591
3592    if data.len() < SEPARATE_BUF_THRESHOLD {
3593        return translate_to_separate_buf(data, &table, writer);
3594    }
3595
3596    // Try SIMD fast path for single-range constant-offset translations (e.g., a-z -> A-Z)
3597    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3598        if data.len() >= PARALLEL_THRESHOLD {
3599            let n_threads = rayon::current_num_threads().max(1);
3600            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3601            data.par_chunks_mut(chunk_size)
3602                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3603        } else {
3604            translate_range_simd_inplace(data, lo, hi, offset);
3605        }
3606        return writer.write_all(data);
3607    }
3608
3609    // Try SIMD fast path for range-to-constant translations
3610    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3611        if data.len() >= PARALLEL_THRESHOLD {
3612            let n_threads = rayon::current_num_threads().max(1);
3613            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3614            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3615                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3616            });
3617        } else {
3618            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3619        }
3620        return writer.write_all(data);
3621    }
3622
3623    // General case: in-place table lookup
3624    if data.len() >= PARALLEL_THRESHOLD {
3625        let n_threads = rayon::current_num_threads().max(1);
3626        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3627        data.par_chunks_mut(chunk_size)
3628            .for_each(|chunk| translate_inplace(chunk, &table));
3629    } else {
3630        translate_inplace(data, &table);
3631    }
3632    writer.write_all(data)
3633}
3634
3635/// Translate from read-only source to a separate output buffer, avoiding COW faults.
3636/// Uses the appropriate SIMD path (range offset, range-to-constant, or general nibble).
3637///
3638/// For data >= PARALLEL_THRESHOLD: parallel chunked translate into full-size buffer.
3639/// For smaller data: single full-size allocation + single write_all for minimum
3640/// syscall overhead. At 10MB, the allocation is cheap and a single write() is faster
3641/// than multiple 4MB chunked writes.
3642fn translate_to_separate_buf(
3643    data: &[u8],
3644    table: &[u8; 256],
3645    writer: &mut impl Write,
3646) -> io::Result<()> {
3647    let range_info = detect_range_offset(table);
3648    let const_info = if range_info.is_none() {
3649        detect_range_to_constant(table)
3650    } else {
3651        None
3652    };
3653
3654    if data.len() >= PARALLEL_THRESHOLD {
3655        // Parallel path: full-size output buffer, parallel translate, single write.
3656        let mut out_buf = alloc_uninit_vec(data.len());
3657        let n_threads = rayon::current_num_threads().max(1);
3658        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3659
3660        if let Some((lo, hi, offset)) = range_info {
3661            data.par_chunks(chunk_size)
3662                .zip(out_buf.par_chunks_mut(chunk_size))
3663                .for_each(|(src, dst)| {
3664                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3665                });
3666        } else if let Some((lo, hi, replacement)) = const_info {
3667            data.par_chunks(chunk_size)
3668                .zip(out_buf.par_chunks_mut(chunk_size))
3669                .for_each(|(src, dst)| {
3670                    translate_range_to_constant_simd(
3671                        src,
3672                        &mut dst[..src.len()],
3673                        lo,
3674                        hi,
3675                        replacement,
3676                    );
3677                });
3678        } else {
3679            data.par_chunks(chunk_size)
3680                .zip(out_buf.par_chunks_mut(chunk_size))
3681                .for_each(|(src, dst)| {
3682                    translate_to(src, &mut dst[..src.len()], table);
3683                });
3684        }
3685        return writer.write_all(&out_buf);
3686    }
3687
3688    // Single-allocation translate: full-size output buffer, single translate, single write.
3689    // For 10MB data, this does 1 write() instead of 40 chunked writes, eliminating
3690    // 39 write() syscalls. SIMD translate streams through src and dst sequentially,
3691    // so the L2 cache argument for 256KB chunks doesn't apply (src data doesn't fit
3692    // in L2 anyway). The reduced syscall overhead more than compensates.
3693    let mut out_buf = alloc_uninit_vec(data.len());
3694    if let Some((lo, hi, offset)) = range_info {
3695        translate_range_simd(data, &mut out_buf, lo, hi, offset);
3696    } else if let Some((lo, hi, replacement)) = const_info {
3697        translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3698    } else {
3699        translate_to(data, &mut out_buf, table);
3700    }
3701    writer.write_all(&out_buf)
3702}
3703
3704/// Translate from a read-only mmap (or any byte slice) to a separate output buffer.
3705/// Avoids MAP_PRIVATE COW page faults by reading from the original data and
3706/// writing to a freshly allocated heap buffer.
3707pub fn translate_mmap_readonly(
3708    set1: &[u8],
3709    set2: &[u8],
3710    data: &[u8],
3711    writer: &mut impl Write,
3712) -> io::Result<()> {
3713    let table = build_translate_table(set1, set2);
3714
3715    // Check if table is identity — pure passthrough
3716    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3717    if is_identity {
3718        return writer.write_all(data);
3719    }
3720
3721    translate_to_separate_buf(data, &table, writer)
3722}
3723
3724/// Translate + squeeze from mmap'd byte slice.
3725///
3726/// For data >= PARALLEL_THRESHOLD: two-phase approach: parallel translate, then
3727/// sequential in-place squeeze on the translated buffer.
3728/// Otherwise: single full-size allocation, fused translate + squeeze, one write syscall.
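/// A behavior sketch (byte sets assumed pre-expanded; squeezing applies to set2;
/// `ignore` because module paths are omitted):
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// translate_squeeze_mmap(b"abc", b"zzz", b"aabbcc", &mut out).unwrap();
/// assert_eq!(out, b"z"); // "aabbcc" -> "zzzzzz" -> squeezed to "z"
/// ```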
3729pub fn translate_squeeze_mmap(
3730    set1: &[u8],
3731    set2: &[u8],
3732    data: &[u8],
3733    writer: &mut impl Write,
3734) -> io::Result<()> {
3735    let table = build_translate_table(set1, set2);
3736    let squeeze_set = build_member_set(set2);
3737
3738    // For large data: two-phase approach
3739    // Phase 1: parallel translate into buffer
3740    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
3741    //          (squeeze only removes bytes, never grows, so no second allocation needed)
3742    if data.len() >= PARALLEL_THRESHOLD {
3743        // Phase 1: parallel translate
3744        let mut translated = alloc_uninit_vec(data.len());
3745        let range_info = detect_range_offset(&table);
3746        let n_threads = rayon::current_num_threads().max(1);
3747        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3748
3749        if let Some((lo, hi, offset)) = range_info {
3750            data.par_chunks(chunk_size)
3751                .zip(translated.par_chunks_mut(chunk_size))
3752                .for_each(|(src_chunk, dst_chunk)| {
3753                    translate_range_simd(
3754                        src_chunk,
3755                        &mut dst_chunk[..src_chunk.len()],
3756                        lo,
3757                        hi,
3758                        offset,
3759                    );
3760                });
3761        } else {
3762            data.par_chunks(chunk_size)
3763                .zip(translated.par_chunks_mut(chunk_size))
3764                .for_each(|(src_chunk, dst_chunk)| {
3765                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
3766                });
3767        }
3768
3769        // Phase 2: squeeze in-place on the translated buffer.
3770        // Since squeeze only removes bytes (never grows), we can read ahead and
3771        // compact into the same buffer, saving a full data.len() heap allocation.
3772        let mut last_squeezed: u16 = 256;
3773        let len = translated.len();
3774        let mut wp = 0;
3775        unsafe {
3776            let ptr = translated.as_mut_ptr();
3777            let mut i = 0;
3778            while i < len {
3779                let b = *ptr.add(i);
3780                if is_member(&squeeze_set, b) {
3781                    if last_squeezed == b as u16 {
3782                        i += 1;
3783                        continue;
3784                    }
3785                    last_squeezed = b as u16;
3786                } else {
3787                    last_squeezed = 256;
3788                }
3789                *ptr.add(wp) = b;
3790                wp += 1;
3791                i += 1;
3792            }
3793        }
3794        return writer.write_all(&translated[..wp]);
3795    }
3796
3797    // Single-allocation translate+squeeze: full-size buffer, single write_all.
3798    // For 10MB data, this does 1 write() instead of ~40 chunked writes.
3799    let mut buf = alloc_uninit_vec(data.len());
3800    translate_to(data, &mut buf, &table);
3801    let mut last_squeezed: u16 = 256;
3802    let mut wp = 0;
3803    unsafe {
3804        let ptr = buf.as_mut_ptr();
3805        for i in 0..data.len() {
3806            let b = *ptr.add(i);
3807            if is_member(&squeeze_set, b) {
3808                if last_squeezed == b as u16 {
3809                    continue;
3810                }
3811                last_squeezed = b as u16;
3812            } else {
3813                last_squeezed = 256;
3814            }
3815            *ptr.add(wp) = b;
3816            wp += 1;
3817        }
3818    }
3819    writer.write_all(&buf[..wp])
3820}
3821
3822/// Delete from mmap'd byte slice.
3823///
3824/// Dispatches on the delete set: single char and 2-3 chars use memchr-based paths,
3825/// contiguous ranges use SIMD range compare, sparse deletes use zero-copy writev,
3826/// and dense deletes compact into a buffer (parallel for data >= PARALLEL_THRESHOLD).
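/// A minimal usage sketch (the contiguous digit range takes the SIMD range path;
/// `ignore` because module paths are omitted):
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// delete_mmap(b"0123456789", b"a1b22c333", &mut out).unwrap();
/// assert_eq!(out, b"abc");
/// ```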
3827pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
3828    if delete_chars.len() == 1 {
3829        return delete_single_char_mmap(delete_chars[0], data, writer);
3830    }
3831    if delete_chars.len() <= 3 {
3832        return delete_multi_memchr_mmap(delete_chars, data, writer);
3833    }
3834
3835    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
3836    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
3837        return delete_range_mmap(data, writer, lo, hi);
3838    }
3839
3840    let member = build_member_set(delete_chars);
3841
3842    // Heuristic: estimate total delete positions. Zero-copy writev is only efficient
3843    // when all gaps fit in a single writev call (< MAX_IOV/2 entries). With uniform
3844    // distribution, each delete creates an IoSlice entry. For many deletes (> 512),
3845    // multiple writev calls are needed, and the compact approach is faster.
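    // Worked example (hypothetical numbers): a 10MB input with 4 hits in the 1KB
    // sample estimates 10,485,760 * 4 / 1024 = 40,960 deletes, far above
    // MAX_IOV / 2 = 512, so the dense compact path is taken.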
3846    let sample_size = data.len().min(1024);
3847    let sample_deletes = data[..sample_size]
3848        .iter()
3849        .filter(|&&b| is_member(&member, b))
3850        .count();
3851    let estimated_deletes = if sample_size > 0 {
3852        data.len() * sample_deletes / sample_size
3853    } else {
3854        data.len()
3855    };
3856
3857    if estimated_deletes < MAX_IOV / 2 {
3858        return delete_bitset_zerocopy(data, &member, writer);
3859    }
3860
3861    // Dense delete: parallel compact with writev (avoids scatter-gather copy)
3862    if data.len() >= PARALLEL_THRESHOLD {
3863        let n_threads = rayon::current_num_threads().max(1);
3864        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3865
3866        let mut outbuf = alloc_uninit_vec(data.len());
3867        let chunk_lens: Vec<usize> = data
3868            .par_chunks(chunk_size)
3869            .zip(outbuf.par_chunks_mut(chunk_size))
3870            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
3871            .collect();
3872
3873        // Use writev to write each chunk at its original position, avoiding
3874        // the O(N) scatter-gather memmove. With ~4 threads, that's 4 IoSlice
3875        // entries — far below MAX_IOV.
3876        let slices: Vec<std::io::IoSlice> = chunk_lens
3877            .iter()
3878            .enumerate()
3879            .filter(|&(_, &len)| len > 0)
3880            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
3881            .collect();
3882        return write_ioslices(writer, &slices);
3883    }
3884
3885    // Streaming compact: 256KB output buffer reduces page fault overhead.
3886    // For 10MB data: ~64 page faults instead of ~2500, with ~40 write_all calls.
3887    const COMPACT_BUF: usize = 256 * 1024;
3888    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
3889
3890    for chunk in data.chunks(COMPACT_BUF) {
3891        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
3892        if out_pos > 0 {
3893            writer.write_all(&outbuf[..out_pos])?;
3894        }
3895    }
3896    Ok(())
3897}
3898
3899/// SIMD range delete for mmap data.
3900/// Uses a density heuristic: when the estimated delete count is below MAX_IOV/2,
3901/// uses zero-copy writev directly from mmap data (no output buffer allocation).
3902/// For dense deletes, uses SIMD compact into a pre-allocated buffer.
3903fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
3904    // Sample first 1024 bytes to estimate delete density
3905    let sample_size = data.len().min(1024);
3906    let sample_deletes = data[..sample_size]
3907        .iter()
3908        .filter(|&&b| b >= lo && b <= hi)
3909        .count();
3910    // Estimate expected number of delete positions (IoSlice entries for zero-copy).
3911    // Each delete creates an IoSlice entry. With MAX_IOV=1024 per writev,
3912    // if estimated_deletes > MAX_IOV/2, the writev overhead from multiple syscalls
3913    // exceeds the compact approach cost. Only use zero-copy when all gaps fit in
3914    // a single writev call.
3915    let estimated_deletes = if sample_size > 0 {
3916        data.len() * sample_deletes / sample_size
3917    } else {
3918        data.len()
3919    };
3920    if estimated_deletes < MAX_IOV / 2 {
3921        return delete_range_mmap_zerocopy(data, writer, lo, hi);
3922    }
3923
3924    // Dense deletes: parallel compact with writev (avoids scatter-gather copy)
3925    if data.len() >= PARALLEL_THRESHOLD {
3926        let n_threads = rayon::current_num_threads().max(1);
3927        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3928
3929        let mut outbuf = alloc_uninit_vec(data.len());
3930        let chunk_lens: Vec<usize> = data
3931            .par_chunks(chunk_size)
3932            .zip(outbuf.par_chunks_mut(chunk_size))
3933            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
3934            .collect();
3935
3936        // Use writev to write each chunk at its original position, avoiding
3937        // the O(N) scatter-gather memmove.
3938        let slices: Vec<std::io::IoSlice> = chunk_lens
3939            .iter()
3940            .enumerate()
3941            .filter(|&(_, &len)| len > 0)
3942            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
3943            .collect();
3944        return write_ioslices(writer, &slices);
3945    }
3946
3947    // Streaming compact: use 256KB output buffer instead of full data.len() buffer.
3948    // This reduces page fault overhead from ~2500 faults (10MB) to ~64 faults (256KB).
3949    // The extra write_all calls (~40 for 10MB) are negligible cost.
3950    const COMPACT_BUF: usize = 256 * 1024;
3951    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
3952
3953    #[cfg(target_arch = "x86_64")]
3954    {
3955        let mut wp = 0;
3956        let level = get_simd_level();
3957        let len = data.len();
3958        let sp = data.as_ptr();
3959        let dp = outbuf.as_mut_ptr();
3960        let mut ri = 0;
3961
3962        if level >= 3 {
3963            use std::arch::x86_64::*;
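            // Range test via a single signed compare: the bias maps [lo..=hi] onto
            // [0x80 ..= 0x80 + (hi - lo)], i.e. the most negative i8 values, so
            // "biased > threshold" (signed) is true exactly for bytes outside the range.
            // Worked example for digits (lo=0x30, hi=0x39): bias = 0x50, so '0'..'9'
            // become 0x80..0x89 (-128..-119 signed); every other byte compares greater
            // than the threshold -119.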
3964            let range = hi - lo;
3965            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
3966            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
3967            let zero = unsafe { _mm256_setzero_si256() };
3968
3969            while ri + 32 <= len {
3970                // Flush when output buffer is nearly full
3971                if wp + 32 > COMPACT_BUF {
3972                    writer.write_all(&outbuf[..wp])?;
3973                    wp = 0;
3974                }
3975
3976                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
3977                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
3978                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
3979                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
3980                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
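                // keep_mask bit j is set when byte j of the 32-byte block survives.
                // It is split into four 8-bit masks; each group of 8 source bytes is
                // copied verbatim (all kept), skipped (none kept), or compacted via
                // compact_8bytes and the COMPACT_LUT.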
3981
3982                if keep_mask == 0xFFFFFFFF {
3983                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
3984                    wp += 32;
3985                } else if keep_mask != 0 {
3986                    let m0 = keep_mask as u8;
3987                    let m1 = (keep_mask >> 8) as u8;
3988                    let m2 = (keep_mask >> 16) as u8;
3989                    let m3 = (keep_mask >> 24) as u8;
3990
3991                    if m0 == 0xFF {
3992                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
3993                    } else if m0 != 0 {
3994                        unsafe { compact_8bytes(sp.add(ri), dp.add(wp), m0) };
3995                    }
3996                    let c0 = m0.count_ones() as usize;
3997
3998                    if m1 == 0xFF {
3999                        unsafe {
4000                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4001                        };
4002                    } else if m1 != 0 {
4003                        unsafe { compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1) };
4004                    }
4005                    let c1 = m1.count_ones() as usize;
4006
4007                    if m2 == 0xFF {
4008                        unsafe {
4009                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4010                        };
4011                    } else if m2 != 0 {
4012                        unsafe { compact_8bytes(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4013                    }
4014                    let c2 = m2.count_ones() as usize;
4015
4016                    if m3 == 0xFF {
4017                        unsafe {
4018                            std::ptr::copy_nonoverlapping(
4019                                sp.add(ri + 24),
4020                                dp.add(wp + c0 + c1 + c2),
4021                                8,
4022                            )
4023                        };
4024                    } else if m3 != 0 {
4025                        unsafe { compact_8bytes(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3) };
4026                    }
4027                    let c3 = m3.count_ones() as usize;
4028                    wp += c0 + c1 + c2 + c3;
4029                }
4030                ri += 32;
4031            }
4032        }
4033
4034        // Scalar tail
4035        while ri < len {
4036            if wp + 1 > COMPACT_BUF {
4037                writer.write_all(&outbuf[..wp])?;
4038                wp = 0;
4039            }
4040            let b = unsafe { *sp.add(ri) };
4041            unsafe { *dp.add(wp) = b };
4042            wp += (b < lo || b > hi) as usize;
4043            ri += 1;
4044        }
4045
4046        if wp > 0 {
4047            writer.write_all(&outbuf[..wp])?;
4048        }
4049        return Ok(());
4050    }
4051
4052    #[cfg(not(target_arch = "x86_64"))]
4053    {
4054        // Non-x86 fallback: chunk the source and process with delete_range_chunk
4055        for chunk in data.chunks(COMPACT_BUF) {
4056            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4057            if clen > 0 {
4058                writer.write_all(&outbuf[..clen])?;
4059            }
4060        }
4061        return Ok(());
4062    }
4063
4064    #[allow(unreachable_code)]
4065    Ok(())
4066}
4067
4068/// Zero-copy range delete for mmap data: SIMD-scans for bytes in [lo..=hi],
4069/// builds IoSlice entries pointing to the gaps between deleted ranges in the
4070/// original mmap data, and writes using writev. No output buffer allocation.
4071/// For 10MB text with 4% digits: ~1.5ms vs ~4ms for the compact approach.
4072fn delete_range_mmap_zerocopy(
4073    data: &[u8],
4074    writer: &mut impl Write,
4075    lo: u8,
4076    hi: u8,
4077) -> io::Result<()> {
4078    #[cfg(target_arch = "x86_64")]
4079    {
4080        if get_simd_level() >= 3 {
4081            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4082        }
4083        if get_simd_level() >= 2 {
4084            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4085        }
4086    }
4087
4088    #[cfg(target_arch = "aarch64")]
4089    {
4090        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4091    }
4092
4093    // Scalar fallback: byte-by-byte scan with IoSlice batching
4094    #[allow(unreachable_code)]
4095    delete_range_zerocopy_scalar(data, writer, lo, hi)
4096}
4097
4098/// Scalar zero-copy range delete: byte-by-byte scan with IoSlice batching.
4099/// Used as fallback when SIMD is unavailable.
4100fn delete_range_zerocopy_scalar(
4101    data: &[u8],
4102    writer: &mut impl Write,
4103    lo: u8,
4104    hi: u8,
4105) -> io::Result<()> {
4106    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4107    let len = data.len();
4108    let mut run_start: usize = 0;
4109    let mut i: usize = 0;
4110
4111    while i < len {
4112        let b = unsafe { *data.get_unchecked(i) };
4113        if b >= lo && b <= hi {
4114            if i > run_start {
4115                iov.push(std::io::IoSlice::new(&data[run_start..i]));
4116                if iov.len() >= MAX_IOV {
4117                    write_ioslices(writer, &iov)?;
4118                    iov.clear();
4119                }
4120            }
4121            run_start = i + 1;
4122        }
4123        i += 1;
4124    }
4125    if run_start < len {
4126        iov.push(std::io::IoSlice::new(&data[run_start..]));
4127    }
4128    if !iov.is_empty() {
4129        write_ioslices(writer, &iov)?;
4130    }
4131    Ok(())
4132}
4133
4134/// AVX2 zero-copy range delete: scans 32 bytes at a time using SIMD range
4135/// comparison, then iterates only the delete positions from the bitmask.
4136/// Blocks with no deletes (common for sparse data) skip with zero per-byte work.
4137#[cfg(target_arch = "x86_64")]
4138#[target_feature(enable = "avx2")]
4139unsafe fn delete_range_zerocopy_avx2(
4140    data: &[u8],
4141    writer: &mut impl Write,
4142    lo: u8,
4143    hi: u8,
4144) -> io::Result<()> {
4145    use std::arch::x86_64::*;
4146
4147    unsafe {
4148        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4149        let len = data.len();
4150        let mut run_start: usize = 0;
4151        let mut ri: usize = 0;
4152
4153        let range = hi - lo;
4154        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4155        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4156        let zero = _mm256_setzero_si256();
4157
4158        while ri + 32 <= len {
4159            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4160            let biased = _mm256_add_epi8(input, bias_v);
4161            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4162            let in_range = _mm256_cmpeq_epi8(gt, zero);
4163            let del_mask = _mm256_movemask_epi8(in_range) as u32;
4164
4165            if del_mask == 0 {
4166                // No bytes to delete — run continues
4167                ri += 32;
4168                continue;
4169            }
4170
4171            // Process each deleted byte position from the bitmask
4172            let mut m = del_mask;
4173            while m != 0 {
4174                let bit = m.trailing_zeros() as usize;
4175                let abs_pos = ri + bit;
4176                if abs_pos > run_start {
4177                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4178                    if iov.len() >= MAX_IOV {
4179                        write_ioslices(writer, &iov)?;
4180                        iov.clear();
4181                    }
4182                }
4183                run_start = abs_pos + 1;
4184                m &= m - 1; // clear lowest set bit (blsr)
4185            }
4186
4187            ri += 32;
4188        }
4189
4190        // Scalar tail
4191        while ri < len {
4192            let b = *data.get_unchecked(ri);
4193            if b >= lo && b <= hi {
4194                if ri > run_start {
4195                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4196                    if iov.len() >= MAX_IOV {
4197                        write_ioslices(writer, &iov)?;
4198                        iov.clear();
4199                    }
4200                }
4201                run_start = ri + 1;
4202            }
4203            ri += 1;
4204        }
4205
4206        if run_start < len {
4207            iov.push(std::io::IoSlice::new(&data[run_start..]));
4208        }
4209        if !iov.is_empty() {
4210            write_ioslices(writer, &iov)?;
4211        }
4212        Ok(())
4213    }
4214}
4215
4216/// SSE2 zero-copy range delete: same approach as AVX2 but with 16-byte blocks.
4217#[cfg(target_arch = "x86_64")]
4218#[target_feature(enable = "sse2")]
4219unsafe fn delete_range_zerocopy_sse2(
4220    data: &[u8],
4221    writer: &mut impl Write,
4222    lo: u8,
4223    hi: u8,
4224) -> io::Result<()> {
4225    use std::arch::x86_64::*;
4226
4227    unsafe {
4228        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4229        let len = data.len();
4230        let mut run_start: usize = 0;
4231        let mut ri: usize = 0;
4232
4233        let range = hi - lo;
4234        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4235        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4236        let zero = _mm_setzero_si128();
4237
4238        while ri + 16 <= len {
4239            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4240            let biased = _mm_add_epi8(input, bias_v);
4241            let gt = _mm_cmpgt_epi8(biased, threshold_v);
4242            let in_range = _mm_cmpeq_epi8(gt, zero);
4243            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4244
4245            if del_mask == 0 {
4246                ri += 16;
4247                continue;
4248            }
4249
4250            let mut m = del_mask;
4251            while m != 0 {
4252                let bit = m.trailing_zeros() as usize;
4253                let abs_pos = ri + bit;
4254                if abs_pos > run_start {
4255                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4256                    if iov.len() >= MAX_IOV {
4257                        write_ioslices(writer, &iov)?;
4258                        iov.clear();
4259                    }
4260                }
4261                run_start = abs_pos + 1;
4262                m &= m - 1;
4263            }
4264
4265            ri += 16;
4266        }
4267
4268        while ri < len {
4269            let b = *data.get_unchecked(ri);
4270            if b >= lo && b <= hi {
4271                if ri > run_start {
4272                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4273                    if iov.len() >= MAX_IOV {
4274                        write_ioslices(writer, &iov)?;
4275                        iov.clear();
4276                    }
4277                }
4278                run_start = ri + 1;
4279            }
4280            ri += 1;
4281        }
4282
4283        if run_start < len {
4284            iov.push(std::io::IoSlice::new(&data[run_start..]));
4285        }
4286        if !iov.is_empty() {
4287            write_ioslices(writer, &iov)?;
4288        }
4289        Ok(())
4290    }
4291}
4292
4293/// NEON zero-copy range delete for aarch64: scans 16 bytes at a time using
4294/// NEON unsigned comparison, creates bitmask via pairwise narrowing, then
4295/// iterates delete positions from the bitmask.
4296#[cfg(target_arch = "aarch64")]
4297#[target_feature(enable = "neon")]
4298unsafe fn delete_range_zerocopy_neon(
4299    data: &[u8],
4300    writer: &mut impl Write,
4301    lo: u8,
4302    hi: u8,
4303) -> io::Result<()> {
4304    use std::arch::aarch64::*;
4305
4306    unsafe {
4307        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4308        let len = data.len();
4309        let mut run_start: usize = 0;
4310        let mut ri: usize = 0;
4311
4312        let lo_v = vdupq_n_u8(lo);
4313        let hi_v = vdupq_n_u8(hi);
4314        // Per-byte bit weights (repeated for each 8-byte half) used to build the 16-bit delete mask
4315        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4316        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4317
4318        while ri + 16 <= len {
4319            let input = vld1q_u8(data.as_ptr().add(ri));
4320            // in_range = 0xFF where lo <= byte <= hi
4321            let ge_lo = vcgeq_u8(input, lo_v);
4322            let le_hi = vcleq_u8(input, hi_v);
4323            let in_range = vandq_u8(ge_lo, le_hi);
4324
4325            // Create 16-bit bitmask: reduce 16 bytes to 2 bytes
4326            let bits = vandq_u8(in_range, bit_mask_v);
4327            let pair = vpaddlq_u8(bits); // u8→u16 pairwise add
4328            let quad = vpaddlq_u16(pair); // u16→u32
4329            let octet = vpaddlq_u32(quad); // u32→u64
4330            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4331            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4332            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4333
4334            if del_mask == 0 {
4335                // No bytes to delete — run continues
4336                ri += 16;
4337                continue;
4338            }
4339
4340            // Process each deleted byte position
4341            let mut m = del_mask;
4342            while m != 0 {
4343                let bit = m.trailing_zeros() as usize;
4344                let abs_pos = ri + bit;
4345                if abs_pos > run_start {
4346                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4347                    if iov.len() >= MAX_IOV {
4348                        write_ioslices(writer, &iov)?;
4349                        iov.clear();
4350                    }
4351                }
4352                run_start = abs_pos + 1;
4353                m &= m - 1;
4354            }
4355
4356            ri += 16;
4357        }
4358
4359        // Scalar tail
4360        while ri < len {
4361            let b = *data.get_unchecked(ri);
4362            if b >= lo && b <= hi {
4363                if ri > run_start {
4364                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4365                    if iov.len() >= MAX_IOV {
4366                        write_ioslices(writer, &iov)?;
4367                        iov.clear();
4368                    }
4369                }
4370                run_start = ri + 1;
4371            }
4372            ri += 1;
4373        }
4374
4375        if run_start < len {
4376            iov.push(std::io::IoSlice::new(&data[run_start..]));
4377        }
4378        if !iov.is_empty() {
4379            write_ioslices(writer, &iov)?;
4380        }
4381        Ok(())
4382    }
4383}
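
// A portable model of the NEON mask construction above (no intrinsics): each
// 0xFF/0x00 comparison lane is ANDed with its bit weight (1, 2, 4, ..., 128,
// repeated per 8-byte half), and the widening pairwise adds amount to a
// horizontal sum of each half. The helper and test names are illustrative.
#[cfg(test)]
mod neon_movemask_model_sketch {
    /// Scalar model: weight each comparison lane, then sum each 8-byte half.
    fn movemask_model(cmp: [u8; 16]) -> u16 {
        let weights = [1u8, 2, 4, 8, 16, 32, 64, 128];
        let mut lo = 0u16;
        let mut hi = 0u16;
        for i in 0..8 {
            lo += (cmp[i] & weights[i]) as u16;
            hi += (cmp[i + 8] & weights[i]) as u16;
        }
        (hi << 8) | lo
    }

    #[test]
    fn sets_one_bit_per_matching_lane() {
        let mut cmp = [0u8; 16];
        cmp[0] = 0xFF;
        cmp[3] = 0xFF;
        cmp[9] = 0xFF;
        assert_eq!(movemask_model(cmp), (1u16 << 0) | (1u16 << 3) | (1u16 << 9));
    }
}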
4384
4385/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
4386/// Returns number of bytes written.
4387#[inline]
4388fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4389    let len = chunk.len();
4390    let mut out_pos = 0;
4391    let mut i = 0;
4392
4393    while i + 8 <= len {
4394        unsafe {
4395            let b0 = *chunk.get_unchecked(i);
4396            let b1 = *chunk.get_unchecked(i + 1);
4397            let b2 = *chunk.get_unchecked(i + 2);
4398            let b3 = *chunk.get_unchecked(i + 3);
4399            let b4 = *chunk.get_unchecked(i + 4);
4400            let b5 = *chunk.get_unchecked(i + 5);
4401            let b6 = *chunk.get_unchecked(i + 6);
4402            let b7 = *chunk.get_unchecked(i + 7);
4403
4404            *outbuf.get_unchecked_mut(out_pos) = b0;
4405            out_pos += !is_member(member, b0) as usize;
4406            *outbuf.get_unchecked_mut(out_pos) = b1;
4407            out_pos += !is_member(member, b1) as usize;
4408            *outbuf.get_unchecked_mut(out_pos) = b2;
4409            out_pos += !is_member(member, b2) as usize;
4410            *outbuf.get_unchecked_mut(out_pos) = b3;
4411            out_pos += !is_member(member, b3) as usize;
4412            *outbuf.get_unchecked_mut(out_pos) = b4;
4413            out_pos += !is_member(member, b4) as usize;
4414            *outbuf.get_unchecked_mut(out_pos) = b5;
4415            out_pos += !is_member(member, b5) as usize;
4416            *outbuf.get_unchecked_mut(out_pos) = b6;
4417            out_pos += !is_member(member, b6) as usize;
4418            *outbuf.get_unchecked_mut(out_pos) = b7;
4419            out_pos += !is_member(member, b7) as usize;
4420        }
4421        i += 8;
4422    }
4423
4424    while i < len {
4425        unsafe {
4426            let b = *chunk.get_unchecked(i);
4427            *outbuf.get_unchecked_mut(out_pos) = b;
4428            out_pos += !is_member(member, b) as usize;
4429        }
4430        i += 1;
4431    }
4432
4433    out_pos
4434}
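
// A minimal check of the branchless store-then-advance delete above, assuming
// the `build_member_set`/`is_member` bitset helpers used throughout this file:
// the result must equal a naive per-byte filter. Input and test names are
// illustrative.
#[cfg(test)]
mod delete_chunk_bitset_sketch {
    use super::*;

    #[test]
    fn matches_naive_filter() {
        let member = build_member_set(b"aeiou");
        let input = b"the quick brown fox jumps over the lazy dog".repeat(7);
        // The output can never be larger than the input.
        let mut outbuf = vec![0u8; input.len()];
        let n = delete_chunk_bitset_into(&input, &member, &mut outbuf);
        let expected: Vec<u8> = input
            .iter()
            .copied()
            .filter(|&b| !is_member(&member, b))
            .collect();
        assert_eq!(&outbuf[..n], &expected[..]);
    }
}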
4435
4436/// Zero-copy delete for general bitset: scan for runs of kept bytes,
4437/// build IoSlice entries pointing directly into the source data.
4438/// No allocation for output data — just ~16 bytes per IoSlice entry.
4439/// Flushes in MAX_IOV-sized batches for efficient writev.
4440fn delete_bitset_zerocopy(
4441    data: &[u8],
4442    member: &[u8; 32],
4443    writer: &mut impl Write,
4444) -> io::Result<()> {
4445    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4446    let len = data.len();
4447    let mut i = 0;
4448    let mut run_start: Option<usize> = None;
4449
4450    while i < len {
4451        let b = unsafe { *data.get_unchecked(i) };
4452        if is_member(member, b) {
4453            // This byte should be deleted
4454            if let Some(rs) = run_start {
4455                iov.push(std::io::IoSlice::new(&data[rs..i]));
4456                run_start = None;
4457                if iov.len() >= MAX_IOV {
4458                    write_ioslices(writer, &iov)?;
4459                    iov.clear();
4460                }
4461            }
4462        } else {
4463            // This byte should be kept
4464            if run_start.is_none() {
4465                run_start = Some(i);
4466            }
4467        }
4468        i += 1;
4469    }
4470    // Flush final run
4471    if let Some(rs) = run_start {
4472        iov.push(std::io::IoSlice::new(&data[rs..]));
4473    }
4474    if !iov.is_empty() {
4475        write_ioslices(writer, &iov)?;
4476    }
4477    Ok(())
4478}
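
// A small equivalence sketch for the zero-copy run builder above: writing the
// batched IoSlices into a Vec<u8> (which implements Write) must reproduce a
// naive per-byte filter. Assumes the `build_member_set`/`is_member` helpers
// used in this file; input and names are illustrative.
#[cfg(test)]
mod delete_bitset_zerocopy_sketch {
    use super::*;

    #[test]
    fn kept_runs_equal_naive_filter() {
        let member = build_member_set(b"\n\t ");
        let data = b"tabs\tand\nnewlines  and  spaces".repeat(100);
        let mut written: Vec<u8> = Vec::new();
        delete_bitset_zerocopy(&data, &member, &mut written).unwrap();
        let expected: Vec<u8> = data
            .iter()
            .copied()
            .filter(|&b| !is_member(&member, b))
            .collect();
        assert_eq!(written, expected);
    }
}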
4479
4480fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4481    // Streaming zero-copy delete using writev: build IoSlice batches of MAX_IOV
4482    // pointing to gaps between deleted characters, write each batch immediately.
4483    // Avoids allocating the full Vec<IoSlice> for all positions.
4484    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4485    let mut last = 0;
4486    for pos in memchr::memchr_iter(ch, data) {
4487        if pos > last {
4488            iov.push(std::io::IoSlice::new(&data[last..pos]));
4489            if iov.len() >= MAX_IOV {
4490                write_ioslices(writer, &iov)?;
4491                iov.clear();
4492            }
4493        }
4494        last = pos + 1;
4495    }
4496    if last < data.len() {
4497        iov.push(std::io::IoSlice::new(&data[last..]));
4498    }
4499    if !iov.is_empty() {
4500        write_ioslices(writer, &iov)?;
4501    }
4502    Ok(())
4503}
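
// A quick sanity sketch for the memchr gap batching above: deleting one byte
// value must equal a plain filter, including occurrences at the very start
// and end of the input. Names are illustrative.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn gaps_cover_everything_except_the_deleted_byte() {
        let data = b",start,,middle,,,end,".repeat(50);
        let mut written: Vec<u8> = Vec::new();
        delete_single_char_mmap(b',', &data, &mut written).unwrap();
        let expected: Vec<u8> = data.iter().copied().filter(|&b| b != b',').collect();
        assert_eq!(written, expected);
    }
}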
4504
4505fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4506    let c0 = chars[0];
4507    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4508    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4509    let is_three = chars.len() >= 3;
4510
4511    // Streaming zero-copy delete: batch IoSlice entries and write in groups of MAX_IOV.
4512    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4513    let mut last = 0;
4514
4515    macro_rules! process_pos {
4516        ($pos:expr) => {
4517            if $pos > last {
4518                iov.push(std::io::IoSlice::new(&data[last..$pos]));
4519                if iov.len() >= MAX_IOV {
4520                    write_ioslices(writer, &iov)?;
4521                    iov.clear();
4522                }
4523            }
4524            last = $pos + 1;
4525        };
4526    }
4527
4528    if is_three {
4529        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4530            process_pos!(pos);
4531        }
4532    } else {
4533        for pos in memchr::memchr2_iter(c0, c1, data) {
4534            process_pos!(pos);
4535        }
4536    }
4537    if last < data.len() {
4538        iov.push(std::io::IoSlice::new(&data[last..]));
4539    }
4540    if !iov.is_empty() {
4541        write_ioslices(writer, &iov)?;
4542    }
4543    Ok(())
4544}
4545
4546/// Delete + squeeze from mmap'd byte slice.
4547///
4548/// Builds delete and squeeze membership bitsets, then performs the combined
4549/// delete+squeeze pass into one full-size buffer and emits a single write_all.
4550pub fn delete_squeeze_mmap(
4551    delete_chars: &[u8],
4552    squeeze_chars: &[u8],
4553    data: &[u8],
4554    writer: &mut impl Write,
4555) -> io::Result<()> {
4556    let delete_set = build_member_set(delete_chars);
4557    let squeeze_set = build_member_set(squeeze_chars);
4558
4559    // Single-allocation delete+squeeze: full-size buffer, single write_all.
4560    let mut outbuf = alloc_uninit_vec(data.len());
4561    let mut last_squeezed: u16 = 256;
4562    let mut out_pos = 0;
4563
4564    for &b in data.iter() {
4565        if is_member(&delete_set, b) {
4566            continue;
4567        }
4568        if is_member(&squeeze_set, b) {
4569            if last_squeezed == b as u16 {
4570                continue;
4571            }
4572            last_squeezed = b as u16;
4573        } else {
4574            last_squeezed = 256;
4575        }
4576        unsafe {
4577            *outbuf.get_unchecked_mut(out_pos) = b;
4578        }
4579        out_pos += 1;
4580    }
4581    writer.write_all(&outbuf[..out_pos])
4582}
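
// A reference-model sketch for the combined pass above: dropping the deleted
// bytes first and then collapsing runs of squeezable bytes must match the
// single-scan implementation (deleted bytes do not break a squeeze run).
// The naive model and names are illustrative.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    /// Naive model: delete, then collapse adjacent duplicates of squeezable bytes.
    fn naive(delete: &[u8], squeeze: &[u8], data: &[u8]) -> Vec<u8> {
        let kept: Vec<u8> = data.iter().copied().filter(|b| !delete.contains(b)).collect();
        let mut out = Vec::with_capacity(kept.len());
        for &b in &kept {
            if squeeze.contains(&b) && out.last() == Some(&b) {
                continue;
            }
            out.push(b);
        }
        out
    }

    #[test]
    fn combined_pass_matches_naive_model() {
        let del: &[u8] = b"_";
        let sq: &[u8] = b" ";
        let data = b"a__  b _ _  c".repeat(40);
        let mut written: Vec<u8> = Vec::new();
        delete_squeeze_mmap(del, sq, &data, &mut written).unwrap();
        assert_eq!(written, naive(del, sq, &data));
    }
}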
4583
4584/// Squeeze from mmap'd byte slice.
4585///
4586/// Squeeze sets of 1-3 bytes dispatch to dedicated memchr/memmem fast paths.
4587/// For data >= PARALLEL_THRESHOLD: rayon parallel squeeze with boundary fixup.
4588/// Otherwise: squeeze into one full-size buffer and emit a single write_all.
4589pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4590    if squeeze_chars.len() == 1 {
4591        return squeeze_single_mmap(squeeze_chars[0], data, writer);
4592    }
4593    if squeeze_chars.len() == 2 {
4594        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
4595    }
4596    if squeeze_chars.len() == 3 {
4597        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
4598    }
4599
4600    let member = build_member_set(squeeze_chars);
4601
4602    // Parallel path: squeeze each chunk independently, then fix boundaries
4603    if data.len() >= PARALLEL_THRESHOLD {
4604        let n_threads = rayon::current_num_threads().max(1);
4605        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4606
4607        let results: Vec<Vec<u8>> = data
4608            .par_chunks(chunk_size)
4609            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
4610            .collect();
4611
4612        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
4613        // and chunk N+1 starts with same byte B, and B is in squeeze set,
4614        // skip the first byte(s) of chunk N+1 that equal B.
4615        // Collect slices for writev to minimize syscalls.
4616        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
4617        for (idx, result) in results.iter().enumerate() {
4618            if result.is_empty() {
4619                continue;
4620            }
4621            if idx > 0 {
4622                // Check boundary: does previous chunk end with same squeezable byte?
4623                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
4624                    if is_member(&member, prev_last) {
4625                        // Skip leading bytes in this chunk that equal prev_last
4626                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
4627                        if skip < result.len() {
4628                            slices.push(std::io::IoSlice::new(&result[skip..]));
4629                        }
4630                        continue;
4631                    }
4632                }
4633            }
4634            slices.push(std::io::IoSlice::new(result));
4635        }
4636        return write_ioslices(writer, &slices);
4637    }
4638
4639    // Single-allocation squeeze: full-size buffer, single write_all.
4640    let mut outbuf = alloc_uninit_vec(data.len());
4641    let len = data.len();
4642    let mut wp = 0;
4643    let mut i = 0;
4644    let mut last_squeezed: u16 = 256;
4645
4646    unsafe {
4647        let inp = data.as_ptr();
4648        let outp = outbuf.as_mut_ptr();
4649
4650        while i < len {
4651            let b = *inp.add(i);
4652            if is_member(&member, b) {
4653                if last_squeezed != b as u16 {
4654                    *outp.add(wp) = b;
4655                    wp += 1;
4656                    last_squeezed = b as u16;
4657                }
4658                i += 1;
4659                while i < len && *inp.add(i) == b {
4660                    i += 1;
4661                }
4662            } else {
4663                last_squeezed = 256;
4664                *outp.add(wp) = b;
4665                wp += 1;
4666                i += 1;
4667            }
4668        }
4669    }
4670    writer.write_all(&outbuf[..wp])
4671}
4672
4673/// Squeeze a single chunk using bitset membership. Returns squeezed output.
4674fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
4675    let len = chunk.len();
4676    let mut out = Vec::with_capacity(len);
4677    let mut last_squeezed: u16 = 256;
4678    let mut i = 0;
4679
4680    unsafe {
4681        out.set_len(len);
4682        let inp = chunk.as_ptr();
4683        let outp: *mut u8 = out.as_mut_ptr();
4684        let mut wp = 0;
4685
4686        while i < len {
4687            let b = *inp.add(i);
4688            if is_member(member, b) {
4689                if last_squeezed != b as u16 {
4690                    *outp.add(wp) = b;
4691                    wp += 1;
4692                    last_squeezed = b as u16;
4693                }
4694                i += 1;
4695                while i < len && *inp.add(i) == b {
4696                    i += 1;
4697                }
4698            } else {
4699                last_squeezed = 256;
4700                *outp.add(wp) = b;
4701                wp += 1;
4702                i += 1;
4703            }
4704        }
4705        out.set_len(wp);
4706    }
4707    out
4708}
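
// A sketch of the chunk-boundary reasoning used by the parallel squeeze paths
// in this file: when the previous squeezed chunk ends with a squeezable byte,
// the next squeezed chunk can start with at most one leftover copy of it, and
// dropping that copy makes the concatenation equal to squeezing the whole
// buffer at once. Assumes `build_member_set`/`is_member` as used in this file;
// the input and names are illustrative.
#[cfg(test)]
mod squeeze_boundary_fixup_sketch {
    use super::*;

    #[test]
    fn per_chunk_squeeze_plus_fixup_equals_whole_buffer_squeeze() {
        let member = build_member_set(b" ");
        // A long space run that straddles the split point.
        let mut data = Vec::new();
        data.extend_from_slice(b"word");
        data.extend_from_slice(&[b' '; 40]);
        data.extend_from_slice(b"tail");

        let whole = squeeze_chunk_bitset(&data, &member);

        let (left, right) = data.split_at(data.len() / 2);
        let a = squeeze_chunk_bitset(left, &member);
        let b = squeeze_chunk_bitset(right, &member);

        // Boundary fixup, as done when building the IoSlice list.
        let skip = match a.last() {
            Some(&last) if is_member(&member, last) => {
                b.iter().take_while(|&&x| x == last).count()
            }
            _ => 0,
        };
        let mut joined = a.clone();
        joined.extend_from_slice(&b[skip..]);

        assert_eq!(joined, whole);
        assert_eq!(&whole[..], b"word tail");
    }
}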
4709
4710fn squeeze_multi_mmap<const N: usize>(
4711    chars: &[u8],
4712    data: &[u8],
4713    writer: &mut impl Write,
4714) -> io::Result<()> {
4715    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
4716    if data.len() >= PARALLEL_THRESHOLD {
4717        let member = build_member_set(chars);
4718        let n_threads = rayon::current_num_threads().max(1);
4719        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4720
4721        let results: Vec<Vec<u8>> = data
4722            .par_chunks(chunk_size)
4723            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
4724            .collect();
4725
4726        // Build IoSlice list, fixing boundaries
4727        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
4728        for (idx, result) in results.iter().enumerate() {
4729            if result.is_empty() {
4730                continue;
4731            }
4732            if idx > 0 {
4733                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
4734                    if is_member(&member, prev_last) {
4735                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
4736                        if skip < result.len() {
4737                            slices.push(std::io::IoSlice::new(&result[skip..]));
4738                        }
4739                        continue;
4740                    }
4741                }
4742            }
4743            slices.push(std::io::IoSlice::new(result));
4744        }
4745        return write_ioslices(writer, &slices);
4746    }
4747
4748    // Zero-copy writev: build IoSlice entries pointing directly into
4749    // the original mmap'd data, keeping one byte per run of squeezable chars.
4750    // Each gap between squeeze points is emitted as-is, and the first byte of
4751    // each run is emitted as its own one-byte IoSlice pointing into `data`,
4752    // so no scratch byte and no copying are needed.
4754    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
4755    let mut cursor = 0;
4756    let mut last_squeezed: u16 = 256;
4757
4758    macro_rules! find_next {
4759        ($data:expr) => {
4760            if N == 2 {
4761                memchr::memchr2(chars[0], chars[1], $data)
4762            } else {
4763                memchr::memchr3(chars[0], chars[1], chars[2], $data)
4764            }
4765        };
4766    }
4767
4768    while cursor < data.len() {
4769        match find_next!(&data[cursor..]) {
4770            Some(offset) => {
4771                let pos = cursor + offset;
4772                let b = data[pos];
4773                // Emit gap before squeeze point
4774                if pos > cursor {
4775                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
4776                    last_squeezed = 256;
4777                }
4778                // Emit single byte if not duplicate
4779                if last_squeezed != b as u16 {
4780                    // Point at the byte in the original data (zero-copy)
4781                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
4782                    last_squeezed = b as u16;
4783                }
4784                // Skip the run of same byte
4785                let mut skip = pos + 1;
4786                while skip < data.len() && data[skip] == b {
4787                    skip += 1;
4788                }
4789                cursor = skip;
4790                // Flush when approaching MAX_IOV
4791                if iov.len() >= MAX_IOV {
4792                    write_ioslices(writer, &iov)?;
4793                    iov.clear();
4794                }
4795            }
4796            None => {
4797                if cursor < data.len() {
4798                    iov.push(std::io::IoSlice::new(&data[cursor..]));
4799                }
4800                break;
4801            }
4802        }
4803    }
4804    if !iov.is_empty() {
4805        write_ioslices(writer, &iov)?;
4806    }
4807    Ok(())
4808}
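
// A sanity sketch for the zero-copy multi-byte squeeze above: emitting gaps
// plus one in-place byte per run must equal a naive per-byte run collapse,
// including back-to-back runs of different squeezable bytes (where no gap is
// emitted and last_squeezed carries over). Uses N = 2; the naive model and
// names are illustrative.
#[cfg(test)]
mod squeeze_multi_zerocopy_sketch {
    use super::*;

    fn naive_squeeze(set: &[u8], data: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(data.len());
        for &b in data {
            if set.contains(&b) && out.last() == Some(&b) {
                continue;
            }
            out.push(b);
        }
        out
    }

    #[test]
    fn matches_naive_run_collapse() {
        let chars: &[u8] = b" \t";
        let data = b"a  b\t\tc  \t\td \t \te   ".repeat(30);
        let mut written: Vec<u8> = Vec::new();
        squeeze_multi_mmap::<2>(chars, &data, &mut written).unwrap();
        assert_eq!(written, naive_squeeze(chars, &data));
    }
}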
4809
4810fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4811    if data.is_empty() {
4812        return Ok(());
4813    }
4814
4815    // Quick check: no consecutive pairs means no squeezing needed
4816    let pair = [ch, ch];
4817    if memchr::memmem::find(data, &pair).is_none() {
4818        return writer.write_all(data);
4819    }
4820
4821    // Zero-copy writev approach: build IoSlice entries pointing directly into
4822    // the original mmap'd data, skipping duplicate bytes in runs.
4823    // For `tr -s ' '` on 10MB with ~5K squeeze points:
4824    //   - ~5K IoSlice entries (one per squeeze point; each covers the gap plus the kept byte)
4825    //   - ~5 writev syscalls (at 1024 entries per batch)
4826    //   - Zero data copy — kernel reads directly from mmap pages
4827    let finder = memchr::memmem::Finder::new(&pair);
4828    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
4829    let mut cursor = 0;
4830
4831    while cursor < data.len() {
4832        match finder.find(&data[cursor..]) {
4833            Some(offset) => {
4834                let pair_pos = cursor + offset;
4835                // Include everything up to and including the first byte of the pair
4836                let seg_end = pair_pos + 1;
4837                if seg_end > cursor {
4838                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
4839                }
4840                // Skip all remaining consecutive ch bytes (the run)
4841                let mut skip = seg_end;
4842                while skip < data.len() && data[skip] == ch {
4843                    skip += 1;
4844                }
4845                cursor = skip;
4846                // Flush when approaching MAX_IOV
4847                if iov.len() >= MAX_IOV {
4848                    write_ioslices(writer, &iov)?;
4849                    iov.clear();
4850                }
4851            }
4852            None => {
4853                // No more pairs — emit remainder
4854                if cursor < data.len() {
4855                    iov.push(std::io::IoSlice::new(&data[cursor..]));
4856                }
4857                break;
4858            }
4859        }
4860    }
4861
4862    if !iov.is_empty() {
4863        write_ioslices(writer, &iov)?;
4864    }
4865    Ok(())
4866}
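
// A quick sketch for the pair-finder fast path above: when no adjacent "chch"
// pair exists the input passes through untouched, and otherwise only the
// duplicate bytes inside runs are dropped. The naive model and names are
// illustrative.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    fn naive_squeeze_one(ch: u8, data: &[u8]) -> Vec<u8> {
        let mut out = Vec::with_capacity(data.len());
        for &b in data {
            if b == ch && out.last() == Some(&ch) {
                continue;
            }
            out.push(b);
        }
        out
    }

    #[test]
    fn matches_naive_single_byte_squeeze() {
        let data = b"  leading, middle   runs, trailing  ".repeat(25);
        let mut written: Vec<u8> = Vec::new();
        squeeze_single_mmap(b' ', &data, &mut written).unwrap();
        assert_eq!(written, naive_squeeze_one(b' ', &data));
    }

    #[test]
    fn passthrough_when_no_adjacent_pair() {
        let data = b"no adjacent pairs here";
        let mut written: Vec<u8> = Vec::new();
        squeeze_single_mmap(b' ', data, &mut written).unwrap();
        assert_eq!(&written[..], &data[..]);
    }
}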