Skip to main content

coreutils_rs/tr/
core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Stream buffer: 16MB — sized to accept the largest single pipe read
10/// (typically 8MB with F_SETPIPE_SZ). Each read chunk is processed and
11/// written immediately for pipelining: while ftr processes chunk N,
12/// upstream cat writes chunk N+1 to the pipe.
13const STREAM_BUF: usize = 16 * 1024 * 1024;
14
15/// Minimum data size to engage rayon parallel processing for mmap paths.
16/// AVX2 translation runs at ~10 GB/s per core. For 10MB data:
17/// - Sequential: ~1ms translate
18/// - Parallel (4 cores): ~0.25ms translate + ~0.15ms rayon overhead = ~0.4ms
19/// Net savings: ~0.6ms per translate pass. Worth it for >= 4MB files where
20/// the multi-core speedup clearly exceeds rayon spawn+join overhead.
21const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
22
23/// 256-entry lookup table for byte compaction: for each 8-bit keep mask,
24/// stores the bit positions of set bits (indices of bytes to keep).
25/// Used by compact_8bytes to replace the serial trailing_zeros loop with
26/// unconditional indexed stores, eliminating the tzcnt→blsr dependency chain.
27/// Total size: 256 * 8 = 2KB — fits entirely in L1 cache.
28#[cfg(target_arch = "x86_64")]
29static COMPACT_LUT: [[u8; 8]; 256] = {
30    let mut lut = [[0u8; 8]; 256];
31    let mut mask: u16 = 0;
32    while mask < 256 {
33        let mut idx: usize = 0;
34        let mut bit: u8 = 0;
35        while bit < 8 {
36            if (mask >> bit) & 1 != 0 {
37                lut[mask as usize][idx] = bit;
38                idx += 1;
39            }
40            bit += 1;
41        }
42        mask += 1;
43    }
44    lut
45};
46
47/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
48/// Falls back to write_all per slice for partial writes.
49#[inline]
50fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
51    if slices.is_empty() {
52        return Ok(());
53    }
54    for batch in slices.chunks(MAX_IOV) {
55        let total: usize = batch.iter().map(|s| s.len()).sum();
56        match writer.write_vectored(batch) {
57            Ok(n) if n >= total => continue,
58            Ok(mut written) => {
59                // Partial write: fall back to write_all per remaining slice
60                for slice in batch {
61                    let slen = slice.len();
62                    if written >= slen {
63                        written -= slen;
64                        continue;
65                    }
66                    if written > 0 {
67                        writer.write_all(&slice[written..])?;
68                        written = 0;
69                    } else {
70                        writer.write_all(slice)?;
71                    }
72                }
73            }
74            Err(e) => return Err(e),
75        }
76    }
77    Ok(())
78}
79
80/// Allocate a Vec<u8> of given length without zero-initialization.
81/// Uses MADV_HUGEPAGE on Linux for buffers >= 2MB to reduce TLB misses.
82/// SAFETY: Caller must write all bytes before reading them.
83#[inline]
84#[allow(clippy::uninit_vec)]
85fn alloc_uninit_vec(len: usize) -> Vec<u8> {
86    let mut v = Vec::with_capacity(len);
87    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
88    unsafe {
89        v.set_len(len);
90    }
91    #[cfg(target_os = "linux")]
92    if len >= 2 * 1024 * 1024 {
93        unsafe {
94            libc::madvise(
95                v.as_mut_ptr() as *mut libc::c_void,
96                len,
97                libc::MADV_HUGEPAGE,
98            );
99        }
100    }
101    v
102}
103
104/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
105#[inline]
106fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
107    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
108    let last = set2.last().copied();
109    for (i, &from) in set1.iter().enumerate() {
110        table[from as usize] = if i < set2.len() {
111            set2[i]
112        } else {
113            last.unwrap_or(from)
114        };
115    }
116    table
117}
118
119/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
120#[inline]
121fn build_member_set(chars: &[u8]) -> [u8; 32] {
122    let mut set = [0u8; 32];
123    for &ch in chars {
124        set[ch as usize >> 3] |= 1 << (ch & 7);
125    }
126    set
127}
128
129#[inline(always)]
130fn is_member(set: &[u8; 32], ch: u8) -> bool {
131    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
132}
133
134/// Cached SIMD capability level for x86_64.
135/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
136#[cfg(target_arch = "x86_64")]
137static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
138
139#[cfg(target_arch = "x86_64")]
140#[inline(always)]
141fn get_simd_level() -> u8 {
142    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
143    if level != 0 {
144        return level;
145    }
146    let detected = if is_x86_feature_detected!("avx2") {
147        3
148    } else if is_x86_feature_detected!("ssse3") {
149        2
150    } else {
151        1
152    };
153    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
154    detected
155}
156
157/// Count how many entries in the translate table are non-identity.
158#[cfg(target_arch = "x86_64")]
159#[inline]
160fn count_non_identity(table: &[u8; 256]) -> usize {
161    table
162        .iter()
163        .enumerate()
164        .filter(|&(i, &v)| v != i as u8)
165        .count()
166}
167
168/// Translate bytes in-place using a 256-byte lookup table.
169/// For sparse translations (few bytes change), uses SIMD skip-ahead:
170/// compare 32 bytes at a time against identity, skip unchanged chunks.
171/// For dense translations, uses full SIMD nibble decomposition.
172/// Falls back to 8x-unrolled scalar on non-x86_64 platforms.
173#[inline(always)]
174fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
175    #[cfg(target_arch = "x86_64")]
176    {
177        let level = get_simd_level();
178        if level >= 3 {
179            // For sparse translations (<=16 non-identity entries), the skip-ahead
180            // approach is faster: load 32 bytes, do a full nibble lookup, compare
181            // against input, skip store if identical. This avoids writing to pages
182            // that don't change (important for MAP_PRIVATE COW mmap).
183            let non_id = count_non_identity(table);
184            if non_id > 0 && non_id <= 16 {
185                unsafe { translate_inplace_avx2_sparse(data, table) };
186                return;
187            }
188            unsafe { translate_inplace_avx2_table(data, table) };
189            return;
190        }
191        if level >= 2 {
192            unsafe { translate_inplace_ssse3_table(data, table) };
193            return;
194        }
195    }
196    translate_inplace_scalar(data, table);
197}
198
199/// Sparse AVX2 translate: skip unchanged 32-byte chunks.
200/// For each chunk: perform full nibble lookup, compare result vs input.
201/// If identical (no bytes changed), skip the store entirely.
202/// This reduces memory bandwidth and avoids COW page faults for
203/// MAP_PRIVATE mmaps when most bytes are unchanged.
204#[cfg(target_arch = "x86_64")]
205#[target_feature(enable = "avx2")]
206unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
207    use std::arch::x86_64::*;
208
209    unsafe {
210        let len = data.len();
211        let ptr = data.as_mut_ptr();
212
213        // Pre-build 16 lookup vectors (same as full nibble decomposition)
214        let mut lut = [_mm256_setzero_si256(); 16];
215        for h in 0u8..16 {
216            let base = (h as usize) * 16;
217            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
218            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
219            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
220        }
221
222        let lo_mask = _mm256_set1_epi8(0x0F);
223
224        let mut i = 0;
225        while i + 32 <= len {
226            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
227            let lo_nibble = _mm256_and_si256(input, lo_mask);
228            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
229
230            let mut result = _mm256_setzero_si256();
231            macro_rules! do_nibble {
232                ($h:expr) => {
233                    let h_val = _mm256_set1_epi8($h as i8);
234                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
235                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
236                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
237                };
238            }
239            do_nibble!(0);
240            do_nibble!(1);
241            do_nibble!(2);
242            do_nibble!(3);
243            do_nibble!(4);
244            do_nibble!(5);
245            do_nibble!(6);
246            do_nibble!(7);
247            do_nibble!(8);
248            do_nibble!(9);
249            do_nibble!(10);
250            do_nibble!(11);
251            do_nibble!(12);
252            do_nibble!(13);
253            do_nibble!(14);
254            do_nibble!(15);
255
256            // Only store if result differs from input (skip unchanged chunks)
257            let diff = _mm256_xor_si256(input, result);
258            if _mm256_testz_si256(diff, diff) == 0 {
259                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
260            }
261            i += 32;
262        }
263
264        // Scalar tail
265        while i < len {
266            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
267            i += 1;
268        }
269    }
270}
271
272/// Scalar fallback: 8x-unrolled table lookup.
273#[cfg(not(target_arch = "aarch64"))]
274#[inline(always)]
275fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
276    let len = data.len();
277    let ptr = data.as_mut_ptr();
278    let mut i = 0;
279    unsafe {
280        while i + 8 <= len {
281            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
282            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
283            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
284            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
285            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
286            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
287            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
288            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
289            i += 8;
290        }
291        while i < len {
292            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
293            i += 1;
294        }
295    }
296}
297
298/// ARM64 NEON table lookup using nibble decomposition (same algorithm as x86 pshufb).
299/// Uses vqtbl1q_u8 for 16-byte table lookups, processes 16 bytes per iteration.
300#[cfg(target_arch = "aarch64")]
301#[inline(always)]
302fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
303    unsafe { translate_inplace_neon_table(data, table) };
304}
305
306#[cfg(target_arch = "aarch64")]
307#[target_feature(enable = "neon")]
308unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
309    use std::arch::aarch64::*;
310
311    unsafe {
312        let len = data.len();
313        let ptr = data.as_mut_ptr();
314
315        // Pre-build 16 NEON lookup vectors (one per high nibble)
316        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
317        for h in 0u8..16 {
318            let base = (h as usize) * 16;
319            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
320        }
321
322        let lo_mask = vdupq_n_u8(0x0F);
323        let mut i = 0;
324
325        while i + 16 <= len {
326            let input = vld1q_u8(ptr.add(i));
327            let lo_nibble = vandq_u8(input, lo_mask);
328            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
329
330            let mut result = vdupq_n_u8(0);
331            macro_rules! do_nibble {
332                ($h:expr) => {
333                    let h_val = vdupq_n_u8($h);
334                    let mask = vceqq_u8(hi_nibble, h_val);
335                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
336                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
337                };
338            }
339            do_nibble!(0);
340            do_nibble!(1);
341            do_nibble!(2);
342            do_nibble!(3);
343            do_nibble!(4);
344            do_nibble!(5);
345            do_nibble!(6);
346            do_nibble!(7);
347            do_nibble!(8);
348            do_nibble!(9);
349            do_nibble!(10);
350            do_nibble!(11);
351            do_nibble!(12);
352            do_nibble!(13);
353            do_nibble!(14);
354            do_nibble!(15);
355
356            vst1q_u8(ptr.add(i), result);
357            i += 16;
358        }
359
360        // Scalar tail
361        while i < len {
362            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
363            i += 1;
364        }
365    }
366}
367
368// ============================================================================
369// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
370// ============================================================================
371//
372// For an arbitrary 256-byte lookup table, we decompose each byte into
373// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
374// SIMD vectors, one for each high nibble value h (0..15), containing
375// the 16 table entries table[h*16+0..h*16+15]. Then for each input
376// vector we:
377//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
378//   2. Extract high nibble (shift right 4) -> used to select which table
379//   3. For each of the 16 high nibble values, create a mask where
380//      the high nibble equals that value, pshufb the corresponding
381//      table, and accumulate results
382//
383// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
384// With instruction-level parallelism, this achieves much higher throughput
385// than scalar table lookups which have serial data dependencies.
386
387#[cfg(target_arch = "x86_64")]
388#[target_feature(enable = "avx2")]
389unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
390    use std::arch::x86_64::*;
391
392    unsafe {
393        let len = data.len();
394        let ptr = data.as_mut_ptr();
395
396        // Pre-build 16 lookup vectors, one per high nibble value.
397        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
398        // 16 table entries for pshufb indexing by low nibble.
399        let mut lut = [_mm256_setzero_si256(); 16];
400        for h in 0u8..16 {
401            let base = (h as usize) * 16;
402            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
403            // Broadcast the 128-bit row to both lanes of the 256-bit vector
404            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
405            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
406        }
407
408        let lo_mask = _mm256_set1_epi8(0x0F);
409
410        let mut i = 0;
411
412        // 2x unrolled: process 64 bytes (2x32) per iteration for better ILP.
413        // The CPU can overlap load/compute of the second vector while the first
414        // is in the nibble decomposition pipeline.
415        while i + 64 <= len {
416            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
417            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
418
419            let lo0 = _mm256_and_si256(input0, lo_mask);
420            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
421            let lo1 = _mm256_and_si256(input1, lo_mask);
422            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
423
424            let mut r0 = _mm256_setzero_si256();
425            let mut r1 = _mm256_setzero_si256();
426
427            macro_rules! do_nibble2 {
428                ($h:expr) => {
429                    let h_val = _mm256_set1_epi8($h as i8);
430                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
431                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
432                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
433                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
434                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
435                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
436                };
437            }
438            do_nibble2!(0);
439            do_nibble2!(1);
440            do_nibble2!(2);
441            do_nibble2!(3);
442            do_nibble2!(4);
443            do_nibble2!(5);
444            do_nibble2!(6);
445            do_nibble2!(7);
446            do_nibble2!(8);
447            do_nibble2!(9);
448            do_nibble2!(10);
449            do_nibble2!(11);
450            do_nibble2!(12);
451            do_nibble2!(13);
452            do_nibble2!(14);
453            do_nibble2!(15);
454
455            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
456            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
457            i += 64;
458        }
459
460        // Remaining 32-byte chunk
461        if i + 32 <= len {
462            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
463            let lo_nibble = _mm256_and_si256(input, lo_mask);
464            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
465
466            let mut result = _mm256_setzero_si256();
467
468            macro_rules! do_nibble {
469                ($h:expr) => {
470                    let h_val = _mm256_set1_epi8($h as i8);
471                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
472                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
473                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
474                };
475            }
476            do_nibble!(0);
477            do_nibble!(1);
478            do_nibble!(2);
479            do_nibble!(3);
480            do_nibble!(4);
481            do_nibble!(5);
482            do_nibble!(6);
483            do_nibble!(7);
484            do_nibble!(8);
485            do_nibble!(9);
486            do_nibble!(10);
487            do_nibble!(11);
488            do_nibble!(12);
489            do_nibble!(13);
490            do_nibble!(14);
491            do_nibble!(15);
492
493            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
494            i += 32;
495        }
496
497        // SSE/SSSE3 tail for remaining 16-byte chunk
498        if i + 16 <= len {
499            let lo_mask128 = _mm_set1_epi8(0x0F);
500
501            let mut lut128 = [_mm_setzero_si128(); 16];
502            for h in 0u8..16 {
503                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
504            }
505
506            let input = _mm_loadu_si128(ptr.add(i) as *const _);
507            let lo_nib = _mm_and_si128(input, lo_mask128);
508            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
509
510            let mut res = _mm_setzero_si128();
511            macro_rules! do_nibble128 {
512                ($h:expr) => {
513                    let h_val = _mm_set1_epi8($h as i8);
514                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
515                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
516                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
517                };
518            }
519            do_nibble128!(0);
520            do_nibble128!(1);
521            do_nibble128!(2);
522            do_nibble128!(3);
523            do_nibble128!(4);
524            do_nibble128!(5);
525            do_nibble128!(6);
526            do_nibble128!(7);
527            do_nibble128!(8);
528            do_nibble128!(9);
529            do_nibble128!(10);
530            do_nibble128!(11);
531            do_nibble128!(12);
532            do_nibble128!(13);
533            do_nibble128!(14);
534            do_nibble128!(15);
535
536            _mm_storeu_si128(ptr.add(i) as *mut _, res);
537            i += 16;
538        }
539
540        // Scalar tail
541        while i < len {
542            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
543            i += 1;
544        }
545    }
546}
547
548#[cfg(target_arch = "x86_64")]
549#[target_feature(enable = "ssse3")]
550unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
551    use std::arch::x86_64::*;
552
553    unsafe {
554        let len = data.len();
555        let ptr = data.as_mut_ptr();
556
557        // Pre-build 16 lookup vectors for pshufb
558        let mut lut = [_mm_setzero_si128(); 16];
559        for h in 0u8..16 {
560            let base = (h as usize) * 16;
561            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
562            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
563        }
564
565        let lo_mask = _mm_set1_epi8(0x0F);
566
567        let mut i = 0;
568        while i + 16 <= len {
569            let input = _mm_loadu_si128(ptr.add(i) as *const _);
570            let lo_nibble = _mm_and_si128(input, lo_mask);
571            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
572
573            let mut result = _mm_setzero_si128();
574
575            macro_rules! do_nibble {
576                ($h:expr) => {
577                    let h_val = _mm_set1_epi8($h as i8);
578                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
579                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
580                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
581                };
582            }
583            do_nibble!(0);
584            do_nibble!(1);
585            do_nibble!(2);
586            do_nibble!(3);
587            do_nibble!(4);
588            do_nibble!(5);
589            do_nibble!(6);
590            do_nibble!(7);
591            do_nibble!(8);
592            do_nibble!(9);
593            do_nibble!(10);
594            do_nibble!(11);
595            do_nibble!(12);
596            do_nibble!(13);
597            do_nibble!(14);
598            do_nibble!(15);
599
600            _mm_storeu_si128(ptr.add(i) as *mut _, result);
601            i += 16;
602        }
603
604        // Scalar tail
605        while i < len {
606            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
607            i += 1;
608        }
609    }
610}
611
612/// Translate bytes from source to destination using a 256-byte lookup table.
613/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
614#[inline(always)]
615fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
616    debug_assert!(dst.len() >= src.len());
617    #[cfg(target_arch = "x86_64")]
618    {
619        let level = get_simd_level();
620        if level >= 3 {
621            // Use nontemporal stores when dst is 32-byte aligned (large Vec allocations)
622            if dst.as_ptr() as usize & 31 == 0 {
623                unsafe { translate_to_avx2_table_nt(src, dst, table) };
624            } else {
625                unsafe { translate_to_avx2_table(src, dst, table) };
626            }
627            return;
628        }
629        if level >= 2 {
630            unsafe { translate_to_ssse3_table(src, dst, table) };
631            return;
632        }
633    }
634    translate_to_scalar(src, dst, table);
635}
636
637/// Scalar fallback for translate_to.
638#[cfg(not(target_arch = "aarch64"))]
639#[inline(always)]
640fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
641    unsafe {
642        let sp = src.as_ptr();
643        let dp = dst.as_mut_ptr();
644        let len = src.len();
645        let mut i = 0;
646        while i + 8 <= len {
647            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
648            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
649            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
650            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
651            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
652            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
653            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
654            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
655            i += 8;
656        }
657        while i < len {
658            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
659            i += 1;
660        }
661    }
662}
663
664/// ARM64 NEON table-lookup translate_to using nibble decomposition.
665#[cfg(target_arch = "aarch64")]
666#[inline(always)]
667fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
668    unsafe { translate_to_neon_table(src, dst, table) };
669}
670
671#[cfg(target_arch = "aarch64")]
672#[target_feature(enable = "neon")]
673unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
674    use std::arch::aarch64::*;
675
676    unsafe {
677        let len = src.len();
678        let sp = src.as_ptr();
679        let dp = dst.as_mut_ptr();
680
681        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
682        for h in 0u8..16 {
683            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
684        }
685
686        let lo_mask = vdupq_n_u8(0x0F);
687        let mut i = 0;
688
689        while i + 16 <= len {
690            let input = vld1q_u8(sp.add(i));
691            let lo_nibble = vandq_u8(input, lo_mask);
692            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
693
694            let mut result = vdupq_n_u8(0);
695            macro_rules! do_nibble {
696                ($h:expr) => {
697                    let h_val = vdupq_n_u8($h);
698                    let mask = vceqq_u8(hi_nibble, h_val);
699                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
700                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
701                };
702            }
703            do_nibble!(0);
704            do_nibble!(1);
705            do_nibble!(2);
706            do_nibble!(3);
707            do_nibble!(4);
708            do_nibble!(5);
709            do_nibble!(6);
710            do_nibble!(7);
711            do_nibble!(8);
712            do_nibble!(9);
713            do_nibble!(10);
714            do_nibble!(11);
715            do_nibble!(12);
716            do_nibble!(13);
717            do_nibble!(14);
718            do_nibble!(15);
719
720            vst1q_u8(dp.add(i), result);
721            i += 16;
722        }
723
724        while i < len {
725            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
726            i += 1;
727        }
728    }
729}
730
731#[cfg(target_arch = "x86_64")]
732#[target_feature(enable = "avx2")]
733unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
734    use std::arch::x86_64::*;
735
736    unsafe {
737        let len = src.len();
738        let sp = src.as_ptr();
739        let dp = dst.as_mut_ptr();
740
741        // Pre-build 16 lookup vectors
742        let mut lut = [_mm256_setzero_si256(); 16];
743        for h in 0u8..16 {
744            let base = (h as usize) * 16;
745            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
746            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
747            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
748        }
749
750        let lo_mask = _mm256_set1_epi8(0x0F);
751
752        let mut i = 0;
753
754        // 2x unrolled: process 64 bytes per iteration for better ILP
755        while i + 64 <= len {
756            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
757            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
758
759            let lo0 = _mm256_and_si256(input0, lo_mask);
760            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
761            let lo1 = _mm256_and_si256(input1, lo_mask);
762            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
763
764            let mut r0 = _mm256_setzero_si256();
765            let mut r1 = _mm256_setzero_si256();
766
767            macro_rules! do_nibble2 {
768                ($h:expr) => {
769                    let h_val = _mm256_set1_epi8($h as i8);
770                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
771                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
772                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
773                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
774                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
775                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
776                };
777            }
778            do_nibble2!(0);
779            do_nibble2!(1);
780            do_nibble2!(2);
781            do_nibble2!(3);
782            do_nibble2!(4);
783            do_nibble2!(5);
784            do_nibble2!(6);
785            do_nibble2!(7);
786            do_nibble2!(8);
787            do_nibble2!(9);
788            do_nibble2!(10);
789            do_nibble2!(11);
790            do_nibble2!(12);
791            do_nibble2!(13);
792            do_nibble2!(14);
793            do_nibble2!(15);
794
795            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
796            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
797            i += 64;
798        }
799
800        // Remaining 32-byte chunk
801        if i + 32 <= len {
802            let input = _mm256_loadu_si256(sp.add(i) as *const _);
803            let lo_nibble = _mm256_and_si256(input, lo_mask);
804            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
805
806            let mut result = _mm256_setzero_si256();
807
808            macro_rules! do_nibble {
809                ($h:expr) => {
810                    let h_val = _mm256_set1_epi8($h as i8);
811                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
812                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
813                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
814                };
815            }
816            do_nibble!(0);
817            do_nibble!(1);
818            do_nibble!(2);
819            do_nibble!(3);
820            do_nibble!(4);
821            do_nibble!(5);
822            do_nibble!(6);
823            do_nibble!(7);
824            do_nibble!(8);
825            do_nibble!(9);
826            do_nibble!(10);
827            do_nibble!(11);
828            do_nibble!(12);
829            do_nibble!(13);
830            do_nibble!(14);
831            do_nibble!(15);
832
833            _mm256_storeu_si256(dp.add(i) as *mut _, result);
834            i += 32;
835        }
836
837        // SSSE3 tail for remaining 16-byte chunk
838        if i + 16 <= len {
839            let lo_mask128 = _mm_set1_epi8(0x0F);
840            let mut lut128 = [_mm_setzero_si128(); 16];
841            for h in 0u8..16 {
842                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
843            }
844
845            let input = _mm_loadu_si128(sp.add(i) as *const _);
846            let lo_nib = _mm_and_si128(input, lo_mask128);
847            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
848
849            let mut res = _mm_setzero_si128();
850            macro_rules! do_nibble128 {
851                ($h:expr) => {
852                    let h_val = _mm_set1_epi8($h as i8);
853                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
854                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
855                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
856                };
857            }
858            do_nibble128!(0);
859            do_nibble128!(1);
860            do_nibble128!(2);
861            do_nibble128!(3);
862            do_nibble128!(4);
863            do_nibble128!(5);
864            do_nibble128!(6);
865            do_nibble128!(7);
866            do_nibble128!(8);
867            do_nibble128!(9);
868            do_nibble128!(10);
869            do_nibble128!(11);
870            do_nibble128!(12);
871            do_nibble128!(13);
872            do_nibble128!(14);
873            do_nibble128!(15);
874
875            _mm_storeu_si128(dp.add(i) as *mut _, res);
876            i += 16;
877        }
878
879        // Scalar tail
880        while i < len {
881            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
882            i += 1;
883        }
884    }
885}
886
887/// Nontemporal variant of translate_to_avx2_table: uses _mm256_stream_si256 for stores.
888/// Avoids RFO cache traffic for the destination buffer in streaming translate operations.
889#[cfg(target_arch = "x86_64")]
890#[target_feature(enable = "avx2")]
891unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
892    use std::arch::x86_64::*;
893
894    unsafe {
895        let len = src.len();
896        let sp = src.as_ptr();
897        let dp = dst.as_mut_ptr();
898
899        // Pre-build 16 lookup vectors
900        let mut lut = [_mm256_setzero_si256(); 16];
901        for h in 0u8..16 {
902            let base = (h as usize) * 16;
903            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
904            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
905            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
906        }
907
908        let lo_mask = _mm256_set1_epi8(0x0F);
909        let mut i = 0;
910
911        // 2x unrolled with nontemporal stores
912        while i + 64 <= len {
913            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
914            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
915
916            let lo0 = _mm256_and_si256(input0, lo_mask);
917            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
918            let lo1 = _mm256_and_si256(input1, lo_mask);
919            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
920
921            let mut r0 = _mm256_setzero_si256();
922            let mut r1 = _mm256_setzero_si256();
923
924            macro_rules! do_nibble2 {
925                ($h:expr) => {
926                    let h_val = _mm256_set1_epi8($h as i8);
927                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
928                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
929                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
930                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
931                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
932                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
933                };
934            }
935            do_nibble2!(0);
936            do_nibble2!(1);
937            do_nibble2!(2);
938            do_nibble2!(3);
939            do_nibble2!(4);
940            do_nibble2!(5);
941            do_nibble2!(6);
942            do_nibble2!(7);
943            do_nibble2!(8);
944            do_nibble2!(9);
945            do_nibble2!(10);
946            do_nibble2!(11);
947            do_nibble2!(12);
948            do_nibble2!(13);
949            do_nibble2!(14);
950            do_nibble2!(15);
951
952            _mm256_stream_si256(dp.add(i) as *mut _, r0);
953            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
954            i += 64;
955        }
956
957        // Remaining 32-byte chunk
958        if i + 32 <= len {
959            let input = _mm256_loadu_si256(sp.add(i) as *const _);
960            let lo_nibble = _mm256_and_si256(input, lo_mask);
961            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
962
963            let mut result = _mm256_setzero_si256();
964            macro_rules! do_nibble {
965                ($h:expr) => {
966                    let h_val = _mm256_set1_epi8($h as i8);
967                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
968                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
969                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
970                };
971            }
972            do_nibble!(0);
973            do_nibble!(1);
974            do_nibble!(2);
975            do_nibble!(3);
976            do_nibble!(4);
977            do_nibble!(5);
978            do_nibble!(6);
979            do_nibble!(7);
980            do_nibble!(8);
981            do_nibble!(9);
982            do_nibble!(10);
983            do_nibble!(11);
984            do_nibble!(12);
985            do_nibble!(13);
986            do_nibble!(14);
987            do_nibble!(15);
988
989            _mm256_stream_si256(dp.add(i) as *mut _, result);
990            i += 32;
991        }
992
993        // SSSE3 tail for remaining 16-byte chunk (regular store)
994        if i + 16 <= len {
995            let lo_mask128 = _mm_set1_epi8(0x0F);
996            let mut lut128 = [_mm_setzero_si128(); 16];
997            for h in 0u8..16 {
998                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
999            }
1000
1001            let input = _mm_loadu_si128(sp.add(i) as *const _);
1002            let lo_nib = _mm_and_si128(input, lo_mask128);
1003            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1004
1005            let mut res = _mm_setzero_si128();
1006            macro_rules! do_nibble128 {
1007                ($h:expr) => {
1008                    let h_val = _mm_set1_epi8($h as i8);
1009                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1010                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1011                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1012                };
1013            }
1014            do_nibble128!(0);
1015            do_nibble128!(1);
1016            do_nibble128!(2);
1017            do_nibble128!(3);
1018            do_nibble128!(4);
1019            do_nibble128!(5);
1020            do_nibble128!(6);
1021            do_nibble128!(7);
1022            do_nibble128!(8);
1023            do_nibble128!(9);
1024            do_nibble128!(10);
1025            do_nibble128!(11);
1026            do_nibble128!(12);
1027            do_nibble128!(13);
1028            do_nibble128!(14);
1029            do_nibble128!(15);
1030
1031            _mm_storeu_si128(dp.add(i) as *mut _, res);
1032            i += 16;
1033        }
1034
1035        // Scalar tail
1036        while i < len {
1037            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1038            i += 1;
1039        }
1040
1041        // Fence: ensure nontemporal stores are visible before write() syscall
1042        _mm_sfence();
1043    }
1044}
1045
1046#[cfg(target_arch = "x86_64")]
1047#[target_feature(enable = "ssse3")]
1048unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1049    use std::arch::x86_64::*;
1050
1051    unsafe {
1052        let len = src.len();
1053        let sp = src.as_ptr();
1054        let dp = dst.as_mut_ptr();
1055
1056        let mut lut = [_mm_setzero_si128(); 16];
1057        for h in 0u8..16 {
1058            let base = (h as usize) * 16;
1059            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1060            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1061        }
1062
1063        let lo_mask = _mm_set1_epi8(0x0F);
1064
1065        let mut i = 0;
1066        while i + 16 <= len {
1067            let input = _mm_loadu_si128(sp.add(i) as *const _);
1068            let lo_nibble = _mm_and_si128(input, lo_mask);
1069            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1070
1071            let mut result = _mm_setzero_si128();
1072
1073            macro_rules! do_nibble {
1074                ($h:expr) => {
1075                    let h_val = _mm_set1_epi8($h as i8);
1076                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1077                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1078                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1079                };
1080            }
1081            do_nibble!(0);
1082            do_nibble!(1);
1083            do_nibble!(2);
1084            do_nibble!(3);
1085            do_nibble!(4);
1086            do_nibble!(5);
1087            do_nibble!(6);
1088            do_nibble!(7);
1089            do_nibble!(8);
1090            do_nibble!(9);
1091            do_nibble!(10);
1092            do_nibble!(11);
1093            do_nibble!(12);
1094            do_nibble!(13);
1095            do_nibble!(14);
1096            do_nibble!(15);
1097
1098            _mm_storeu_si128(dp.add(i) as *mut _, result);
1099            i += 16;
1100        }
1101
1102        // Scalar tail
1103        while i < len {
1104            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1105            i += 1;
1106        }
1107    }
1108}
1109
1110// ============================================================================
1111// SIMD range translation (x86_64)
1112// ============================================================================
1113
1114/// Detect if the translate table is a single contiguous range with constant offset.
1115/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
1116/// table[i] = i + offset for all i in [lo, hi].
1117#[inline]
1118fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1119    let mut lo: Option<u8> = None;
1120    let mut hi = 0u8;
1121    let mut offset = 0i16;
1122
1123    for i in 0..256 {
1124        if table[i] != i as u8 {
1125            let diff = table[i] as i16 - i as i16;
1126            match lo {
1127                None => {
1128                    lo = Some(i as u8);
1129                    hi = i as u8;
1130                    offset = diff;
1131                }
1132                Some(_) => {
1133                    if diff != offset || i as u8 != hi.wrapping_add(1) {
1134                        return None;
1135                    }
1136                    hi = i as u8;
1137                }
1138            }
1139        }
1140    }
1141
1142    lo.map(|l| (l, hi, offset as i8))
1143}
1144
1145/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
1146/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
1147/// a range maps to one replacement character.
1148/// Returns Some((lo, hi, replacement)) if the pattern matches.
1149#[inline]
1150fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1151    let mut lo: Option<u8> = None;
1152    let mut hi = 0u8;
1153    let mut replacement = 0u8;
1154
1155    for i in 0..256 {
1156        if table[i] != i as u8 {
1157            match lo {
1158                None => {
1159                    lo = Some(i as u8);
1160                    hi = i as u8;
1161                    replacement = table[i];
1162                }
1163                Some(_) => {
1164                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1165                        return None;
1166                    }
1167                    hi = i as u8;
1168                }
1169            }
1170        }
1171    }
1172
1173    lo.map(|l| (l, hi, replacement))
1174}
1175
1176/// SIMD-accelerated range-to-constant translation.
1177/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
1178/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
1179/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
1180#[cfg(target_arch = "x86_64")]
1181fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1182    if get_simd_level() >= 3 {
1183        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1184    } else {
1185        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1186    }
1187}
1188
1189#[cfg(target_arch = "x86_64")]
1190#[target_feature(enable = "avx2")]
1191unsafe fn translate_range_to_constant_avx2_inplace(
1192    data: &mut [u8],
1193    lo: u8,
1194    hi: u8,
1195    replacement: u8,
1196) {
1197    use std::arch::x86_64::*;
1198
1199    unsafe {
1200        let range = hi - lo;
1201        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1202        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1203        let repl_v = _mm256_set1_epi8(replacement as i8);
1204        let zero = _mm256_setzero_si256();
1205
1206        let len = data.len();
1207        let ptr = data.as_mut_ptr();
1208        let mut i = 0;
1209
1210        // 2x unrolled: process 64 bytes per iteration for better ILP
1211        while i + 64 <= len {
1212            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1213            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1214            let bi0 = _mm256_add_epi8(in0, bias_v);
1215            let bi1 = _mm256_add_epi8(in1, bias_v);
1216            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1217            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1218            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1219            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1220            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1221            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1222            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1223            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1224            i += 64;
1225        }
1226
1227        // Remaining 32-byte chunk
1228        if i + 32 <= len {
1229            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1230            let biased = _mm256_add_epi8(input, bias_v);
1231            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1232            let in_range = _mm256_cmpeq_epi8(gt, zero);
1233            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1234            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1235            i += 32;
1236        }
1237
1238        if i + 16 <= len {
1239            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1240            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1241            let repl_v128 = _mm_set1_epi8(replacement as i8);
1242            let zero128 = _mm_setzero_si128();
1243
1244            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1245            let biased = _mm_add_epi8(input, bias_v128);
1246            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1247            let in_range = _mm_cmpeq_epi8(gt, zero128);
1248            let result = _mm_blendv_epi8(input, repl_v128, in_range);
1249            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1250            i += 16;
1251        }
1252
1253        while i < len {
1254            let b = *ptr.add(i);
1255            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1256            i += 1;
1257        }
1258    }
1259}
1260
1261#[cfg(target_arch = "x86_64")]
1262#[target_feature(enable = "sse2")]
1263unsafe fn translate_range_to_constant_sse2_inplace(
1264    data: &mut [u8],
1265    lo: u8,
1266    hi: u8,
1267    replacement: u8,
1268) {
1269    use std::arch::x86_64::*;
1270
1271    unsafe {
1272        let range = hi - lo;
1273        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1274        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1275        let repl_v = _mm_set1_epi8(replacement as i8);
1276        let zero = _mm_setzero_si128();
1277
1278        let len = data.len();
1279        let ptr = data.as_mut_ptr();
1280        let mut i = 0;
1281
1282        while i + 16 <= len {
1283            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1284            let biased = _mm_add_epi8(input, bias_v);
1285            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1286            // in_range mask: 0xFF where in range, 0x00 where not
1287            let in_range = _mm_cmpeq_epi8(gt, zero);
1288            // SSE2 blendv: (repl & mask) | (input & ~mask)
1289            let result = _mm_or_si128(
1290                _mm_and_si128(in_range, repl_v),
1291                _mm_andnot_si128(in_range, input),
1292            );
1293            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1294            i += 16;
1295        }
1296
1297        while i < len {
1298            let b = *ptr.add(i);
1299            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1300            i += 1;
1301        }
1302    }
1303}
1304
1305#[cfg(target_arch = "aarch64")]
1306fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1307    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1308}
1309
1310#[cfg(target_arch = "aarch64")]
1311#[target_feature(enable = "neon")]
1312unsafe fn translate_range_to_constant_neon_inplace(
1313    data: &mut [u8],
1314    lo: u8,
1315    hi: u8,
1316    replacement: u8,
1317) {
1318    use std::arch::aarch64::*;
1319
1320    unsafe {
1321        let len = data.len();
1322        let ptr = data.as_mut_ptr();
1323        let lo_v = vdupq_n_u8(lo);
1324        let hi_v = vdupq_n_u8(hi);
1325        let repl_v = vdupq_n_u8(replacement);
1326        let mut i = 0;
1327
1328        while i + 32 <= len {
1329            let in0 = vld1q_u8(ptr.add(i));
1330            let in1 = vld1q_u8(ptr.add(i + 16));
1331            let ge0 = vcgeq_u8(in0, lo_v);
1332            let le0 = vcleq_u8(in0, hi_v);
1333            let mask0 = vandq_u8(ge0, le0);
1334            let ge1 = vcgeq_u8(in1, lo_v);
1335            let le1 = vcleq_u8(in1, hi_v);
1336            let mask1 = vandq_u8(ge1, le1);
1337            // bsl: select repl where mask, keep input where not
1338            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1339            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1340            i += 32;
1341        }
1342
1343        if i + 16 <= len {
1344            let input = vld1q_u8(ptr.add(i));
1345            let ge = vcgeq_u8(input, lo_v);
1346            let le = vcleq_u8(input, hi_v);
1347            let mask = vandq_u8(ge, le);
1348            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1349            i += 16;
1350        }
1351
1352        while i < len {
1353            let b = *ptr.add(i);
1354            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1355            i += 1;
1356        }
1357    }
1358}
1359
1360#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1361fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1362    for b in data.iter_mut() {
1363        if *b >= lo && *b <= hi {
1364            *b = replacement;
1365        }
1366    }
1367}
1368
1369/// SIMD range-to-constant translation from src to dst (no intermediate copy needed).
1370/// Reads from src, writes translated result to dst in a single pass.
1371#[cfg(target_arch = "x86_64")]
1372fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1373    if get_simd_level() >= 3 {
1374        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1375    } else {
1376        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1377    }
1378}
1379
1380#[cfg(target_arch = "aarch64")]
1381fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1382    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1383}
1384
1385#[cfg(target_arch = "aarch64")]
1386#[target_feature(enable = "neon")]
1387unsafe fn translate_range_to_constant_neon(
1388    src: &[u8],
1389    dst: &mut [u8],
1390    lo: u8,
1391    hi: u8,
1392    replacement: u8,
1393) {
1394    use std::arch::aarch64::*;
1395
1396    unsafe {
1397        let len = src.len();
1398        let sp = src.as_ptr();
1399        let dp = dst.as_mut_ptr();
1400        let lo_v = vdupq_n_u8(lo);
1401        let hi_v = vdupq_n_u8(hi);
1402        let repl_v = vdupq_n_u8(replacement);
1403        let mut i = 0;
1404
1405        while i + 32 <= len {
1406            let in0 = vld1q_u8(sp.add(i));
1407            let in1 = vld1q_u8(sp.add(i + 16));
1408            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1409            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1410            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1411            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1412            i += 32;
1413        }
1414
1415        if i + 16 <= len {
1416            let input = vld1q_u8(sp.add(i));
1417            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1418            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1419            i += 16;
1420        }
1421
1422        while i < len {
1423            let b = *sp.add(i);
1424            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1425            i += 1;
1426        }
1427    }
1428}
1429
1430#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1431fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1432    for (i, &b) in src.iter().enumerate() {
1433        unsafe {
1434            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1435        }
1436    }
1437}
1438
1439#[cfg(target_arch = "x86_64")]
1440#[target_feature(enable = "avx2")]
1441unsafe fn translate_range_to_constant_avx2(
1442    src: &[u8],
1443    dst: &mut [u8],
1444    lo: u8,
1445    hi: u8,
1446    replacement: u8,
1447) {
1448    use std::arch::x86_64::*;
1449    unsafe {
1450        let range = hi - lo;
1451        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1452        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1453        let repl_v = _mm256_set1_epi8(replacement as i8);
1454        let zero = _mm256_setzero_si256();
1455        let len = src.len();
1456        let sp = src.as_ptr();
1457        let dp = dst.as_mut_ptr();
1458        let mut i = 0;
1459        while i + 64 <= len {
1460            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1461            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1462            let bi0 = _mm256_add_epi8(in0, bias_v);
1463            let bi1 = _mm256_add_epi8(in1, bias_v);
1464            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1465            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1466            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1467            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1468            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1469            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1470            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1471            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1472            i += 64;
1473        }
1474        if i + 32 <= len {
1475            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1476            let biased = _mm256_add_epi8(input, bias_v);
1477            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1478            let in_range = _mm256_cmpeq_epi8(gt, zero);
1479            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1480            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1481            i += 32;
1482        }
1483        while i < len {
1484            let b = *sp.add(i);
1485            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1486            i += 1;
1487        }
1488    }
1489}
1490
1491#[cfg(target_arch = "x86_64")]
1492#[target_feature(enable = "sse2")]
1493unsafe fn translate_range_to_constant_sse2(
1494    src: &[u8],
1495    dst: &mut [u8],
1496    lo: u8,
1497    hi: u8,
1498    replacement: u8,
1499) {
1500    use std::arch::x86_64::*;
1501    unsafe {
1502        let range = hi - lo;
1503        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1504        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1505        let repl_v = _mm_set1_epi8(replacement as i8);
1506        let zero = _mm_setzero_si128();
1507        let len = src.len();
1508        let sp = src.as_ptr();
1509        let dp = dst.as_mut_ptr();
1510        let mut i = 0;
1511        while i + 16 <= len {
1512            let input = _mm_loadu_si128(sp.add(i) as *const _);
1513            let biased = _mm_add_epi8(input, bias_v);
1514            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1515            let in_range = _mm_cmpeq_epi8(gt, zero);
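            // SSE2 has no blendv: emulate the byte select as (mask & repl) | (!mask & input).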
1516            let result = _mm_or_si128(
1517                _mm_and_si128(in_range, repl_v),
1518                _mm_andnot_si128(in_range, input),
1519            );
1520            _mm_storeu_si128(dp.add(i) as *mut _, result);
1521            i += 16;
1522        }
1523        while i < len {
1524            let b = *sp.add(i);
1525            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1526            i += 1;
1527        }
1528    }
1529}
1530
1531/// SIMD-accelerated range translation for mmap'd data.
1532/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
1533/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
1534/// When dst is 32-byte aligned (true for large Vec allocations, which the system
1535/// allocator serves via page-aligned mmap), uses nontemporal stores to bypass the
1536/// cache, avoiding read-for-ownership overhead; memory traffic drops ~33% for streaming writes.
1537#[cfg(target_arch = "x86_64")]
1538fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1539    if get_simd_level() >= 3 {
1540        // Use nontemporal stores when dst is 32-byte aligned (typical for large allocs)
1541        if dst.as_ptr() as usize & 31 == 0 {
1542            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1543        } else {
1544            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1545        }
1546    } else {
1547        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1548    }
1549}
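// Illustrative dispatch (assumed example): `tr 'a-z' 'A-Z'` yields a table that
// detect_range_offset reports as lo = 0x61, hi = 0x7A, offset = -0x20, which is
// exactly the shape these kernels handle.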
1550
1551#[cfg(target_arch = "x86_64")]
1552#[target_feature(enable = "avx2")]
1553unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1554    use std::arch::x86_64::*;
1555
1556    unsafe {
1557        let range = hi - lo;
1558        // Bias: shift the range so lo maps to -128 (signed minimum).
1559        // For input in [lo, hi]: biased = input + (0x80 - lo) lies in [-128, -128+range], i.e. <= threshold.
1560        // For input < lo: biased lands in the signed-positive range, so biased > threshold.
1561        // For input > hi: biased exceeds -128+range (possibly wrapping to a signed-positive value), so biased > threshold.
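        // Worked example (illustrative values, the common `a-z` case):
        //   lo = 0x61, hi = 0x7A, range = 0x19
        //   bias = 0x80 - 0x61 = 0x1F, threshold = 0x80 + 0x19 = 0x99 (-103 as i8)
        //   'm' (0x6D): biased = 0x8C = -116 as i8, -116 <= -103  -> in range
        //   'A' (0x41): biased = 0x60 =  +96 as i8,  +96 >  -103  -> out of range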
1562        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1563        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1564        let offset_v = _mm256_set1_epi8(offset);
1565        let zero = _mm256_setzero_si256();
1566
1567        let len = src.len();
1568        let sp = src.as_ptr();
1569        let dp = dst.as_mut_ptr();
1570        let mut i = 0;
1571
1572        // 2x unrolled: process 64 bytes per iteration for better ILP.
1573        // Load/compute on the second vector while the first is in-flight.
1574        while i + 64 <= len {
1575            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1576            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1577            let bi0 = _mm256_add_epi8(in0, bias_v);
1578            let bi1 = _mm256_add_epi8(in1, bias_v);
1579            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1580            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1581            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1582            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1583            let om0 = _mm256_and_si256(m0, offset_v);
1584            let om1 = _mm256_and_si256(m1, offset_v);
1585            let r0 = _mm256_add_epi8(in0, om0);
1586            let r1 = _mm256_add_epi8(in1, om1);
1587            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1588            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1589            i += 64;
1590        }
1591
1592        // Remaining 32-byte chunk
1593        if i + 32 <= len {
1594            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1595            let biased = _mm256_add_epi8(input, bias_v);
1596            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1597            let mask = _mm256_cmpeq_epi8(gt, zero);
1598            let offset_masked = _mm256_and_si256(mask, offset_v);
1599            let result = _mm256_add_epi8(input, offset_masked);
1600            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1601            i += 32;
1602        }
1603
1604        // SSE2 tail for 16-byte remainder
1605        if i + 16 <= len {
1606            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1607            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1608            let offset_v128 = _mm_set1_epi8(offset);
1609            let zero128 = _mm_setzero_si128();
1610
1611            let input = _mm_loadu_si128(sp.add(i) as *const _);
1612            let biased = _mm_add_epi8(input, bias_v128);
1613            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1614            let mask = _mm_cmpeq_epi8(gt, zero128);
1615            let offset_masked = _mm_and_si128(mask, offset_v128);
1616            let result = _mm_add_epi8(input, offset_masked);
1617            _mm_storeu_si128(dp.add(i) as *mut _, result);
1618            i += 16;
1619        }
1620
1621        // Scalar tail
1622        while i < len {
1623            let b = *sp.add(i);
1624            *dp.add(i) = if b >= lo && b <= hi {
1625                b.wrapping_add(offset as u8)
1626            } else {
1627                b
1628            };
1629            i += 1;
1630        }
1631    }
1632}
1633
1634/// Nontemporal variant of translate_range_avx2: uses _mm256_stream_si256 for stores.
1635/// This bypasses the cache for writes, avoiding read-for-ownership (RFO) traffic on
1636/// the destination buffer. For streaming translate (src → dst, dst not read again),
1637/// this reduces memory traffic by ~33% (10MB input: 20MB vs 30MB total traffic).
1638/// Requires dst to be 32-byte aligned (guaranteed for large Vec/mmap allocations).
1639#[cfg(target_arch = "x86_64")]
1640#[target_feature(enable = "avx2")]
1641unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1642    use std::arch::x86_64::*;
1643
1644    unsafe {
1645        let range = hi - lo;
1646        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1647        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1648        let offset_v = _mm256_set1_epi8(offset);
1649        let zero = _mm256_setzero_si256();
1650
1651        let len = src.len();
1652        let sp = src.as_ptr();
1653        let dp = dst.as_mut_ptr();
1654        let mut i = 0;
1655
1656        // 2x unrolled with nontemporal stores
1657        while i + 64 <= len {
1658            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1659            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1660            let bi0 = _mm256_add_epi8(in0, bias_v);
1661            let bi1 = _mm256_add_epi8(in1, bias_v);
1662            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1663            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1664            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1665            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1666            let om0 = _mm256_and_si256(m0, offset_v);
1667            let om1 = _mm256_and_si256(m1, offset_v);
1668            let r0 = _mm256_add_epi8(in0, om0);
1669            let r1 = _mm256_add_epi8(in1, om1);
1670            _mm256_stream_si256(dp.add(i) as *mut _, r0);
1671            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1672            i += 64;
1673        }
1674
1675        // Remaining 32-byte chunk (still nontemporal if aligned)
1676        if i + 32 <= len {
1677            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1678            let biased = _mm256_add_epi8(input, bias_v);
1679            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1680            let mask = _mm256_cmpeq_epi8(gt, zero);
1681            let offset_masked = _mm256_and_si256(mask, offset_v);
1682            let result = _mm256_add_epi8(input, offset_masked);
1683            _mm256_stream_si256(dp.add(i) as *mut _, result);
1684            i += 32;
1685        }
1686
1687        // SSE2 tail for 16-byte remainder (regular store — only 16 bytes)
1688        if i + 16 <= len {
1689            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1690            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1691            let offset_v128 = _mm_set1_epi8(offset);
1692            let zero128 = _mm_setzero_si128();
1693
1694            let input = _mm_loadu_si128(sp.add(i) as *const _);
1695            let biased = _mm_add_epi8(input, bias_v128);
1696            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1697            let mask = _mm_cmpeq_epi8(gt, zero128);
1698            let offset_masked = _mm_and_si128(mask, offset_v128);
1699            let result = _mm_add_epi8(input, offset_masked);
1700            _mm_storeu_si128(dp.add(i) as *mut _, result);
1701            i += 16;
1702        }
1703
1704        // Scalar tail
1705        while i < len {
1706            let b = *sp.add(i);
1707            *dp.add(i) = if b >= lo && b <= hi {
1708                b.wrapping_add(offset as u8)
1709            } else {
1710                b
1711            };
1712            i += 1;
1713        }
1714
1715        // Fence: ensure nontemporal stores are visible before write() syscall
1716        _mm_sfence();
1717    }
1718}
1719
1720#[cfg(target_arch = "x86_64")]
1721#[target_feature(enable = "sse2")]
1722unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1723    use std::arch::x86_64::*;
1724
1725    unsafe {
1726        let range = hi - lo;
1727        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1728        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1729        let offset_v = _mm_set1_epi8(offset);
1730        let zero = _mm_setzero_si128();
1731
1732        let len = src.len();
1733        let mut i = 0;
1734
1735        while i + 16 <= len {
1736            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1737            let biased = _mm_add_epi8(input, bias_v);
1738            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1739            let mask = _mm_cmpeq_epi8(gt, zero);
1740            let offset_masked = _mm_and_si128(mask, offset_v);
1741            let result = _mm_add_epi8(input, offset_masked);
1742            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1743            i += 16;
1744        }
1745
1746        while i < len {
1747            let b = *src.get_unchecked(i);
1748            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1749                b.wrapping_add(offset as u8)
1750            } else {
1751                b
1752            };
1753            i += 1;
1754        }
1755    }
1756}
1757
1758/// ARM64 NEON-accelerated range translation.
1759/// Processes 16 bytes per iteration using vectorized range check + conditional add.
1760#[cfg(target_arch = "aarch64")]
1761fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1762    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1763}
1764
1765#[cfg(target_arch = "aarch64")]
1766#[target_feature(enable = "neon")]
1767unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1768    use std::arch::aarch64::*;
1769
1770    unsafe {
1771        let len = src.len();
1772        let sp = src.as_ptr();
1773        let dp = dst.as_mut_ptr();
1774        let lo_v = vdupq_n_u8(lo);
1775        let hi_v = vdupq_n_u8(hi);
1776        let offset_v = vdupq_n_s8(offset);
1777        let mut i = 0;
1778
1779        // 2x unrolled: process 32 bytes per iteration
1780        while i + 32 <= len {
1781            let in0 = vld1q_u8(sp.add(i));
1782            let in1 = vld1q_u8(sp.add(i + 16));
1783            // Range check: (b >= lo) & (b <= hi)
1784            let ge0 = vcgeq_u8(in0, lo_v);
1785            let le0 = vcleq_u8(in0, hi_v);
1786            let mask0 = vandq_u8(ge0, le0);
1787            let ge1 = vcgeq_u8(in1, lo_v);
1788            let le1 = vcleq_u8(in1, hi_v);
1789            let mask1 = vandq_u8(ge1, le1);
1790            // Conditional add: in + (offset & mask)
1791            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1792            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1793            let r0 = vaddq_u8(in0, off0);
1794            let r1 = vaddq_u8(in1, off1);
1795            vst1q_u8(dp.add(i), r0);
1796            vst1q_u8(dp.add(i + 16), r1);
1797            i += 32;
1798        }
1799
1800        if i + 16 <= len {
1801            let input = vld1q_u8(sp.add(i));
1802            let ge = vcgeq_u8(input, lo_v);
1803            let le = vcleq_u8(input, hi_v);
1804            let mask = vandq_u8(ge, le);
1805            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1806            vst1q_u8(dp.add(i), vaddq_u8(input, off));
1807            i += 16;
1808        }
1809
1810        while i < len {
1811            let b = *sp.add(i);
1812            *dp.add(i) = if b >= lo && b <= hi {
1813                b.wrapping_add(offset as u8)
1814            } else {
1815                b
1816            };
1817            i += 1;
1818        }
1819    }
1820}
1821
1822/// Scalar range translation fallback for non-x86_64, non-aarch64 platforms.
1823#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1824fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1825    let offset_u8 = offset as u8;
1826    let range = hi.wrapping_sub(lo);
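    // The single unsigned compare `b.wrapping_sub(lo) <= range` below replaces the
    // two-sided check. Illustrative values (lo = 0x61, range = 0x19, i.e. `a-z`):
    //   b = 0x6D ('m'): 0x6D - 0x61 = 0x0C <= 0x19  -> in range
    //   b = 0x41 ('A'): 0x41 - 0x61 wraps to 0xE0 > 0x19  -> out of range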
1827    unsafe {
1828        let sp = src.as_ptr();
1829        let dp = dst.as_mut_ptr();
1830        let len = src.len();
1831        let mut i = 0;
1832        while i + 8 <= len {
1833            macro_rules! do_byte {
1834                ($off:expr) => {{
1835                    let b = *sp.add(i + $off);
1836                    let in_range = b.wrapping_sub(lo) <= range;
1837                    *dp.add(i + $off) = if in_range {
1838                        b.wrapping_add(offset_u8)
1839                    } else {
1840                        b
1841                    };
1842                }};
1843            }
1844            do_byte!(0);
1845            do_byte!(1);
1846            do_byte!(2);
1847            do_byte!(3);
1848            do_byte!(4);
1849            do_byte!(5);
1850            do_byte!(6);
1851            do_byte!(7);
1852            i += 8;
1853        }
1854        while i < len {
1855            let b = *sp.add(i);
1856            let in_range = b.wrapping_sub(lo) <= range;
1857            *dp.add(i) = if in_range {
1858                b.wrapping_add(offset_u8)
1859            } else {
1860                b
1861            };
1862            i += 1;
1863        }
1864    }
1865}
1866
1867// ============================================================================
1868// In-place SIMD range translation (saves one buffer allocation in streaming)
1869// ============================================================================
1870
1871/// In-place SIMD-accelerated range translation.
1872/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
1873/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
1874#[cfg(target_arch = "x86_64")]
1875fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1876    if get_simd_level() >= 3 {
1877        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1878    } else {
1879        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1880    }
1881}
1882
1883#[cfg(target_arch = "x86_64")]
1884#[target_feature(enable = "avx2")]
1885unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1886    use std::arch::x86_64::*;
1887
1888    unsafe {
1889        let range = hi - lo;
1890        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1891        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1892        let offset_v = _mm256_set1_epi8(offset);
1893        let zero = _mm256_setzero_si256();
1894
1895        let len = data.len();
1896        let ptr = data.as_mut_ptr();
1897        let mut i = 0;
1898
1899        // 2x unrolled: process 64 bytes per iteration for better ILP
1900        while i + 64 <= len {
1901            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1902            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1903            let bi0 = _mm256_add_epi8(in0, bias_v);
1904            let bi1 = _mm256_add_epi8(in1, bias_v);
1905            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1906            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1907            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1908            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1909            let om0 = _mm256_and_si256(m0, offset_v);
1910            let om1 = _mm256_and_si256(m1, offset_v);
1911            let r0 = _mm256_add_epi8(in0, om0);
1912            let r1 = _mm256_add_epi8(in1, om1);
1913            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1914            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1915            i += 64;
1916        }
1917
1918        // Remaining 32-byte chunk
1919        if i + 32 <= len {
1920            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1921            let biased = _mm256_add_epi8(input, bias_v);
1922            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1923            let mask = _mm256_cmpeq_epi8(gt, zero);
1924            let offset_masked = _mm256_and_si256(mask, offset_v);
1925            let result = _mm256_add_epi8(input, offset_masked);
1926            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1927            i += 32;
1928        }
1929
1930        if i + 16 <= len {
1931            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1932            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1933            let offset_v128 = _mm_set1_epi8(offset);
1934            let zero128 = _mm_setzero_si128();
1935
1936            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1937            let biased = _mm_add_epi8(input, bias_v128);
1938            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1939            let mask = _mm_cmpeq_epi8(gt, zero128);
1940            let offset_masked = _mm_and_si128(mask, offset_v128);
1941            let result = _mm_add_epi8(input, offset_masked);
1942            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1943            i += 16;
1944        }
1945
1946        while i < len {
1947            let b = *ptr.add(i);
1948            *ptr.add(i) = if b >= lo && b <= hi {
1949                b.wrapping_add(offset as u8)
1950            } else {
1951                b
1952            };
1953            i += 1;
1954        }
1955    }
1956}
1957
1958#[cfg(target_arch = "x86_64")]
1959#[target_feature(enable = "sse2")]
1960unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1961    use std::arch::x86_64::*;
1962
1963    unsafe {
1964        let range = hi - lo;
1965        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1966        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1967        let offset_v = _mm_set1_epi8(offset);
1968        let zero = _mm_setzero_si128();
1969
1970        let len = data.len();
1971        let ptr = data.as_mut_ptr();
1972        let mut i = 0;
1973
1974        while i + 16 <= len {
1975            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1976            let biased = _mm_add_epi8(input, bias_v);
1977            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1978            let mask = _mm_cmpeq_epi8(gt, zero);
1979            let offset_masked = _mm_and_si128(mask, offset_v);
1980            let result = _mm_add_epi8(input, offset_masked);
1981            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1982            i += 16;
1983        }
1984
1985        while i < len {
1986            let b = *ptr.add(i);
1987            *ptr.add(i) = if b >= lo && b <= hi {
1988                b.wrapping_add(offset as u8)
1989            } else {
1990                b
1991            };
1992            i += 1;
1993        }
1994    }
1995}
1996
1997#[cfg(target_arch = "aarch64")]
1998fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1999    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
2000}
2001
2002#[cfg(target_arch = "aarch64")]
2003#[target_feature(enable = "neon")]
2004unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2005    use std::arch::aarch64::*;
2006
2007    unsafe {
2008        let len = data.len();
2009        let ptr = data.as_mut_ptr();
2010        let lo_v = vdupq_n_u8(lo);
2011        let hi_v = vdupq_n_u8(hi);
2012        let offset_v = vdupq_n_s8(offset);
2013        let mut i = 0;
2014
2015        while i + 32 <= len {
2016            let in0 = vld1q_u8(ptr.add(i));
2017            let in1 = vld1q_u8(ptr.add(i + 16));
2018            let ge0 = vcgeq_u8(in0, lo_v);
2019            let le0 = vcleq_u8(in0, hi_v);
2020            let mask0 = vandq_u8(ge0, le0);
2021            let ge1 = vcgeq_u8(in1, lo_v);
2022            let le1 = vcleq_u8(in1, hi_v);
2023            let mask1 = vandq_u8(ge1, le1);
2024            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2025            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2026            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2027            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2028            i += 32;
2029        }
2030
2031        if i + 16 <= len {
2032            let input = vld1q_u8(ptr.add(i));
2033            let ge = vcgeq_u8(input, lo_v);
2034            let le = vcleq_u8(input, hi_v);
2035            let mask = vandq_u8(ge, le);
2036            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2037            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2038            i += 16;
2039        }
2040
2041        while i < len {
2042            let b = *ptr.add(i);
2043            if b >= lo && b <= hi {
2044                *ptr.add(i) = b.wrapping_add(offset as u8);
2045            }
2046            i += 1;
2047        }
2048    }
2049}
2050
2051#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2052fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2053    let offset_u8 = offset as u8;
2054    let range = hi.wrapping_sub(lo);
2055    for b in data.iter_mut() {
2056        if b.wrapping_sub(lo) <= range {
2057            *b = b.wrapping_add(offset_u8);
2058        }
2059    }
2060}
2061
2062// ============================================================================
2063// SIMD range deletion (x86_64)
2064// ============================================================================
2065
2066/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
2067/// Returns Some((lo, hi)) if so. This is true for common classes:
2068/// - `[:digit:]` = 0x30..=0x39
2069/// - `a-z` = 0x61..=0x7A
2070/// - `A-Z` = 0x41..=0x5A
2071#[inline]
2072fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2073    if chars.is_empty() {
2074        return None;
2075    }
2076    let mut lo = chars[0];
2077    let mut hi = chars[0];
2078    for &c in &chars[1..] {
2079        if c < lo {
2080            lo = c;
2081        }
2082        if c > hi {
2083            hi = c;
2084        }
2085    }
2086    // Check that the range size matches the number of chars (no gaps)
2087    // Cast to usize before +1 to avoid u8 overflow when hi=255, lo=0 (range=256)
2088    if (hi as usize - lo as usize + 1) == chars.len() {
2089        Some((lo, hi))
2090    } else {
2091        None
2092    }
2093}
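// Minimal sketch of the detector's behavior (hypothetical test module, not part
// of the original suite; the byte sets are assumed inputs).
#[cfg(test)]
mod detect_delete_range_sketch {
    use super::detect_delete_range;

    #[test]
    fn contiguous_and_gapped_sets() {
        // [:digit:] expands to the contiguous bytes 0x30..=0x39.
        assert_eq!(detect_delete_range(b"0123456789"), Some((b'0', b'9')));
        // 'a' and 'z' span 26 byte values but provide only 2 chars: gap, no range.
        assert_eq!(detect_delete_range(b"az"), None);
        assert_eq!(detect_delete_range(b""), None);
    }
}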
2094
2095/// SIMD-accelerated delete for contiguous byte ranges.
2096/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
2097/// then compacts output by skipping matched bytes.
2098#[cfg(target_arch = "x86_64")]
2099fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2100    if get_simd_level() >= 3 {
2101        unsafe { delete_range_avx2(src, dst, lo, hi) }
2102    } else {
2103        unsafe { delete_range_sse2(src, dst, lo, hi) }
2104    }
2105}
2106
2107#[cfg(target_arch = "x86_64")]
2108#[target_feature(enable = "avx2")]
2109unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2110    use std::arch::x86_64::*;
2111
2112    unsafe {
2113        let range = hi - lo;
2114        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2115        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2116        let zero = _mm256_setzero_si256();
2117
2118        let len = src.len();
2119        let sp = src.as_ptr();
2120        let dp = dst.as_mut_ptr();
2121        let mut ri = 0;
2122        let mut wp = 0;
2123
2124        while ri + 32 <= len {
2125            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2126            let biased = _mm256_add_epi8(input, bias_v);
2127            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
2128            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2129            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
2130            let in_range = _mm256_cmpeq_epi8(gt, zero);
2131            // keep_mask bits: 1 = keep (NOT in range)
2132            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2133
2134            if keep_mask == 0xFFFFFFFF {
2135                // All 32 bytes are kept — bulk copy
2136                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2137                wp += 32;
2138            } else if keep_mask != 0 {
2139                // Partial keep: per-lane processing with all-keep fast paths.
2140                // At a ~4% delete rate, ~72% of 8-byte lanes are all-keep even
2141                // within partial 32-byte blocks. The per-lane check avoids the
2142                // LUT compaction overhead for these clean lanes.
2143                let m0 = keep_mask as u8;
2144                let m1 = (keep_mask >> 8) as u8;
2145                let m2 = (keep_mask >> 16) as u8;
2146                let m3 = (keep_mask >> 24) as u8;
2147
2148                if m0 == 0xFF {
2149                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2150                } else if m0 != 0 {
2151                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2152                }
2153                let c0 = m0.count_ones() as usize;
2154
2155                if m1 == 0xFF {
2156                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2157                } else if m1 != 0 {
2158                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2159                }
2160                let c1 = m1.count_ones() as usize;
2161
2162                if m2 == 0xFF {
2163                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2164                } else if m2 != 0 {
2165                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2166                }
2167                let c2 = m2.count_ones() as usize;
2168
2169                if m3 == 0xFF {
2170                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2171                } else if m3 != 0 {
2172                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2173                }
2174                let c3 = m3.count_ones() as usize;
2175                wp += c0 + c1 + c2 + c3;
2176            }
2177            // else: keep_mask == 0 means all bytes deleted, skip entirely
2178            ri += 32;
2179        }
2180
2181        // SSE2 tail for 16-byte remainder
2182        if ri + 16 <= len {
2183            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2184            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2185            let zero128 = _mm_setzero_si128();
2186
2187            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2188            let biased = _mm_add_epi8(input, bias_v128);
2189            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2190            let in_range = _mm_cmpeq_epi8(gt, zero128);
2191            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2192
2193            if keep_mask == 0xFFFF {
2194                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2195                wp += 16;
2196            } else if keep_mask != 0 {
2197                let m0 = keep_mask as u8;
2198                let m1 = (keep_mask >> 8) as u8;
2199                if m0 == 0xFF {
2200                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2201                } else if m0 != 0 {
2202                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2203                }
2204                let c0 = m0.count_ones() as usize;
2205                if m1 == 0xFF {
2206                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2207                } else if m1 != 0 {
2208                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2209                }
2210                wp += c0 + m1.count_ones() as usize;
2211            }
2212            ri += 16;
2213        }
2214
2215        // Scalar tail — branchless: always store, advance wp only for kept bytes
2216        while ri < len {
2217            let b = *sp.add(ri);
2218            *dp.add(wp) = b;
2219            wp += (b < lo || b > hi) as usize;
2220            ri += 1;
2221        }
2222
2223        wp
2224    }
2225}
2226
2227/// Compact 8 source bytes into contiguous output bytes using a keep mask.
2228/// Each bit in `mask` indicates whether the corresponding byte should be kept.
2229/// Uses a precomputed LUT: for each 8-bit mask, the LUT stores indices of set bits.
2230/// Always performs 8 unconditional stores (extra stores past popcount are harmless
2231/// since the write pointer only advances by popcount, and subsequent lanes overwrite).
2232/// This eliminates the serial tzcnt→blsr dependency chain (~28 cycles) in favor of
2233/// independent indexed loads and stores (~15 cycles).
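/// Illustrative walk-through (not a doctest): for mask = 0b1011_0010 the LUT entry
/// is [1, 4, 5, 7, 0, 0, 0, 0], so dst[0..4] receive src[1], src[4], src[5], src[7];
/// the four trailing stores write src[0] into dst[4..8] and are later overwritten,
/// since the caller only advances the write pointer by popcount(mask) = 4.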
2234#[cfg(target_arch = "x86_64")]
2235#[inline(always)]
2236unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2237    unsafe {
2238        let idx = COMPACT_LUT.get_unchecked(mask as usize);
2239        *dst = *src.add(*idx.get_unchecked(0) as usize);
2240        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2241        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2242        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2243        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2244        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2245        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2246        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2247    }
2248}
2249
2250/// SSSE3 pshufb-based byte compaction. Loads 8 source bytes into an XMM register,
2251/// shuffles kept bytes to the front using COMPACT_LUT + _mm_shuffle_epi8, stores 8 bytes.
2252/// ~4x faster than scalar compact_8bytes: 1 pshufb vs 8 individual indexed byte copies.
2253/// Requires SSSE3; safe to call from AVX2 functions (which imply SSSE3).
2254#[cfg(target_arch = "x86_64")]
2255#[target_feature(enable = "ssse3")]
2256#[inline]
2257unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2258    use std::arch::x86_64::*;
2259    unsafe {
2260        let src_v = _mm_loadl_epi64(src as *const _);
2261        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2262        let out_v = _mm_shuffle_epi8(src_v, shuf);
2263        _mm_storel_epi64(dst as *mut _, out_v);
2264    }
2265}
2266
2267#[cfg(target_arch = "x86_64")]
2268#[target_feature(enable = "sse2")]
2269unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2270    use std::arch::x86_64::*;
2271
2272    unsafe {
2273        let range = hi - lo;
2274        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2275        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2276        let zero = _mm_setzero_si128();
2277
2278        let len = src.len();
2279        let sp = src.as_ptr();
2280        let dp = dst.as_mut_ptr();
2281        let mut ri = 0;
2282        let mut wp = 0;
2283
2284        while ri + 16 <= len {
2285            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2286            let biased = _mm_add_epi8(input, bias_v);
2287            let gt = _mm_cmpgt_epi8(biased, threshold_v);
2288            let in_range = _mm_cmpeq_epi8(gt, zero);
2289            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2290
2291            if keep_mask == 0xFFFF {
2292                // All 16 bytes kept — bulk copy
2293                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2294                wp += 16;
2295            } else if keep_mask != 0 {
2296                let m0 = keep_mask as u8;
2297                let m1 = (keep_mask >> 8) as u8;
2298                if m0 == 0xFF {
2299                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2300                } else if m0 != 0 {
2301                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2302                }
2303                let c0 = m0.count_ones() as usize;
2304                if m1 == 0xFF {
2305                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2306                } else if m1 != 0 {
2307                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2308                }
2309                wp += c0 + m1.count_ones() as usize;
2310            }
2311            ri += 16;
2312        }
2313
2314        // Scalar tail — branchless
2315        while ri < len {
2316            let b = *sp.add(ri);
2317            *dp.add(wp) = b;
2318            wp += (b < lo || b > hi) as usize;
2319            ri += 1;
2320        }
2321
2322        wp
2323    }
2324}
2325
2326/// Branchless range delete fallback for non-x86_64 (ARM64, etc.).
2327/// Unconditional store + conditional pointer advance eliminates branch
2328/// mispredictions. Unrolled 8x for better ILP on out-of-order cores.
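/// Illustrative behavior (assumed inputs): deleting `0-9` (lo = b'0', hi = b'9')
/// from b"a1b2" stores every byte but only advances past 'a' and 'b', leaving
/// b"ab" in the first two output positions and returning 2.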
2329#[cfg(not(target_arch = "x86_64"))]
2330fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2331    let len = src.len();
2332    let sp = src.as_ptr();
2333    let dp = dst.as_mut_ptr();
2334    let mut wp: usize = 0;
2335    let mut i: usize = 0;
2336
2337    // Unrolled branchless loop — 8 bytes per iteration
2338    while i + 8 <= len {
2339        unsafe {
2340            let b0 = *sp.add(i);
2341            *dp.add(wp) = b0;
2342            wp += (b0 < lo || b0 > hi) as usize;
2343            let b1 = *sp.add(i + 1);
2344            *dp.add(wp) = b1;
2345            wp += (b1 < lo || b1 > hi) as usize;
2346            let b2 = *sp.add(i + 2);
2347            *dp.add(wp) = b2;
2348            wp += (b2 < lo || b2 > hi) as usize;
2349            let b3 = *sp.add(i + 3);
2350            *dp.add(wp) = b3;
2351            wp += (b3 < lo || b3 > hi) as usize;
2352            let b4 = *sp.add(i + 4);
2353            *dp.add(wp) = b4;
2354            wp += (b4 < lo || b4 > hi) as usize;
2355            let b5 = *sp.add(i + 5);
2356            *dp.add(wp) = b5;
2357            wp += (b5 < lo || b5 > hi) as usize;
2358            let b6 = *sp.add(i + 6);
2359            *dp.add(wp) = b6;
2360            wp += (b6 < lo || b6 > hi) as usize;
2361            let b7 = *sp.add(i + 7);
2362            *dp.add(wp) = b7;
2363            wp += (b7 < lo || b7 > hi) as usize;
2364        }
2365        i += 8;
2366    }
2367
2368    // Scalar tail
2369    while i < len {
2370        unsafe {
2371            let b = *sp.add(i);
2372            *dp.add(wp) = b;
2373            wp += (b < lo || b > hi) as usize;
2374        }
2375        i += 1;
2376    }
2377
2378    wp
2379}
2380
2381/// Streaming delete for contiguous byte ranges using SIMD range detection.
2382/// Uses the 16MB STREAM_BUF to reduce syscalls (delete is compute-light, I/O bound).
2383/// Compaction happens in place in the read buffer; all-keep blocks need no copy
2384/// until something has actually been deleted ahead of them.
2385fn delete_range_streaming(
2386    lo: u8,
2387    hi: u8,
2388    reader: &mut impl Read,
2389    writer: &mut impl Write,
2390) -> io::Result<()> {
2391    let mut buf = alloc_uninit_vec(STREAM_BUF);
2392    loop {
2393        let n = read_once(reader, &mut buf)?;
2394        if n == 0 {
2395            break;
2396        }
2397        let wp = delete_range_inplace(&mut buf, n, lo, hi);
2398        if wp > 0 {
2399            writer.write_all(&buf[..wp])?;
2400        }
2401    }
2402    Ok(())
2403}
2404
2405/// In-place range delete: SIMD scan for all-keep blocks + branchless scalar compaction.
2406/// Uses a single buffer — reads at position ri, writes at position wp (wp <= ri always).
2407#[inline]
2408fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2409    #[cfg(target_arch = "x86_64")]
2410    {
2411        let level = get_simd_level();
2412        if level >= 3 {
2413            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2414        }
2415    }
2416    // Scalar fallback: branchless in-place delete
2417    let ptr = buf.as_mut_ptr();
2418    let mut ri = 0;
2419    let mut wp = 0;
2420    unsafe {
2421        while ri + 8 <= n {
2422            let b0 = *ptr.add(ri);
2423            let b1 = *ptr.add(ri + 1);
2424            let b2 = *ptr.add(ri + 2);
2425            let b3 = *ptr.add(ri + 3);
2426            let b4 = *ptr.add(ri + 4);
2427            let b5 = *ptr.add(ri + 5);
2428            let b6 = *ptr.add(ri + 6);
2429            let b7 = *ptr.add(ri + 7);
2430            *ptr.add(wp) = b0;
2431            wp += (b0 < lo || b0 > hi) as usize;
2432            *ptr.add(wp) = b1;
2433            wp += (b1 < lo || b1 > hi) as usize;
2434            *ptr.add(wp) = b2;
2435            wp += (b2 < lo || b2 > hi) as usize;
2436            *ptr.add(wp) = b3;
2437            wp += (b3 < lo || b3 > hi) as usize;
2438            *ptr.add(wp) = b4;
2439            wp += (b4 < lo || b4 > hi) as usize;
2440            *ptr.add(wp) = b5;
2441            wp += (b5 < lo || b5 > hi) as usize;
2442            *ptr.add(wp) = b6;
2443            wp += (b6 < lo || b6 > hi) as usize;
2444            *ptr.add(wp) = b7;
2445            wp += (b7 < lo || b7 > hi) as usize;
2446            ri += 8;
2447        }
2448        while ri < n {
2449            let b = *ptr.add(ri);
2450            *ptr.add(wp) = b;
2451            wp += (b < lo || b > hi) as usize;
2452            ri += 1;
2453        }
2454    }
2455    wp
2456}
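// Minimal in-place deletion sketch (hypothetical test module; the input bytes are
// assumed for illustration).
#[cfg(test)]
mod delete_range_inplace_sketch {
    use super::delete_range_inplace;

    #[test]
    fn drops_digits_in_place() {
        let mut buf = b"a1b2c3!".to_vec();
        let n = buf.len();
        let kept = delete_range_inplace(&mut buf, n, b'0', b'9');
        assert_eq!(&buf[..kept], b"abc!");
    }
}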
2457
2458/// AVX2 in-place range delete: scan 32 bytes at a time, skip all-keep blocks,
2459/// branchless scalar compaction for mixed blocks.
2460#[cfg(target_arch = "x86_64")]
2461#[target_feature(enable = "avx2")]
2462unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2463    use std::arch::x86_64::*;
2464
2465    unsafe {
2466        let range = hi - lo;
2467        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2468        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2469        let zero = _mm256_setzero_si256();
2470
2471        let ptr = buf.as_mut_ptr();
2472        let mut ri = 0;
2473        let mut wp = 0;
2474
2475        while ri + 32 <= n {
2476            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2477            let biased = _mm256_add_epi8(input, bias_v);
2478            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2479            let in_range = _mm256_cmpeq_epi8(gt, zero);
2480            let del_mask = _mm256_movemask_epi8(in_range) as u32;
2481
2482            if del_mask == 0 {
2483                // All 32 bytes kept
2484                if wp != ri {
2485                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2486                }
2487                wp += 32;
2488            } else if del_mask != 0xFFFFFFFF {
2489                // Mixed block: pshufb-based 8-byte compaction.
2490                // Process 4 × 8-byte sub-chunks using COMPACT_LUT + pshufb.
2491                // Each sub-chunk: load 8 bytes into register (safe for overlap),
2492                // shuffle kept bytes to front, store. 4 SIMD ops vs 32 scalar.
2493                let keep_mask = !del_mask;
2494                let m0 = keep_mask as u8;
2495                let m1 = (keep_mask >> 8) as u8;
2496                let m2 = (keep_mask >> 16) as u8;
2497                let m3 = (keep_mask >> 24) as u8;
2498
2499                let c0 = m0.count_ones() as usize;
2500                let c1 = m1.count_ones() as usize;
2501                let c2 = m2.count_ones() as usize;
2502                let c3 = m3.count_ones() as usize;
2503
2504                // Sub-chunk 0: bytes 0-7
2505                if m0 == 0xFF {
2506                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2507                } else if m0 != 0 {
2508                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2509                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2510                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2511                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2512                }
2513
2514                // Sub-chunk 1: bytes 8-15
2515                if m1 == 0xFF {
2516                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2517                } else if m1 != 0 {
2518                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2519                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2520                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2521                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2522                }
2523
2524                // Sub-chunk 2: bytes 16-23
2525                if m2 == 0xFF {
2526                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2527                } else if m2 != 0 {
2528                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2529                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2530                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2531                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2532                }
2533
2534                // Sub-chunk 3: bytes 24-31
2535                if m3 == 0xFF {
2536                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2537                } else if m3 != 0 {
2538                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2539                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2540                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2541                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2542                }
2543
2544                wp += c0 + c1 + c2 + c3;
2545            }
2546            // del_mask == 0xFFFFFFFF: all deleted, skip entirely
2547            ri += 32;
2548        }
2549
2550        // Scalar tail
2551        while ri < n {
2552            let b = *ptr.add(ri);
2553            *ptr.add(wp) = b;
2554            wp += (b < lo || b > hi) as usize;
2555            ri += 1;
2556        }
2557
2558        wp
2559    }
2560}
2561
2562// ============================================================================
2563// Streaming functions (Read + Write)
2564// ============================================================================
2565
2566pub fn translate(
2567    set1: &[u8],
2568    set2: &[u8],
2569    reader: &mut impl Read,
2570    writer: &mut impl Write,
2571) -> io::Result<()> {
2572    let table = build_translate_table(set1, set2);
2573
2574    // Check for identity table — pure passthrough (no transformation needed)
2575    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2576    if is_identity {
2577        return passthrough_stream(reader, writer);
2578    }
2579
2580    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
2581    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2582        return translate_range_stream(lo, hi, offset, reader, writer);
2583    }
2584
2585    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
2586    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
2587    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2588        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2589    }
2590
2591    // General case: IN-PLACE translation on a SINGLE buffer.
2592    // Process each read chunk immediately for pipelining: while ftr translates
2593    // and writes chunk N, cat writes chunk N+1 to the pipe.
2594    // SAFETY: all bytes are written by read_once before being translated.
2595    let mut buf = alloc_uninit_vec(STREAM_BUF);
2596    loop {
2597        let n = read_once(reader, &mut buf)?;
2598        if n == 0 {
2599            break;
2600        }
2601        translate_and_write_table(&mut buf, n, &table, writer)?;
2602    }
2603    Ok(())
2604}
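// Minimal usage sketch for the streaming entry point (hypothetical test module;
// the real CLI wiring that expands `a-z` into explicit byte sets lives elsewhere).
#[cfg(test)]
mod translate_stream_sketch {
    use super::translate;
    use std::io::Cursor;

    #[test]
    fn uppercases_ascii_letters() {
        let mut input = Cursor::new(b"hello, tr!".to_vec());
        let mut output: Vec<u8> = Vec::new();
        // Byte-level equivalent of `tr 'a-z' 'A-Z'`.
        translate(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            &mut input,
            &mut output,
        )
        .unwrap();
        assert_eq!(output, b"HELLO, TR!");
    }
}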
2605
2606#[inline]
2607fn translate_and_write_table(
2608    buf: &mut [u8],
2609    total: usize,
2610    table: &[u8; 256],
2611    writer: &mut impl Write,
2612) -> io::Result<()> {
2613    if total >= PARALLEL_THRESHOLD {
2614        let nt = rayon::current_num_threads().max(1);
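        // One chunk per rayon worker, floored at 32KB so per-task spawn/join
        // overhead stays negligible relative to the translate work.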
2615        let cs = (total / nt).max(32 * 1024);
2616        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2617            translate_inplace(chunk, table);
2618        });
2619    } else {
2620        translate_inplace(&mut buf[..total], table);
2621    }
2622    writer.write_all(&buf[..total])
2623}
2624
2625/// Streaming SIMD range translation — single buffer, in-place transform.
2626/// Processes each read chunk immediately for pipelining: while ftr translates
2627/// and writes chunk N, upstream cat writes chunk N+1 to the pipe.
2628/// For chunks >= PARALLEL_THRESHOLD, uses rayon par_chunks_mut for multi-core.
2629fn translate_range_stream(
2630    lo: u8,
2631    hi: u8,
2632    offset: i8,
2633    reader: &mut impl Read,
2634    writer: &mut impl Write,
2635) -> io::Result<()> {
2636    let mut buf = alloc_uninit_vec(STREAM_BUF);
2637    loop {
2638        let n = read_once(reader, &mut buf)?;
2639        if n == 0 {
2640            break;
2641        }
2642        translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2643    }
2644    Ok(())
2645}
2646
2647#[inline]
2648fn translate_and_write_range(
2649    buf: &mut [u8],
2650    total: usize,
2651    lo: u8,
2652    hi: u8,
2653    offset: i8,
2654    writer: &mut impl Write,
2655) -> io::Result<()> {
2656    if total >= PARALLEL_THRESHOLD {
2657        let nt = rayon::current_num_threads().max(1);
2658        let cs = (total / nt).max(32 * 1024);
2659        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2660            translate_range_simd_inplace(chunk, lo, hi, offset);
2661        });
2662    } else {
2663        translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2664    }
2665    writer.write_all(&buf[..total])
2666}
2667
2668/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
2669/// Processes each read chunk immediately for pipelining with upstream cat.
2670/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
2671fn translate_range_to_constant_stream(
2672    lo: u8,
2673    hi: u8,
2674    replacement: u8,
2675    reader: &mut impl Read,
2676    writer: &mut impl Write,
2677) -> io::Result<()> {
2678    let mut buf = alloc_uninit_vec(STREAM_BUF);
2679    loop {
2680        let n = read_once(reader, &mut buf)?;
2681        if n == 0 {
2682            break;
2683        }
2684        translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2685    }
2686    Ok(())
2687}
2688
2689#[inline]
2690fn translate_and_write_range_const(
2691    buf: &mut [u8],
2692    total: usize,
2693    lo: u8,
2694    hi: u8,
2695    replacement: u8,
2696    writer: &mut impl Write,
2697) -> io::Result<()> {
2698    if total >= PARALLEL_THRESHOLD {
2699        let nt = rayon::current_num_threads().max(1);
2700        let cs = (total / nt).max(32 * 1024);
2701        buf[..total].par_chunks_mut(cs).for_each(|chunk| {
2702            translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2703        });
2704    } else {
2705        translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2706    }
2707    writer.write_all(&buf[..total])
2708}
2709
2710/// Pure passthrough: copy stdin to stdout without transformation.
2711/// Uses a single 16MB uninit buffer with direct read/write, no processing overhead.
2712fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2713    let mut buf = alloc_uninit_vec(STREAM_BUF);
2714    loop {
2715        let n = read_once(reader, &mut buf)?;
2716        if n == 0 {
2717            break;
2718        }
2719        writer.write_all(&buf[..n])?;
2720    }
2721    Ok(())
2722}
2723
2724/// Single-read for pipelining: process data immediately after first read()
2725/// instead of blocking to fill the entire buffer. This enables cat|ftr
2726/// pipelining: while ftr processes the first chunk, cat continues writing
2727/// to the pipe. For 10MB piped input with 8MB pipe buffer, this saves
2728/// ~0.5-1ms by overlapping cat's final writes with ftr's processing.
2729#[inline]
2730fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2731    loop {
2732        match reader.read(buf) {
2733            Ok(n) => return Ok(n),
2734            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2735            Err(e) => return Err(e),
2736        }
2737    }
2738}
2739
2740pub fn translate_squeeze(
2741    set1: &[u8],
2742    set2: &[u8],
2743    reader: &mut impl Read,
2744    writer: &mut impl Write,
2745) -> io::Result<()> {
2746    let table = build_translate_table(set1, set2);
2747    let squeeze_set = build_member_set(set2);
2748
2749    // For a squeeze set that is effectively a single character, use the fused
2750    // approach: translate (SIMD where possible), then memmem to find squeeze points.
2751    if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2752        let squeeze_ch = set2.last().copied().unwrap_or(0);
2753        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2754    }
2755
2756    // Two-pass optimization for range translations:
2757    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
2758    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
2759    let range_info = detect_range_offset(&table);
2760    let range_const_info = if range_info.is_none() {
2761        detect_range_to_constant(&table)
2762    } else {
2763        None
2764    };
2765
2766    let mut buf = alloc_uninit_vec(STREAM_BUF);
2767    let mut last_squeezed: u16 = 256;
2768
2769    loop {
2770        let n = read_once(reader, &mut buf)?;
2771        if n == 0 {
2772            break;
2773        }
2774        let wp = translate_squeeze_process(
2775            &mut buf,
2776            n,
2777            &table,
2778            &squeeze_set,
2779            range_info,
2780            range_const_info,
2781            &mut last_squeezed,
2782        );
2783        if wp > 0 {
2784            writer.write_all(&buf[..wp])?;
2785        }
2786    }
2787    Ok(())
2788}
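// Minimal usage sketch (hypothetical test module; roughly `tr -s 'a-z' 'A-Z'`
// with the sets already expanded to bytes).
#[cfg(test)]
mod translate_squeeze_sketch {
    use super::translate_squeeze;
    use std::io::Cursor;

    #[test]
    fn squeezes_translated_repeats() {
        let mut input = Cursor::new(b"aaabbb  c".to_vec());
        let mut output: Vec<u8> = Vec::new();
        translate_squeeze(
            b"abcdefghijklmnopqrstuvwxyz",
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            &mut input,
            &mut output,
        )
        .unwrap();
        // Repeats of bytes in set2 are squeezed after translation; the double
        // space survives because ' ' is not in the squeeze set.
        assert_eq!(output, b"AB  C");
    }
}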
2789
2790#[inline]
2791fn translate_squeeze_process(
2792    buf: &mut [u8],
2793    n: usize,
2794    table: &[u8; 256],
2795    squeeze_set: &[u8; 32],
2796    range_info: Option<(u8, u8, i8)>,
2797    range_const_info: Option<(u8, u8, u8)>,
2798    last_squeezed: &mut u16,
2799) -> usize {
2800    // Pass 1: translate
2801    if let Some((lo, hi, offset)) = range_info {
2802        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2803    } else if let Some((lo, hi, replacement)) = range_const_info {
2804        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2805    } else {
2806        translate_inplace(&mut buf[..n], table);
2807    }
2808    // Pass 2: squeeze in-place
2809    let mut wp = 0;
2810    unsafe {
2811        let ptr = buf.as_mut_ptr();
2812        let mut i = 0;
2813        while i + 8 <= n {
2814            macro_rules! squeeze_byte {
2815                ($off:expr) => {
2816                    let b = *ptr.add(i + $off);
2817                    if is_member(squeeze_set, b) {
2818                        if *last_squeezed != b as u16 {
2819                            *last_squeezed = b as u16;
2820                            *ptr.add(wp) = b;
2821                            wp += 1;
2822                        }
2823                    } else {
2824                        *last_squeezed = 256;
2825                        *ptr.add(wp) = b;
2826                        wp += 1;
2827                    }
2828                };
2829            }
2830            squeeze_byte!(0);
2831            squeeze_byte!(1);
2832            squeeze_byte!(2);
2833            squeeze_byte!(3);
2834            squeeze_byte!(4);
2835            squeeze_byte!(5);
2836            squeeze_byte!(6);
2837            squeeze_byte!(7);
2838            i += 8;
2839        }
2840        while i < n {
2841            let b = *ptr.add(i);
2842            if is_member(squeeze_set, b) {
2843                if *last_squeezed == b as u16 {
2844                    i += 1;
2845                    continue;
2846                }
2847                *last_squeezed = b as u16;
2848            } else {
2849                *last_squeezed = 256;
2850            }
2851            *ptr.add(wp) = b;
2852            wp += 1;
2853            i += 1;
2854        }
2855    }
2856    wp
2857}
2858
2859/// Optimized translate+squeeze for single squeeze character.
2860/// After SIMD translation, uses memmem to find consecutive pairs
2861/// and compacts in-place with a single write_all per chunk.
2862fn translate_squeeze_single_ch(
2863    table: &[u8; 256],
2864    squeeze_ch: u8,
2865    _squeeze_set: &[u8; 32],
2866    reader: &mut impl Read,
2867    writer: &mut impl Write,
2868) -> io::Result<()> {
2869    let range_info = detect_range_offset(table);
2870    let range_const_info = if range_info.is_none() {
2871        detect_range_to_constant(table)
2872    } else {
2873        None
2874    };
2875
2876    let pair = [squeeze_ch, squeeze_ch];
2877    let finder = memchr::memmem::Finder::new(&pair);
2878    let mut buf = alloc_uninit_vec(STREAM_BUF);
2879    let mut was_squeeze_char = false;
2880
2881    loop {
2882        let n = read_once(reader, &mut buf)?;
2883        if n == 0 {
2884            break;
2885        }
2886        let wp = translate_squeeze_single_process(
2887            &mut buf,
2888            n,
2889            table,
2890            squeeze_ch,
2891            &finder,
2892            range_info,
2893            range_const_info,
2894            &mut was_squeeze_char,
2895        );
2896        if wp > 0 {
2897            writer.write_all(&buf[..wp])?;
2898        }
2899    }
2900    Ok(())
2901}
2902
2903#[inline]
2904fn translate_squeeze_single_process(
2905    buf: &mut [u8],
2906    n: usize,
2907    table: &[u8; 256],
2908    squeeze_ch: u8,
2909    finder: &memchr::memmem::Finder<'_>,
2910    range_info: Option<(u8, u8, i8)>,
2911    range_const_info: Option<(u8, u8, u8)>,
2912    was_squeeze_char: &mut bool,
2913) -> usize {
2914    // Pass 1: translate in-place
2915    if let Some((lo, hi, offset)) = range_info {
2916        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2917    } else if let Some((lo, hi, replacement)) = range_const_info {
2918        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2919    } else {
2920        translate_inplace(&mut buf[..n], table);
2921    }
2922
2923    // Pass 2: squeeze compaction
2924    let mut i = 0;
2925    if *was_squeeze_char {
2926        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2927            i += 1;
2928        }
2929        *was_squeeze_char = false;
2930        if i >= n {
2931            *was_squeeze_char = true;
2932            return 0;
2933        }
2934    }
2935
2936    let ptr = buf.as_mut_ptr();
2937    let mut wp = 0usize;
2938
2939    loop {
2940        match finder.find(&buf[i..n]) {
2941            Some(offset) => {
2942                let seg_end = i + offset + 1;
2943                let gap = seg_end - i;
2944                if gap > 0 {
2945                    if wp != i {
2946                        unsafe {
2947                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2948                        }
2949                    }
2950                    wp += gap;
2951                }
2952                i = seg_end;
2953                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2954                    i += 1;
2955                }
2956                if i >= n {
2957                    *was_squeeze_char = true;
2958                    break;
2959                }
2960            }
2961            None => {
2962                let rem = n - i;
2963                if rem > 0 {
2964                    if wp != i {
2965                        unsafe {
2966                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2967                        }
2968                    }
2969                    wp += rem;
2970                }
2971                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2972                break;
2973            }
2974        }
2975    }
2976    wp
2977}
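
// Illustrative sketch (not part of the original source): exercises the
// chunk-boundary carry of translate_squeeze_single_process using an identity
// table, so only the squeeze pass is observable. A run of 'a' split across two
// simulated reads ("xaa" then "aay") must still collapse to a single 'a'.
#[cfg(test)]
mod translate_squeeze_single_sketch {
    use super::*;

    #[test]
    fn squeeze_run_across_chunk_boundary() {
        let mut table = [0u8; 256];
        for i in 0..256 {
            table[i] = i as u8; // identity table: the translation pass is a no-op
        }
        let finder = memchr::memmem::Finder::new(b"aa");
        let mut carried = false;
        let mut out = Vec::new();

        for chunk in [&b"xaa"[..], &b"aay"[..]] {
            let mut buf = chunk.to_vec();
            let n = buf.len();
            let wp = translate_squeeze_single_process(
                &mut buf, n, &table, b'a', &finder, None, None, &mut carried,
            );
            out.extend_from_slice(&buf[..wp]);
        }
        assert_eq!(out, b"xay".to_vec());
    }
}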
2978
2979pub fn delete(
2980    delete_chars: &[u8],
2981    reader: &mut impl Read,
2982    writer: &mut impl Write,
2983) -> io::Result<()> {
2984    if delete_chars.len() == 1 {
2985        return delete_single_streaming(delete_chars[0], reader, writer);
2986    }
2987    if delete_chars.len() <= 3 {
2988        return delete_multi_streaming(delete_chars, reader, writer);
2989    }
2990
2991    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
2992    // use vectorized range comparison instead of scalar bitset lookup.
2993    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
2994    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2995        return delete_range_streaming(lo, hi, reader, writer);
2996    }
2997
2998    let member = build_member_set(delete_chars);
2999    let mut buf = alloc_uninit_vec(STREAM_BUF);
3000    // Separate output buffer for SIMD compaction — keeps source data intact
3001    // while compact_8bytes_simd writes to a different location.
3002    let mut outbuf = alloc_uninit_vec(STREAM_BUF);
3003
3004    loop {
3005        let n = read_once(reader, &mut buf)?;
3006        if n == 0 {
3007            break;
3008        }
3009        let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
3010        if wp > 0 {
3011            writer.write_all(&outbuf[..wp])?;
3012        }
3013    }
3014    Ok(())
3015}
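
// Illustrative sketch (not part of the original source): drives the public
// `delete` entry point end to end with in-memory reader/writer. The single
// char hits the memchr gap-copy path; "aeiou" (5 non-contiguous bytes) hits
// the membership-bitset path.
#[cfg(test)]
mod delete_sketch {
    use super::*;

    #[test]
    fn delete_single_and_bitset_paths() {
        let mut out = Vec::new();
        delete(b"l", &mut &b"hello world"[..], &mut out).unwrap();
        assert_eq!(out, b"heo word".to_vec());

        let mut out = Vec::new();
        delete(b"aeiou", &mut &b"hello world"[..], &mut out).unwrap();
        assert_eq!(out, b"hll wrld".to_vec());
    }
}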
3016
3017#[inline]
3018fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3019    #[cfg(target_arch = "x86_64")]
3020    {
3021        if get_simd_level() >= 3 {
3022            return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3023        }
3024    }
3025    delete_bitset_scalar(src, dst, member)
3026}
3027
3028/// Scalar bitset delete: write kept bytes to output buffer.
3029#[inline]
3030fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3031    let n = src.len();
3032    let mut wp = 0;
3033    unsafe {
3034        let sp = src.as_ptr();
3035        let dp = dst.as_mut_ptr();
3036        let mut i = 0;
3037        while i + 8 <= n {
3038            let b0 = *sp.add(i);
3039            let b1 = *sp.add(i + 1);
3040            let b2 = *sp.add(i + 2);
3041            let b3 = *sp.add(i + 3);
3042            let b4 = *sp.add(i + 4);
3043            let b5 = *sp.add(i + 5);
3044            let b6 = *sp.add(i + 6);
3045            let b7 = *sp.add(i + 7);
3046            *dp.add(wp) = b0;
3047            wp += !is_member(member, b0) as usize;
3048            *dp.add(wp) = b1;
3049            wp += !is_member(member, b1) as usize;
3050            *dp.add(wp) = b2;
3051            wp += !is_member(member, b2) as usize;
3052            *dp.add(wp) = b3;
3053            wp += !is_member(member, b3) as usize;
3054            *dp.add(wp) = b4;
3055            wp += !is_member(member, b4) as usize;
3056            *dp.add(wp) = b5;
3057            wp += !is_member(member, b5) as usize;
3058            *dp.add(wp) = b6;
3059            wp += !is_member(member, b6) as usize;
3060            *dp.add(wp) = b7;
3061            wp += !is_member(member, b7) as usize;
3062            i += 8;
3063        }
3064        while i < n {
3065            let b = *sp.add(i);
3066            *dp.add(wp) = b;
3067            wp += !is_member(member, b) as usize;
3068            i += 1;
3069        }
3070    }
3071    wp
3072}
3073
3074/// AVX2 bitset delete for streaming: uses SIMD to check 32 bytes against the
3075/// membership bitset at once, then compact_8bytes_simd to pack kept bytes.
3076#[cfg(target_arch = "x86_64")]
3077#[target_feature(enable = "avx2")]
3078unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3079    use std::arch::x86_64::*;
3080
3081    unsafe {
3082        let n = src.len();
3083        let sp = src.as_ptr();
3084        let dp = dst.as_mut_ptr();
3085        let mut ri = 0;
3086        let mut wp = 0;
3087
3088        // Load the 256-bit membership bitset into an AVX2 register.
3089        // Byte i of member_v has bits set for characters in [i*8..i*8+7].
3090        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3091
3092        // For each input byte B, we check: member[B >> 3] & (1 << (B & 7))
3093        // Using SIMD: extract byte index (B >> 3) and bit position (B & 7).
3094        let mask7 = _mm256_set1_epi8(7);
3095        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3096
3097        // Lookup table for (1 << (x & 7)): pshufb acts as a per-byte "1 << n" lookup,
3098        // since AVX2 has no per-byte variable shift (_mm256_sllv_epi32 works on 32-bit lanes).
3099        let bit_table = _mm256_setr_epi8(
3100            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3101            0, 0, 0, 0, 0, 0, 0, 0,
3102        );
3103
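        // Scalar equivalent of the per-lane membership test below (comment added
        // for illustration): a byte b is KEPT iff
        //     member[(b >> 3) as usize] & (1u8 << (b & 7)) == 0
        // i.e. its bit is not set in the 256-bit delete bitset.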
3104        while ri + 32 <= n {
3105            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3106
3107            // byte_idx = input >> 3 (which byte of the 32-byte member set)
3108            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3109            // bit_pos = input & 7 (which bit within that byte)
3110            let bit_pos = _mm256_and_si256(input, mask7);
3111            // bit_mask = 1 << bit_pos (per-byte via shuffle lookup)
3112            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3113
3114            // member_byte = shuffle member_v by byte_idx (pshufb)
3115            // But pshufb only works within 128-bit lanes. We need cross-lane.
3116            // Since member is 32 bytes and byte_idx can be 0-31, we need
3117            // a different approach. Use two pshufb + blend:
3118            // lo_half = pshufb(member[0..15], byte_idx)
3119            // hi_half = pshufb(member[16..31], byte_idx - 16)
3120            // select = byte_idx >= 16
3121            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3122            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3123            let lo_mask = _mm256_set1_epi8(0x0F);
3124            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3125            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3126            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3127            // select hi when byte_idx >= 16 (bit 4 set)
3128            let use_hi = _mm256_slli_epi16(byte_idx, 3); // shift bit 4 to bit 7
3129            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3130
3131            // Check: (member_byte & bit_mask) != 0 → byte is in delete set
3132            let test = _mm256_and_si256(member_byte, bit_mask);
3133            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3134            // keep_mask: bit set = byte should be KEPT (not in delete set)
3135            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3136
3137            if keep_mask == 0xFFFFFFFF {
3138                // All 32 bytes kept — bulk copy
3139                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3140                wp += 32;
3141            } else if keep_mask != 0 {
3142                // Partial keep — compact 8 bytes at a time
3143                let m0 = keep_mask as u8;
3144                let m1 = (keep_mask >> 8) as u8;
3145                let m2 = (keep_mask >> 16) as u8;
3146                let m3 = (keep_mask >> 24) as u8;
3147
3148                if m0 == 0xFF {
3149                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3150                } else if m0 != 0 {
3151                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3152                }
3153                let c0 = m0.count_ones() as usize;
3154
3155                if m1 == 0xFF {
3156                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3157                } else if m1 != 0 {
3158                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3159                }
3160                let c1 = m1.count_ones() as usize;
3161
3162                if m2 == 0xFF {
3163                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3164                } else if m2 != 0 {
3165                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3166                }
3167                let c2 = m2.count_ones() as usize;
3168
3169                if m3 == 0xFF {
3170                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3171                } else if m3 != 0 {
3172                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3173                }
3174                let c3 = m3.count_ones() as usize;
3175                wp += c0 + c1 + c2 + c3;
3176            }
3177            // else: all 32 bytes deleted, wp unchanged
3178            ri += 32;
3179        }
3180
3181        // Scalar tail
3182        while ri < n {
3183            let b = *sp.add(ri);
3184            *dp.add(wp) = b;
3185            wp += !is_member(member, b) as usize;
3186            ri += 1;
3187        }
3188
3189        wp
3190    }
3191}
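
// Illustrative sketch (not part of the original source): checks that the AVX2
// bitset delete agrees with the scalar fallback on a buffer mixing kept and
// deleted bytes (length not a multiple of 32, so the scalar tail runs too).
// Gated on runtime AVX2 detection.
#[cfg(all(test, target_arch = "x86_64"))]
mod delete_bitset_equivalence_sketch {
    use super::*;

    #[test]
    fn avx2_matches_scalar() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let member = build_member_set(b"aeiou");
        let src: Vec<u8> = (0..1000u32).map(|i| (i % 251) as u8).collect();
        // Small slack so the 8-byte block stores near the end stay in bounds.
        let mut dst_scalar = vec![0u8; src.len() + 32];
        let mut dst_avx2 = vec![0u8; src.len() + 32];
        let n_scalar = delete_bitset_scalar(&src, &mut dst_scalar, &member);
        let n_avx2 = unsafe { delete_bitset_avx2_stream(&src, &mut dst_avx2, &member) };
        assert_eq!(n_scalar, n_avx2);
        assert_eq!(&dst_scalar[..n_scalar], &dst_avx2[..n_avx2]);
    }
}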
3192
3193fn delete_single_streaming(
3194    ch: u8,
3195    reader: &mut impl Read,
3196    writer: &mut impl Write,
3197) -> io::Result<()> {
3198    let mut buf = alloc_uninit_vec(STREAM_BUF);
3199    loop {
3200        let n = read_once(reader, &mut buf)?;
3201        if n == 0 {
3202            break;
3203        }
3204        let wp = delete_single_inplace(&mut buf, n, ch);
3205        if wp > 0 {
3206            writer.write_all(&buf[..wp])?;
3207        }
3208    }
3209    Ok(())
3210}
3211
3212/// In-place single-char delete using memchr gap-copy.
3213#[inline]
3214fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3215    let mut wp = 0;
3216    let mut i = 0;
3217    while i < n {
3218        match memchr::memchr(ch, &buf[i..n]) {
3219            Some(offset) => {
3220                if offset > 0 {
3221                    if wp != i {
3222                        unsafe {
3223                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3224                        }
3225                    }
3226                    wp += offset;
3227                }
3228                i += offset + 1;
3229            }
3230            None => {
3231                let run_len = n - i;
3232                if run_len > 0 {
3233                    if wp != i {
3234                        unsafe {
3235                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3236                        }
3237                    }
3238                    wp += run_len;
3239                }
3240                break;
3241            }
3242        }
3243    }
3244    wp
3245}
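
// Illustrative sketch (not part of the original source): delete_single_inplace
// compacts the buffer in place and returns the number of kept bytes.
#[cfg(test)]
mod delete_single_inplace_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let mut buf = b"banana".to_vec();
        let n = buf.len();
        let wp = delete_single_inplace(&mut buf, n, b'a');
        assert_eq!(&buf[..wp], &b"bnn"[..]);
    }
}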
3246
3247fn delete_multi_streaming(
3248    chars: &[u8],
3249    reader: &mut impl Read,
3250    writer: &mut impl Write,
3251) -> io::Result<()> {
3252    let mut buf = alloc_uninit_vec(STREAM_BUF);
3253    loop {
3254        let n = read_once(reader, &mut buf)?;
3255        if n == 0 {
3256            break;
3257        }
3258        let wp = delete_multi_inplace(&mut buf, n, chars);
3259        if wp > 0 {
3260            writer.write_all(&buf[..wp])?;
3261        }
3262    }
3263    Ok(())
3264}
3265
3266/// In-place multi-char delete using memchr2/memchr3 gap-copy.
3267#[inline]
3268fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3269    let mut wp = 0;
3270    let mut i = 0;
3271    while i < n {
3272        let found = if chars.len() == 2 {
3273            memchr::memchr2(chars[0], chars[1], &buf[i..n])
3274        } else {
3275            memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3276        };
3277        match found {
3278            Some(offset) => {
3279                if offset > 0 {
3280                    if wp != i {
3281                        unsafe {
3282                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3283                        }
3284                    }
3285                    wp += offset;
3286                }
3287                i += offset + 1;
3288            }
3289            None => {
3290                let run_len = n - i;
3291                if run_len > 0 {
3292                    if wp != i {
3293                        unsafe {
3294                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3295                        }
3296                    }
3297                    wp += run_len;
3298                }
3299                break;
3300            }
3301        }
3302    }
3303    wp
3304}
3305
3306pub fn delete_squeeze(
3307    delete_chars: &[u8],
3308    squeeze_chars: &[u8],
3309    reader: &mut impl Read,
3310    writer: &mut impl Write,
3311) -> io::Result<()> {
3312    let delete_set = build_member_set(delete_chars);
3313    let squeeze_set = build_member_set(squeeze_chars);
3314    let mut buf = alloc_uninit_vec(STREAM_BUF);
3315    let mut last_squeezed: u16 = 256;
3316
3317    loop {
3318        let n = read_once(reader, &mut buf)?;
3319        if n == 0 {
3320            break;
3321        }
3322        let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3323        if wp > 0 {
3324            writer.write_all(&buf[..wp])?;
3325        }
3326    }
3327    Ok(())
3328}
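
// Illustrative sketch (not part of the original source): delete_squeeze drops
// the delete set and squeezes runs from the squeeze set in one pass, matching
// `tr -ds`. Deleting the digits here leaves a run of spaces that collapses.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn delete_then_squeeze() {
        let mut out = Vec::new();
        delete_squeeze(b"0123456789", b" ", &mut &b"a1 2 3b"[..], &mut out).unwrap();
        assert_eq!(out, b"a b".to_vec());
    }
}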
3329
3330#[inline]
3331fn delete_squeeze_inplace(
3332    buf: &mut [u8],
3333    n: usize,
3334    delete_set: &[u8; 32],
3335    squeeze_set: &[u8; 32],
3336    last_squeezed: &mut u16,
3337) -> usize {
3338    let mut wp = 0;
3339    unsafe {
3340        let ptr = buf.as_mut_ptr();
3341        let mut i = 0;
3342        while i + 8 <= n {
3343            macro_rules! process_byte {
3344                ($off:expr) => {
3345                    let b = *ptr.add(i + $off);
3346                    if !is_member(delete_set, b) {
3347                        if is_member(squeeze_set, b) {
3348                            if *last_squeezed != b as u16 {
3349                                *last_squeezed = b as u16;
3350                                *ptr.add(wp) = b;
3351                                wp += 1;
3352                            }
3353                        } else {
3354                            *last_squeezed = 256;
3355                            *ptr.add(wp) = b;
3356                            wp += 1;
3357                        }
3358                    }
3359                };
3360            }
3361            process_byte!(0);
3362            process_byte!(1);
3363            process_byte!(2);
3364            process_byte!(3);
3365            process_byte!(4);
3366            process_byte!(5);
3367            process_byte!(6);
3368            process_byte!(7);
3369            i += 8;
3370        }
3371        while i < n {
3372            let b = *ptr.add(i);
3373            if !is_member(delete_set, b) {
3374                if is_member(squeeze_set, b) {
3375                    if *last_squeezed != b as u16 {
3376                        *last_squeezed = b as u16;
3377                        *ptr.add(wp) = b;
3378                        wp += 1;
3379                    }
3380                } else {
3381                    *last_squeezed = 256;
3382                    *ptr.add(wp) = b;
3383                    wp += 1;
3384                }
3385            }
3386            i += 1;
3387        }
3388    }
3389    wp
3390}
3391
3392pub fn squeeze(
3393    squeeze_chars: &[u8],
3394    reader: &mut impl Read,
3395    writer: &mut impl Write,
3396) -> io::Result<()> {
3397    if squeeze_chars.len() == 1 {
3398        return squeeze_single_stream(squeeze_chars[0], reader, writer);
3399    }
3400
3401    // For 2-3 squeeze chars, use memchr2/memchr3-based gap-copy
3402    // which gives SIMD-accelerated scanning instead of byte-at-a-time.
3403    if squeeze_chars.len() <= 3 {
3404        return squeeze_multi_stream(squeeze_chars, reader, writer);
3405    }
3406
3407    let member = build_member_set(squeeze_chars);
3408    let mut buf = alloc_uninit_vec(STREAM_BUF);
3409    let mut last_squeezed: u16 = 256;
3410
3411    loop {
3412        let n = read_once(reader, &mut buf)?;
3413        if n == 0 {
3414            break;
3415        }
3416        let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3417        if wp > 0 {
3418            writer.write_all(&buf[..wp])?;
3419        }
3420    }
3421    Ok(())
3422}
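
// Illustrative sketch (not part of the original source): the public `squeeze`
// entry point, covering the memmem single-char path and the memchr2 two-char path.
#[cfg(test)]
mod squeeze_sketch {
    use super::*;

    #[test]
    fn squeeze_single_and_multi() {
        let mut out = Vec::new();
        squeeze(b"l", &mut &b"helllo"[..], &mut out).unwrap();
        assert_eq!(out, b"helo".to_vec());

        let mut out = Vec::new();
        squeeze(b" -", &mut &b"a  --  b"[..], &mut out).unwrap();
        assert_eq!(out, b"a - b".to_vec());
    }
}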
3423
3424#[inline]
3425fn squeeze_inplace_bitset(
3426    buf: &mut [u8],
3427    n: usize,
3428    member: &[u8; 32],
3429    last_squeezed: &mut u16,
3430) -> usize {
3431    let mut wp = 0;
3432    unsafe {
3433        let ptr = buf.as_mut_ptr();
3434        for i in 0..n {
3435            let b = *ptr.add(i);
3436            if is_member(member, b) {
3437                if *last_squeezed == b as u16 {
3438                    continue;
3439                }
3440                *last_squeezed = b as u16;
3441            } else {
3442                *last_squeezed = 256;
3443            }
3444            *ptr.add(wp) = b;
3445            wp += 1;
3446        }
3447    }
3448    wp
3449}
3450
3451/// Streaming squeeze for 2-3 chars using memchr2/memchr3 SIMD scanning.
3452/// Each read chunk is compacted in-place (gap-copy between squeeze points),
3453/// skipping duplicate runs of squeezable characters, then written with one write_all.
3454fn squeeze_multi_stream(
3455    chars: &[u8],
3456    reader: &mut impl Read,
3457    writer: &mut impl Write,
3458) -> io::Result<()> {
3459    let c0 = chars[0];
3460    let c1 = chars[1];
3461    let c2 = if chars.len() >= 3 {
3462        Some(chars[2])
3463    } else {
3464        None
3465    };
3466
3467    let mut buf = alloc_uninit_vec(STREAM_BUF);
3468    let mut last_squeezed: u16 = 256;
3469
3470    loop {
3471        let n = read_once(reader, &mut buf)?;
3472        if n == 0 {
3473            break;
3474        }
3475        let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3476        if wp > 0 {
3477            writer.write_all(&buf[..wp])?;
3478        }
3479    }
3480    Ok(())
3481}
3482
3483/// In-place multi-char squeeze using memchr2/memchr3 gap-copy.
3484#[inline]
3485fn squeeze_multi_compact(
3486    buf: &mut [u8],
3487    n: usize,
3488    c0: u8,
3489    c1: u8,
3490    c2: Option<u8>,
3491    last_squeezed: &mut u16,
3492) -> usize {
3493    let ptr = buf.as_mut_ptr();
3494    let mut wp = 0usize;
3495    let mut cursor = 0usize;
3496
3497    while cursor < n {
3498        let found = if let Some(c) = c2 {
3499            memchr::memchr3(c0, c1, c, &buf[cursor..n])
3500        } else {
3501            memchr::memchr2(c0, c1, &buf[cursor..n])
3502        };
3503        match found {
3504            Some(offset) => {
3505                let pos = cursor + offset;
3506                let b = unsafe { *ptr.add(pos) };
3507
3508                let gap = pos - cursor;
3509                if gap > 0 {
3510                    if wp != cursor {
3511                        unsafe {
3512                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3513                        }
3514                    }
3515                    wp += gap;
3516                    *last_squeezed = 256;
3517                }
3518
3519                if *last_squeezed != b as u16 {
3520                    unsafe { *ptr.add(wp) = b };
3521                    wp += 1;
3522                    *last_squeezed = b as u16;
3523                }
3524
3525                cursor = pos + 1;
3526                while cursor < n && unsafe { *ptr.add(cursor) } == b {
3527                    cursor += 1;
3528                }
3529            }
3530            None => {
3531                let rem = n - cursor;
3532                if rem > 0 {
3533                    if wp != cursor {
3534                        unsafe {
3535                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3536                        }
3537                    }
3538                    wp += rem;
3539                    *last_squeezed = 256;
3540                }
3541                break;
3542            }
3543        }
3544    }
3545    wp
3546}
3547
3548fn squeeze_single_stream(
3549    ch: u8,
3550    reader: &mut impl Read,
3551    writer: &mut impl Write,
3552) -> io::Result<()> {
3553    let pair = [ch, ch];
3554    let finder = memchr::memmem::Finder::new(&pair);
3555    let mut buf = alloc_uninit_vec(STREAM_BUF);
3556    let mut was_squeeze_char = false;
3557
3558    loop {
3559        let n = read_once(reader, &mut buf)?;
3560        if n == 0 {
3561            break;
3562        }
3563        let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3564        if wp > 0 {
3565            writer.write_all(&buf[..wp])?;
3566        }
3567    }
3568    Ok(())
3569}
3570
3571/// In-place squeeze compaction for single-char using memmem.
3572#[inline]
3573fn squeeze_single_compact(
3574    buf: &mut [u8],
3575    n: usize,
3576    ch: u8,
3577    finder: &memchr::memmem::Finder<'_>,
3578    was_squeeze_char: &mut bool,
3579) -> usize {
3580    let mut i = 0;
3581
3582    // Handle carry-over from previous flush
3583    if *was_squeeze_char {
3584        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3585            i += 1;
3586        }
3587        *was_squeeze_char = false;
3588        if i >= n {
3589            *was_squeeze_char = true;
3590            return 0;
3591        }
3592    }
3593
3594    let ptr = buf.as_mut_ptr();
3595    let mut wp = 0usize;
3596
3597    loop {
3598        match finder.find(&buf[i..n]) {
3599            Some(offset) => {
3600                let seg_end = i + offset + 1;
3601                let gap = seg_end - i;
3602                if gap > 0 {
3603                    if wp != i {
3604                        unsafe {
3605                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3606                        }
3607                    }
3608                    wp += gap;
3609                }
3610                i = seg_end;
3611                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3612                    i += 1;
3613                }
3614                if i >= n {
3615                    *was_squeeze_char = true;
3616                    break;
3617                }
3618            }
3619            None => {
3620                let rem = n - i;
3621                if rem > 0 {
3622                    if wp != i {
3623                        unsafe {
3624                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3625                        }
3626                    }
3627                    wp += rem;
3628                }
3629                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3630                break;
3631            }
3632        }
3633    }
3634    wp
3635}
3636
3637// ============================================================================
3638// Batch in-place functions (owned data from piped stdin)
3639// ============================================================================
3640
3641/// Translate bytes in-place on an owned buffer, then write.
3642/// For piped stdin where we own the data, this avoids the separate output buffer
3643/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
3644pub fn translate_owned(
3645    set1: &[u8],
3646    set2: &[u8],
3647    data: &mut [u8],
3648    writer: &mut impl Write,
3649) -> io::Result<()> {
3650    let table = build_translate_table(set1, set2);
3651
3652    // Identity table — pure passthrough
3653    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3654    if is_identity {
3655        return writer.write_all(data);
3656    }
3657
3658    // For owned data (piped stdin), rayon's thread pool initialization cost
3659    // (~0.5ms) dominates for data < 32MB. AVX2 processes ~20GB/s per core,
3660    // so 10MB takes ~0.5ms single-threaded. Rayon only helps for >= 32MB
3661    // where the parallel savings clearly exceed the thread pool overhead.
3662    const OWNED_PARALLEL_MIN: usize = 32 * 1024 * 1024;
3663
3664    // SIMD range fast path (in-place)
3665    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3666        if data.len() >= OWNED_PARALLEL_MIN {
3667            let n_threads = rayon::current_num_threads().max(1);
3668            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3669            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3670                translate_range_simd_inplace(chunk, lo, hi, offset);
3671            });
3672        } else {
3673            translate_range_simd_inplace(data, lo, hi, offset);
3674        }
3675        return writer.write_all(data);
3676    }
3677
3678    // SIMD range-to-constant fast path (in-place)
3679    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3680        if data.len() >= OWNED_PARALLEL_MIN {
3681            let n_threads = rayon::current_num_threads().max(1);
3682            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3683            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3684                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3685            });
3686        } else {
3687            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3688        }
3689        return writer.write_all(data);
3690    }
3691
3692    // General table lookup (in-place)
3693    if data.len() >= OWNED_PARALLEL_MIN {
3694        let n_threads = rayon::current_num_threads().max(1);
3695        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3696        data.par_chunks_mut(chunk_size).for_each(|chunk| {
3697            translate_inplace(chunk, &table);
3698        });
3699    } else {
3700        translate_inplace(data, &table);
3701    }
3702    writer.write_all(data)
3703}
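
// Illustrative sketch (not part of the original source), assuming
// build_translate_table follows POSIX tr semantics (set1[i] maps to set2[i]):
// translate_owned rewrites the owned buffer in place and writes it out, here
// via the constant-offset range fast path (a..=c shifted by +23).
#[cfg(test)]
mod translate_owned_sketch {
    use super::*;

    #[test]
    fn maps_bytes_in_place() {
        let mut data = b"abcabc!".to_vec();
        let mut out = Vec::new();
        translate_owned(b"abc", b"xyz", &mut data, &mut out).unwrap();
        assert_eq!(out, b"xyzxyz!".to_vec());
    }
}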
3704
3705// ============================================================================
3706// Mmap-based functions (zero-copy input from byte slice)
3707// ============================================================================
3708
3709/// Translate bytes from an mmap'd byte slice.
3710/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
3711/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
3712/// Falls back to scalar 256-byte table lookup for general translations.
3713///
3714/// For data >= PARALLEL_THRESHOLD (4MB): rayon parallel translate into a
3715/// full-size buffer, then a single write_all.
3716/// For smaller data: chunked translate through a 256KB buffer that fits in L2,
3717/// one write_all per chunk.
3718pub fn translate_mmap(
3719    set1: &[u8],
3720    set2: &[u8],
3721    data: &[u8],
3722    writer: &mut impl Write,
3723) -> io::Result<()> {
3724    let table = build_translate_table(set1, set2);
3725
3726    // Check if table is identity — pure passthrough
3727    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3728    if is_identity {
3729        return writer.write_all(data);
3730    }
3731
3732    // Try SIMD fast path for single-range constant-offset translations
3733    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3734        return translate_mmap_range(data, writer, lo, hi, offset);
3735    }
3736
3737    // Try SIMD fast path for range-to-constant translations
3738    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3739        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3740    }
3741
3742    // General case: table lookup (with parallel processing for large data)
3743    translate_mmap_table(data, writer, &table)
3744}
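
// Illustrative sketch (not part of the original source), assuming the caller
// has already expanded ranges into explicit byte sets (as the CLI would for
// `tr 'a-z' 'A-Z'`): this drives translate_mmap through the SIMD
// constant-offset path picked by detect_range_offset.
#[cfg(test)]
mod translate_mmap_sketch {
    use super::*;

    #[test]
    fn uppercases_via_range_offset_path() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut out = Vec::new();
        translate_mmap(&set1, &set2, b"tr 2024!", &mut out).unwrap();
        assert_eq!(out, b"TR 2024!".to_vec());
    }
}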
3745
3746/// SIMD range translate for mmap data, with rayon parallel processing.
3747fn translate_mmap_range(
3748    data: &[u8],
3749    writer: &mut impl Write,
3750    lo: u8,
3751    hi: u8,
3752    offset: i8,
3753) -> io::Result<()> {
3754    // Parallel path: split data into chunks, translate each in parallel
3755    if data.len() >= PARALLEL_THRESHOLD {
3756        let mut buf = alloc_uninit_vec(data.len());
3757        let n_threads = rayon::current_num_threads().max(1);
3758        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3759
3760        // Process chunks in parallel: each thread writes to its slice of buf
3761        data.par_chunks(chunk_size)
3762            .zip(buf.par_chunks_mut(chunk_size))
3763            .for_each(|(src_chunk, dst_chunk)| {
3764                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3765            });
3766
3767        return writer.write_all(&buf);
3768    }
3769
3770    // Chunked SIMD translate: 256KB buffer fits in L2 cache.
3771    const CHUNK: usize = 256 * 1024;
3772    let buf_size = data.len().min(CHUNK);
3773    let mut buf = alloc_uninit_vec(buf_size);
3774    for chunk in data.chunks(CHUNK) {
3775        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3776        writer.write_all(&buf[..chunk.len()])?;
3777    }
3778    Ok(())
3779}
3780
3781/// SIMD range-to-constant translate for mmap data.
3782/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
3783fn translate_mmap_range_to_constant(
3784    data: &[u8],
3785    writer: &mut impl Write,
3786    lo: u8,
3787    hi: u8,
3788    replacement: u8,
3789) -> io::Result<()> {
3790    // For mmap data (read-only), copy to buffer and translate in-place
3791    if data.len() >= PARALLEL_THRESHOLD {
3792        let mut buf = alloc_uninit_vec(data.len());
3793        let n_threads = rayon::current_num_threads().max(1);
3794        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3795
3796        // Copy + translate in parallel
3797        data.par_chunks(chunk_size)
3798            .zip(buf.par_chunks_mut(chunk_size))
3799            .for_each(|(src_chunk, dst_chunk)| {
3800                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3801                translate_range_to_constant_simd_inplace(
3802                    &mut dst_chunk[..src_chunk.len()],
3803                    lo,
3804                    hi,
3805                    replacement,
3806                );
3807            });
3808
3809        return writer.write_all(&buf);
3810    }
3811
3812    // Chunked translate: 256KB buffer fits in L2 cache.
3813    const CHUNK: usize = 256 * 1024;
3814    let buf_size = data.len().min(CHUNK);
3815    let mut buf = alloc_uninit_vec(buf_size);
3816    for chunk in data.chunks(CHUNK) {
3817        buf[..chunk.len()].copy_from_slice(chunk);
3818        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3819        writer.write_all(&buf[..chunk.len()])?;
3820    }
3821    Ok(())
3822}
3823
3824/// General table-lookup translate for mmap data, with rayon parallel processing.
3825fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3826    // Parallel path: split data into chunks, translate each in parallel
3827    if data.len() >= PARALLEL_THRESHOLD {
3828        let mut buf = alloc_uninit_vec(data.len());
3829        let n_threads = rayon::current_num_threads().max(1);
3830        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3831
3832        data.par_chunks(chunk_size)
3833            .zip(buf.par_chunks_mut(chunk_size))
3834            .for_each(|(src_chunk, dst_chunk)| {
3835                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3836            });
3837
3838        return writer.write_all(&buf);
3839    }
3840
3841    // Chunked translate: 256KB buffer fits in L2 cache.
3842    const CHUNK: usize = 256 * 1024;
3843    let buf_size = data.len().min(CHUNK);
3844    let mut buf = alloc_uninit_vec(buf_size);
3845    for chunk in data.chunks(CHUNK) {
3846        translate_to(chunk, &mut buf[..chunk.len()], table);
3847        writer.write_all(&buf[..chunk.len()])?;
3848    }
3849    Ok(())
3850}
3851
3852/// Translate bytes in-place on a mutable buffer (e.g., MAP_PRIVATE mmap).
3853/// For data below SEPARATE_BUF_THRESHOLD (64MB), translation is routed through a
3854/// separate output buffer (translate_to_separate_buf) to avoid COW page faults.
3855/// Above that threshold the in-place path avoids the output allocation entirely;
3856/// the kernel's COW semantics mean only modified pages are physically copied.
3857/// Uses rayon parallel chunks for data >= PARALLEL_THRESHOLD, else a single thread.
3858pub fn translate_mmap_inplace(
3859    set1: &[u8],
3860    set2: &[u8],
3861    data: &mut [u8],
3862    writer: &mut impl Write,
3863) -> io::Result<()> {
3864    let table = build_translate_table(set1, set2);
3865
3866    // Check if table is identity — pure passthrough
3867    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3868    if is_identity {
3869        return writer.write_all(data);
3870    }
3871
3872    // For data that's being translated in a MAP_PRIVATE mmap, every modified page
3873    // triggers a COW fault. For small-to-medium files where most bytes change,
3874    // reading from mmap (read-only) + writing to a separate heap buffer is faster
3875    // because it avoids COW faults entirely. The output buffer is fresh memory
3876    // (no COW), and the input mmap stays read-only (MADV_SEQUENTIAL).
3877    // Threshold: 64MB. For benchmark-sized files (10MB), avoid COW entirely.
3878    const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;
3879
3880    if data.len() < SEPARATE_BUF_THRESHOLD {
3881        return translate_to_separate_buf(data, &table, writer);
3882    }
3883
3884    // Try SIMD fast path for single-range constant-offset translations (e.g., a-z -> A-Z)
3885    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3886        if data.len() >= PARALLEL_THRESHOLD {
3887            let n_threads = rayon::current_num_threads().max(1);
3888            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3889            data.par_chunks_mut(chunk_size)
3890                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3891        } else {
3892            translate_range_simd_inplace(data, lo, hi, offset);
3893        }
3894        return writer.write_all(data);
3895    }
3896
3897    // Try SIMD fast path for range-to-constant translations
3898    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3899        if data.len() >= PARALLEL_THRESHOLD {
3900            let n_threads = rayon::current_num_threads().max(1);
3901            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3902            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3903                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3904            });
3905        } else {
3906            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3907        }
3908        return writer.write_all(data);
3909    }
3910
3911    // General case: in-place table lookup
3912    if data.len() >= PARALLEL_THRESHOLD {
3913        let n_threads = rayon::current_num_threads().max(1);
3914        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3915        data.par_chunks_mut(chunk_size)
3916            .for_each(|chunk| translate_inplace(chunk, &table));
3917    } else {
3918        translate_inplace(data, &table);
3919    }
3920    writer.write_all(data)
3921}
3922
3923/// Translate from read-only source to a separate output buffer, avoiding COW faults.
3924/// Uses the appropriate SIMD path (range offset, range-to-constant, or general nibble).
3925///
3926/// For data >= PARALLEL_THRESHOLD: parallel chunked translate into full-size buffer.
3927/// For smaller data: single full-size allocation + single write_all for minimum
3928/// syscall overhead. At 10MB, the allocation is cheap and a single write() is faster
3929/// than multiple 4MB chunked writes.
3930fn translate_to_separate_buf(
3931    data: &[u8],
3932    table: &[u8; 256],
3933    writer: &mut impl Write,
3934) -> io::Result<()> {
3935    let range_info = detect_range_offset(table);
3936    let const_info = if range_info.is_none() {
3937        detect_range_to_constant(table)
3938    } else {
3939        None
3940    };
3941
3942    if data.len() >= PARALLEL_THRESHOLD {
3943        // Parallel path: full-size output buffer, parallel translate, single write.
3944        let mut out_buf = alloc_uninit_vec(data.len());
3945        let n_threads = rayon::current_num_threads().max(1);
3946        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3947
3948        if let Some((lo, hi, offset)) = range_info {
3949            data.par_chunks(chunk_size)
3950                .zip(out_buf.par_chunks_mut(chunk_size))
3951                .for_each(|(src, dst)| {
3952                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3953                });
3954        } else if let Some((lo, hi, replacement)) = const_info {
3955            data.par_chunks(chunk_size)
3956                .zip(out_buf.par_chunks_mut(chunk_size))
3957                .for_each(|(src, dst)| {
3958                    translate_range_to_constant_simd(
3959                        src,
3960                        &mut dst[..src.len()],
3961                        lo,
3962                        hi,
3963                        replacement,
3964                    );
3965                });
3966        } else {
3967            data.par_chunks(chunk_size)
3968                .zip(out_buf.par_chunks_mut(chunk_size))
3969                .for_each(|(src, dst)| {
3970                    translate_to(src, &mut dst[..src.len()], table);
3971                });
3972        }
3973        return writer.write_all(&out_buf);
3974    }
3975
3976    // Single-allocation translate: full-size output buffer, single translate, single write.
3977    // For 10MB data, this does 1 write() instead of 40 chunked writes, eliminating
3978    // 39 write() syscalls. SIMD translate streams through src and dst sequentially,
3979    // so the L2 cache argument for 256KB chunks doesn't apply (src data doesn't fit
3980    // in L2 anyway). The reduced syscall overhead more than compensates.
3981    let mut out_buf = alloc_uninit_vec(data.len());
3982    if let Some((lo, hi, offset)) = range_info {
3983        translate_range_simd(data, &mut out_buf, lo, hi, offset);
3984    } else if let Some((lo, hi, replacement)) = const_info {
3985        translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3986    } else {
3987        translate_to(data, &mut out_buf, table);
3988    }
3989    writer.write_all(&out_buf)
3990}
3991
3992/// Translate from a read-only mmap (or any byte slice) to a separate output buffer.
3993/// Avoids MAP_PRIVATE COW page faults by reading from the original data and
3994/// writing to a freshly allocated heap buffer.
3995pub fn translate_mmap_readonly(
3996    set1: &[u8],
3997    set2: &[u8],
3998    data: &[u8],
3999    writer: &mut impl Write,
4000) -> io::Result<()> {
4001    let table = build_translate_table(set1, set2);
4002
4003    // Check if table is identity — pure passthrough
4004    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
4005    if is_identity {
4006        return writer.write_all(data);
4007    }
4008
4009    translate_to_separate_buf(data, &table, writer)
4010}
4011
4012/// Translate + squeeze from mmap'd byte slice.
4013///
4014/// For data >= PARALLEL_THRESHOLD (4MB): two-phase approach (parallel translate
4015/// into a full-size buffer, then sequential in-place squeeze), one write_all.
4016/// For smaller data: single-pass translate+squeeze into one buffer, one write syscall.
4017pub fn translate_squeeze_mmap(
4018    set1: &[u8],
4019    set2: &[u8],
4020    data: &[u8],
4021    writer: &mut impl Write,
4022) -> io::Result<()> {
4023    let table = build_translate_table(set1, set2);
4024    let squeeze_set = build_member_set(set2);
4025
4026    // For large data: two-phase approach
4027    // Phase 1: parallel translate into buffer
4028    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
4029    //          (squeeze only removes bytes, never grows, so no second allocation needed)
4030    if data.len() >= PARALLEL_THRESHOLD {
4031        // Phase 1: parallel translate
4032        let mut translated = alloc_uninit_vec(data.len());
4033        let range_info = detect_range_offset(&table);
4034        let n_threads = rayon::current_num_threads().max(1);
4035        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4036
4037        if let Some((lo, hi, offset)) = range_info {
4038            data.par_chunks(chunk_size)
4039                .zip(translated.par_chunks_mut(chunk_size))
4040                .for_each(|(src_chunk, dst_chunk)| {
4041                    translate_range_simd(
4042                        src_chunk,
4043                        &mut dst_chunk[..src_chunk.len()],
4044                        lo,
4045                        hi,
4046                        offset,
4047                    );
4048                });
4049        } else {
4050            data.par_chunks(chunk_size)
4051                .zip(translated.par_chunks_mut(chunk_size))
4052                .for_each(|(src_chunk, dst_chunk)| {
4053                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
4054                });
4055        }
4056
4057        // Phase 2: squeeze in-place on the translated buffer.
4058        // Since squeeze only removes bytes (never grows), we can read ahead and
4059        // compact into the same buffer, saving a full data.len() heap allocation.
4060        let mut last_squeezed: u16 = 256;
4061        let len = translated.len();
4062        let mut wp = 0;
4063        unsafe {
4064            let ptr = translated.as_mut_ptr();
4065            let mut i = 0;
4066            while i < len {
4067                let b = *ptr.add(i);
4068                if is_member(&squeeze_set, b) {
4069                    if last_squeezed == b as u16 {
4070                        i += 1;
4071                        continue;
4072                    }
4073                    last_squeezed = b as u16;
4074                } else {
4075                    last_squeezed = 256;
4076                }
4077                *ptr.add(wp) = b;
4078                wp += 1;
4079                i += 1;
4080            }
4081        }
4082        return writer.write_all(&translated[..wp]);
4083    }
4084
4085    // Single-allocation translate+squeeze: full-size buffer, single write_all.
4086    // For 10MB data, this does 1 write() instead of ~40 chunked writes.
4087    let mut buf = alloc_uninit_vec(data.len());
4088    translate_to(data, &mut buf, &table);
4089    let mut last_squeezed: u16 = 256;
4090    let mut wp = 0;
4091    unsafe {
4092        let ptr = buf.as_mut_ptr();
4093        for i in 0..data.len() {
4094            let b = *ptr.add(i);
4095            if is_member(&squeeze_set, b) {
4096                if last_squeezed == b as u16 {
4097                    continue;
4098                }
4099                last_squeezed = b as u16;
4100            } else {
4101                last_squeezed = 256;
4102            }
4103            *ptr.add(wp) = b;
4104            wp += 1;
4105        }
4106    }
4107    writer.write_all(&buf[..wp])
4108}
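
// Illustrative sketch (not part of the original source), assuming POSIX tr
// semantics for build_translate_table: translate set1 into set2, then squeeze
// runs of set2 bytes, matching `tr -s set1 set2`.
#[cfg(test)]
mod translate_squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn translate_then_squeeze() {
        let mut out = Vec::new();
        translate_squeeze_mmap(b"abc", b"xxx", b"aabbcc d", &mut out).unwrap();
        assert_eq!(out, b"x d".to_vec());
    }
}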
4109
4110/// Delete from mmap'd byte slice.
4111///
4112/// Dispatch: memchr/memchr2/memchr3 paths for 1-3 delete chars, SIMD range compare
4113/// for contiguous ranges, otherwise a bitset path with a density heuristic
4114/// (zero-copy writev when sparse; parallel or streaming 256KB compact when dense).
4115pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4116    if delete_chars.len() == 1 {
4117        return delete_single_char_mmap(delete_chars[0], data, writer);
4118    }
4119    if delete_chars.len() <= 3 {
4120        return delete_multi_memchr_mmap(delete_chars, data, writer);
4121    }
4122
4123    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
4124    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4125        return delete_range_mmap(data, writer, lo, hi);
4126    }
4127
4128    let member = build_member_set(delete_chars);
4129
4130    // Heuristic: estimate total delete positions. Zero-copy writev is only efficient
4131    // when all gaps fit in a single writev call (< MAX_IOV/2 entries). With uniform
4132    // distribution, each delete creates an IoSlice entry. For many deletes (> 512),
4133    // multiple writev calls are needed, and the compact approach is faster.
4134    let sample_size = data.len().min(1024);
4135    let sample_deletes = data[..sample_size]
4136        .iter()
4137        .filter(|&&b| is_member(&member, b))
4138        .count();
4139    let estimated_deletes = if sample_size > 0 {
4140        data.len() * sample_deletes / sample_size
4141    } else {
4142        data.len()
4143    };
4144
4145    if estimated_deletes < MAX_IOV / 2 {
4146        return delete_bitset_zerocopy(data, &member, writer);
4147    }
4148
4149    // Dense delete: parallel compact with writev (avoids scatter-gather copy)
4150    if data.len() >= PARALLEL_THRESHOLD {
4151        let n_threads = rayon::current_num_threads().max(1);
4152        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4153
4154        let mut outbuf = alloc_uninit_vec(data.len());
4155        let chunk_lens: Vec<usize> = data
4156            .par_chunks(chunk_size)
4157            .zip(outbuf.par_chunks_mut(chunk_size))
4158            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4159            .collect();
4160
4161        // Use writev to write each chunk at its original position, avoiding
4162        // the O(N) scatter-gather memmove. With ~4 threads, that's 4 IoSlice
4163        // entries — far below MAX_IOV.
4164        let slices: Vec<std::io::IoSlice> = chunk_lens
4165            .iter()
4166            .enumerate()
4167            .filter(|&(_, &len)| len > 0)
4168            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4169            .collect();
4170        return write_ioslices(writer, &slices);
4171    }
4172
4173    // Streaming compact: 256KB output buffer reduces page fault overhead.
4174    // For 10MB data: ~64 page faults instead of ~2500, with ~40 write_all calls.
4175    const COMPACT_BUF: usize = 256 * 1024;
4176    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4177
4178    for chunk in data.chunks(COMPACT_BUF) {
4179        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4180        if out_pos > 0 {
4181            writer.write_all(&outbuf[..out_pos])?;
4182        }
4183    }
4184    Ok(())
4185}
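
// Illustrative sketch (not part of the original source): expected end-to-end
// behaviour of delete_mmap over an in-memory slice standing in for mmap'd data.
// Digits form a contiguous range (SIMD range path); "aeiou" is non-contiguous
// and takes the bitset path.
#[cfg(test)]
mod delete_mmap_sketch {
    use super::*;

    #[test]
    fn deletes_from_slice() {
        let mut out = Vec::new();
        delete_mmap(b"0123456789", b"r2d2!", &mut out).unwrap();
        assert_eq!(out, b"rd!".to_vec());

        let mut out = Vec::new();
        delete_mmap(b"aeiou", b"hello world", &mut out).unwrap();
        assert_eq!(out, b"hll wrld".to_vec());
    }
}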
4186
4187/// SIMD range delete for mmap data.
4188/// Uses a density heuristic: when the estimated delete positions fit in one writev
4189/// batch (< MAX_IOV/2 entries), uses zero-copy writev directly from mmap data (no
4190/// output buffer allocation); for dense deletes, SIMD compact into a buffer.
4191fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4192    // Sample first 1024 bytes to estimate delete density
4193    let sample_size = data.len().min(1024);
4194    let sample_deletes = data[..sample_size]
4195        .iter()
4196        .filter(|&&b| b >= lo && b <= hi)
4197        .count();
4198    // Estimate expected number of delete positions (IoSlice entries for zero-copy).
4199    // Each delete creates an IoSlice entry. With MAX_IOV=1024 per writev,
4200    // if estimated_deletes > MAX_IOV/2, the writev overhead from multiple syscalls
4201    // exceeds the compact approach cost. Only use zero-copy when all gaps fit in
4202    // a single writev call.
4203    let estimated_deletes = if sample_size > 0 {
4204        data.len() * sample_deletes / sample_size
4205    } else {
4206        data.len()
4207    };
4208    if estimated_deletes < MAX_IOV / 2 {
4209        return delete_range_mmap_zerocopy(data, writer, lo, hi);
4210    }
4211
4212    // Dense deletes: parallel compact with writev (avoids scatter-gather copy)
4213    if data.len() >= PARALLEL_THRESHOLD {
4214        let n_threads = rayon::current_num_threads().max(1);
4215        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4216
4217        let mut outbuf = alloc_uninit_vec(data.len());
4218        let chunk_lens: Vec<usize> = data
4219            .par_chunks(chunk_size)
4220            .zip(outbuf.par_chunks_mut(chunk_size))
4221            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
4222            .collect();
4223
4224        // Use writev to write each chunk at its original position, avoiding
4225        // the O(N) scatter-gather memmove.
4226        let slices: Vec<std::io::IoSlice> = chunk_lens
4227            .iter()
4228            .enumerate()
4229            .filter(|&(_, &len)| len > 0)
4230            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4231            .collect();
4232        return write_ioslices(writer, &slices);
4233    }
4234
4235    // Streaming compact: use 256KB output buffer instead of full data.len() buffer.
4236    // This reduces page fault overhead from ~2500 faults (10MB) to ~64 faults (256KB).
4237    // The extra write_all calls (~40 for 10MB) are negligible cost.
4238    const COMPACT_BUF: usize = 256 * 1024;
4239    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4240
4241    #[cfg(target_arch = "x86_64")]
4242    {
4243        let mut wp = 0;
4244        let level = get_simd_level();
4245        let len = data.len();
4246        let sp = data.as_ptr();
4247        let dp = outbuf.as_mut_ptr();
4248        let mut ri = 0;
4249
4250        if level >= 3 {
4251            use std::arch::x86_64::*;
4252            let range = hi - lo;
4253            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
4254            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
4255            let zero = unsafe { _mm256_setzero_si256() };
4256
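            // Worked example of the biased signed-compare trick (comment added for
            // illustration): for lo = b'0' (0x30), hi = b'9' (0x39), range = 9,
            // bias = 0x80 - 0x30 = 0x50 and threshold = 0x80 + 9 = 0x89 (-119 as i8).
            // b'5' (0x35) biases to 0x85 (-123) <= -119, so it is flagged in-range
            // (deleted); b'A' (0x41) biases to 0x91 (-111) > -119, so it is kept.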
4257            while ri + 32 <= len {
4258                // Flush when output buffer is nearly full
4259                if wp + 32 > COMPACT_BUF {
4260                    writer.write_all(&outbuf[..wp])?;
4261                    wp = 0;
4262                }
4263
4264                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
4265                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
4266                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
4267                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
4268                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
4269
4270                if keep_mask == 0xFFFFFFFF {
4271                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
4272                    wp += 32;
4273                } else if keep_mask != 0 {
4274                    let m0 = keep_mask as u8;
4275                    let m1 = (keep_mask >> 8) as u8;
4276                    let m2 = (keep_mask >> 16) as u8;
4277                    let m3 = (keep_mask >> 24) as u8;
4278
4279                    if m0 == 0xFF {
4280                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
4281                    } else if m0 != 0 {
4282                        unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
4283                    }
4284                    let c0 = m0.count_ones() as usize;
4285
4286                    if m1 == 0xFF {
4287                        unsafe {
4288                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4289                        };
4290                    } else if m1 != 0 {
4291                        unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
4292                    }
4293                    let c1 = m1.count_ones() as usize;
4294
4295                    if m2 == 0xFF {
4296                        unsafe {
4297                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4298                        };
4299                    } else if m2 != 0 {
4300                        unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4301                    }
4302                    let c2 = m2.count_ones() as usize;
4303
4304                    if m3 == 0xFF {
4305                        unsafe {
4306                            std::ptr::copy_nonoverlapping(
4307                                sp.add(ri + 24),
4308                                dp.add(wp + c0 + c1 + c2),
4309                                8,
4310                            )
4311                        };
4312                    } else if m3 != 0 {
4313                        unsafe {
4314                            compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
4315                        };
4316                    }
4317                    let c3 = m3.count_ones() as usize;
4318                    wp += c0 + c1 + c2 + c3;
4319                }
4320                ri += 32;
4321            }
4322        }
4323
4324        // Scalar tail
4325        while ri < len {
4326            if wp + 1 > COMPACT_BUF {
4327                writer.write_all(&outbuf[..wp])?;
4328                wp = 0;
4329            }
4330            let b = unsafe { *sp.add(ri) };
4331            unsafe { *dp.add(wp) = b };
4332            wp += (b < lo || b > hi) as usize;
4333            ri += 1;
4334        }
4335
4336        if wp > 0 {
4337            writer.write_all(&outbuf[..wp])?;
4338        }
4339        return Ok(());
4340    }
4341
4342    #[cfg(not(target_arch = "x86_64"))]
4343    {
4344        // Non-x86 fallback: chunk the source and process with delete_range_chunk
4345        for chunk in data.chunks(COMPACT_BUF) {
4346            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4347            if clen > 0 {
4348                writer.write_all(&outbuf[..clen])?;
4349            }
4350        }
4351        return Ok(());
4352    }
4353
4354    #[allow(unreachable_code)]
4355    Ok(())
4356}
4357
4358/// Zero-copy range delete for mmap data: SIMD-scans for bytes in [lo..=hi],
4359/// builds IoSlice entries pointing to the gaps between deleted ranges in the
4360/// original mmap data, and writes using writev. No output buffer allocation.
4361/// For 10MB text with 4% digits: ~1.5ms vs ~4ms for the compact approach.
4362fn delete_range_mmap_zerocopy(
4363    data: &[u8],
4364    writer: &mut impl Write,
4365    lo: u8,
4366    hi: u8,
4367) -> io::Result<()> {
4368    #[cfg(target_arch = "x86_64")]
4369    {
4370        if get_simd_level() >= 3 {
4371            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4372        }
4373        if get_simd_level() >= 2 {
4374            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4375        }
4376    }
4377
4378    #[cfg(target_arch = "aarch64")]
4379    {
4380        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4381    }
4382
4383    // Scalar fallback: byte-by-byte scan with IoSlice batching
4384    #[allow(unreachable_code)]
4385    delete_range_zerocopy_scalar(data, writer, lo, hi)
4386}
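
// Illustrative check for the dispatch above; a sketch that is not part of the
// original source. It assumes only the signatures defined in this file and a
// Vec<u8> as the writer: whichever SIMD level is selected, the zero-copy range
// delete should match a plain filter of bytes outside [lo..=hi].
#[cfg(test)]
mod delete_range_zerocopy_sketch {
    use super::*;

    #[test]
    fn matches_naive_filter_for_digit_range() {
        // Long enough to exercise the 32/16-byte SIMD loops plus the scalar tail.
        let data: Vec<u8> = b"ab1cd23ef".iter().cycle().take(300).copied().collect();
        let expected: Vec<u8> = data
            .iter()
            .copied()
            .filter(|b| !(b'0'..=b'9').contains(b))
            .collect();

        let mut out = Vec::new();
        delete_range_mmap_zerocopy(&data, &mut out, b'0', b'9').unwrap();
        assert_eq!(out, expected);
    }
}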
4387
4388/// Scalar zero-copy range delete: byte-by-byte scan with IoSlice batching.
4389/// Used as fallback when SIMD is unavailable.
4390fn delete_range_zerocopy_scalar(
4391    data: &[u8],
4392    writer: &mut impl Write,
4393    lo: u8,
4394    hi: u8,
4395) -> io::Result<()> {
4396    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4397    let len = data.len();
4398    let mut run_start: usize = 0;
4399    let mut i: usize = 0;
4400
4401    while i < len {
4402        let b = unsafe { *data.get_unchecked(i) };
4403        if b >= lo && b <= hi {
4404            if i > run_start {
4405                iov.push(std::io::IoSlice::new(&data[run_start..i]));
4406                if iov.len() >= MAX_IOV {
4407                    write_ioslices(writer, &iov)?;
4408                    iov.clear();
4409                }
4410            }
4411            run_start = i + 1;
4412        }
4413        i += 1;
4414    }
4415    if run_start < len {
4416        iov.push(std::io::IoSlice::new(&data[run_start..]));
4417    }
4418    if !iov.is_empty() {
4419        write_ioslices(writer, &iov)?;
4420    }
4421    Ok(())
4422}
4423
4424/// AVX2 zero-copy range delete: scans 32 bytes at a time using SIMD range
4425/// comparison, then iterates only the delete positions from the bitmask.
4426/// Blocks with no deletes (common for sparse data) skip with zero per-byte work.
4427#[cfg(target_arch = "x86_64")]
4428#[target_feature(enable = "avx2")]
4429unsafe fn delete_range_zerocopy_avx2(
4430    data: &[u8],
4431    writer: &mut impl Write,
4432    lo: u8,
4433    hi: u8,
4434) -> io::Result<()> {
4435    use std::arch::x86_64::*;
4436
4437    unsafe {
4438        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4439        let len = data.len();
4440        let mut run_start: usize = 0;
4441        let mut ri: usize = 0;
4442
4443        let range = hi - lo;
4444        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4445        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4446        let zero = _mm256_setzero_si256();
4447
4448        while ri + 32 <= len {
4449            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4450            let biased = _mm256_add_epi8(input, bias_v);
4451            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4452            let in_range = _mm256_cmpeq_epi8(gt, zero);
4453            let del_mask = _mm256_movemask_epi8(in_range) as u32;
4454
4455            if del_mask == 0 {
4456                // No bytes to delete — run continues
4457                ri += 32;
4458                continue;
4459            }
4460
4461            // Process each deleted byte position from the bitmask
4462            let mut m = del_mask;
4463            while m != 0 {
4464                let bit = m.trailing_zeros() as usize;
4465                let abs_pos = ri + bit;
4466                if abs_pos > run_start {
4467                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4468                    if iov.len() >= MAX_IOV {
4469                        write_ioslices(writer, &iov)?;
4470                        iov.clear();
4471                    }
4472                }
4473                run_start = abs_pos + 1;
4474                m &= m - 1; // clear lowest set bit (blsr)
4475            }
4476
4477            ri += 32;
4478        }
4479
4480        // Scalar tail
4481        while ri < len {
4482            let b = *data.get_unchecked(ri);
4483            if b >= lo && b <= hi {
4484                if ri > run_start {
4485                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4486                    if iov.len() >= MAX_IOV {
4487                        write_ioslices(writer, &iov)?;
4488                        iov.clear();
4489                    }
4490                }
4491                run_start = ri + 1;
4492            }
4493            ri += 1;
4494        }
4495
4496        if run_start < len {
4497            iov.push(std::io::IoSlice::new(&data[run_start..]));
4498        }
4499        if !iov.is_empty() {
4500            write_ioslices(writer, &iov)?;
4501        }
4502        Ok(())
4503    }
4504}
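
// Scalar model of the biased signed comparison used above and in the SSE2
// variant; an illustrative sketch, not part of the original source. Adding
// 0x80 - lo re-centers the range so that a byte lies inside [lo..=hi] exactly
// when the biased value, reinterpreted as i8, does not exceed 0x80 + (hi - lo)
// reinterpreted the same way; the complement of that comparison marks deletes.
#[cfg(test)]
mod biased_range_compare_sketch {
    #[test]
    fn biased_compare_matches_inclusive_range() {
        for &(lo, hi) in &[(b'0', b'9'), (b'a', b'z'), (0u8, 255u8), (200u8, 250u8)] {
            let bias = 0x80u8.wrapping_sub(lo);
            let threshold = 0x80u8.wrapping_add(hi - lo) as i8;
            for b in 0u8..=255 {
                let out_of_range = (b.wrapping_add(bias) as i8) > threshold;
                assert_eq!(!out_of_range, (lo..=hi).contains(&b));
            }
        }
    }
}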
4505
4506/// SSE2 zero-copy range delete: same approach as AVX2 but with 16-byte blocks.
4507#[cfg(target_arch = "x86_64")]
4508#[target_feature(enable = "sse2")]
4509unsafe fn delete_range_zerocopy_sse2(
4510    data: &[u8],
4511    writer: &mut impl Write,
4512    lo: u8,
4513    hi: u8,
4514) -> io::Result<()> {
4515    use std::arch::x86_64::*;
4516
4517    unsafe {
4518        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4519        let len = data.len();
4520        let mut run_start: usize = 0;
4521        let mut ri: usize = 0;
4522
4523        let range = hi - lo;
4524        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4525        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4526        let zero = _mm_setzero_si128();
4527
4528        while ri + 16 <= len {
4529            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4530            let biased = _mm_add_epi8(input, bias_v);
4531            let gt = _mm_cmpgt_epi8(biased, threshold_v);
4532            let in_range = _mm_cmpeq_epi8(gt, zero);
4533            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4534
4535            if del_mask == 0 {
4536                ri += 16;
4537                continue;
4538            }
4539
4540            let mut m = del_mask;
4541            while m != 0 {
4542                let bit = m.trailing_zeros() as usize;
4543                let abs_pos = ri + bit;
4544                if abs_pos > run_start {
4545                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4546                    if iov.len() >= MAX_IOV {
4547                        write_ioslices(writer, &iov)?;
4548                        iov.clear();
4549                    }
4550                }
4551                run_start = abs_pos + 1;
4552                m &= m - 1;
4553            }
4554
4555            ri += 16;
4556        }
4557
4558        while ri < len {
4559            let b = *data.get_unchecked(ri);
4560            if b >= lo && b <= hi {
4561                if ri > run_start {
4562                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4563                    if iov.len() >= MAX_IOV {
4564                        write_ioslices(writer, &iov)?;
4565                        iov.clear();
4566                    }
4567                }
4568                run_start = ri + 1;
4569            }
4570            ri += 1;
4571        }
4572
4573        if run_start < len {
4574            iov.push(std::io::IoSlice::new(&data[run_start..]));
4575        }
4576        if !iov.is_empty() {
4577            write_ioslices(writer, &iov)?;
4578        }
4579        Ok(())
4580    }
4581}
4582
4583/// NEON zero-copy range delete for aarch64: scans 16 bytes at a time using
4584/// NEON unsigned comparison, builds a 16-bit mask via pairwise widening adds (vpaddl), then
4585/// iterates delete positions from the bitmask.
4586#[cfg(target_arch = "aarch64")]
4587#[target_feature(enable = "neon")]
4588unsafe fn delete_range_zerocopy_neon(
4589    data: &[u8],
4590    writer: &mut impl Write,
4591    lo: u8,
4592    hi: u8,
4593) -> io::Result<()> {
4594    use std::arch::aarch64::*;
4595
4596    unsafe {
4597        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4598        let len = data.len();
4599        let mut run_start: usize = 0;
4600        let mut ri: usize = 0;
4601
4602        let lo_v = vdupq_n_u8(lo);
4603        let hi_v = vdupq_n_u8(hi);
4604        // Bit position mask for extracting bitmask from comparison results
4605        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4606        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4607
4608        while ri + 16 <= len {
4609            let input = vld1q_u8(data.as_ptr().add(ri));
4610            // in_range = 0xFF where lo <= byte <= hi
4611            let ge_lo = vcgeq_u8(input, lo_v);
4612            let le_hi = vcleq_u8(input, hi_v);
4613            let in_range = vandq_u8(ge_lo, le_hi);
4614
4615            // Create 16-bit bitmask: reduce 16 bytes to 2 bytes
4616            let bits = vandq_u8(in_range, bit_mask_v);
4617            let pair = vpaddlq_u8(bits); // u8→u16 pairwise add
4618            let quad = vpaddlq_u16(pair); // u16→u32
4619            let octet = vpaddlq_u32(quad); // u32→u64
4620            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4621            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4622            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4623
4624            if del_mask == 0 {
4625                // No bytes to delete — run continues
4626                ri += 16;
4627                continue;
4628            }
4629
4630            // Process each deleted byte position
4631            let mut m = del_mask;
4632            while m != 0 {
4633                let bit = m.trailing_zeros() as usize;
4634                let abs_pos = ri + bit;
4635                if abs_pos > run_start {
4636                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4637                    if iov.len() >= MAX_IOV {
4638                        write_ioslices(writer, &iov)?;
4639                        iov.clear();
4640                    }
4641                }
4642                run_start = abs_pos + 1;
4643                m &= m - 1;
4644            }
4645
4646            ri += 16;
4647        }
4648
4649        // Scalar tail
4650        while ri < len {
4651            let b = *data.get_unchecked(ri);
4652            if b >= lo && b <= hi {
4653                if ri > run_start {
4654                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4655                    if iov.len() >= MAX_IOV {
4656                        write_ioslices(writer, &iov)?;
4657                        iov.clear();
4658                    }
4659                }
4660                run_start = ri + 1;
4661            }
4662            ri += 1;
4663        }
4664
4665        if run_start < len {
4666            iov.push(std::io::IoSlice::new(&data[run_start..]));
4667        }
4668        if !iov.is_empty() {
4669            write_ioslices(writer, &iov)?;
4670        }
4671        Ok(())
4672    }
4673}
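
// Scalar model of the NEON mask reduction above; an illustrative sketch, not
// part of the original source. Each 0x00/0xFF comparison lane is ANDed with a
// per-lane weight (1, 2, 4, ..., 128, repeated for both halves), and the
// widening pairwise adds sum each 8-lane half into the low and high bytes of
// the 16-bit delete mask. The model below replaces vpaddl with plain sums.
#[cfg(test)]
mod neon_mask_reduction_sketch {
    fn reduce_model(flags: &[bool; 16]) -> u16 {
        let weight = |i: usize| 1u16 << (i % 8);
        let lo: u16 = (0..8).map(|i| if flags[i] { weight(i) } else { 0 }).sum();
        let hi: u16 = (8..16).map(|i| if flags[i] { weight(i) } else { 0 }).sum();
        (hi << 8) | lo
    }

    #[test]
    fn pairwise_sum_matches_direct_bit_assembly() {
        for &pattern in &[0u16, 0xFFFF, 0b1010_0101_0011_1100, 0x8001] {
            let mut flags = [false; 16];
            for (i, flag) in flags.iter_mut().enumerate() {
                *flag = (pattern >> i) & 1 != 0;
            }
            assert_eq!(reduce_model(&flags), pattern);
        }
    }
}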
4674
4675/// Delete bytes from chunk using bitset, writing into a pre-allocated buffer.
4676/// `outbuf` must be at least `chunk.len()` bytes: each input byte is stored
/// unconditionally and the write position advances only for kept bytes.
/// Returns the number of bytes written.
4677#[inline]
4678fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4679    let len = chunk.len();
4680    let mut out_pos = 0;
4681    let mut i = 0;
4682
4683    while i + 8 <= len {
4684        unsafe {
4685            let b0 = *chunk.get_unchecked(i);
4686            let b1 = *chunk.get_unchecked(i + 1);
4687            let b2 = *chunk.get_unchecked(i + 2);
4688            let b3 = *chunk.get_unchecked(i + 3);
4689            let b4 = *chunk.get_unchecked(i + 4);
4690            let b5 = *chunk.get_unchecked(i + 5);
4691            let b6 = *chunk.get_unchecked(i + 6);
4692            let b7 = *chunk.get_unchecked(i + 7);
4693
4694            *outbuf.get_unchecked_mut(out_pos) = b0;
4695            out_pos += !is_member(member, b0) as usize;
4696            *outbuf.get_unchecked_mut(out_pos) = b1;
4697            out_pos += !is_member(member, b1) as usize;
4698            *outbuf.get_unchecked_mut(out_pos) = b2;
4699            out_pos += !is_member(member, b2) as usize;
4700            *outbuf.get_unchecked_mut(out_pos) = b3;
4701            out_pos += !is_member(member, b3) as usize;
4702            *outbuf.get_unchecked_mut(out_pos) = b4;
4703            out_pos += !is_member(member, b4) as usize;
4704            *outbuf.get_unchecked_mut(out_pos) = b5;
4705            out_pos += !is_member(member, b5) as usize;
4706            *outbuf.get_unchecked_mut(out_pos) = b6;
4707            out_pos += !is_member(member, b6) as usize;
4708            *outbuf.get_unchecked_mut(out_pos) = b7;
4709            out_pos += !is_member(member, b7) as usize;
4710        }
4711        i += 8;
4712    }
4713
4714    while i < len {
4715        unsafe {
4716            let b = *chunk.get_unchecked(i);
4717            *outbuf.get_unchecked_mut(out_pos) = b;
4718            out_pos += !is_member(member, b) as usize;
4719        }
4720        i += 1;
4721    }
4722
4723    out_pos
4724}
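
// Usage sketch for the unrolled bitset delete above; not part of the original
// source, and it assumes the `build_member_set` / `is_member` helpers used
// throughout this file. The output buffer is sized to the input because, in
// the worst case, every byte is kept.
#[cfg(test)]
mod delete_chunk_bitset_sketch {
    use super::*;

    #[test]
    fn drops_exactly_the_member_bytes() {
        let member = build_member_set(b"aeiou");
        let chunk = b"hello world, branchless delete";
        let mut outbuf = vec![0u8; chunk.len()];
        let n = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
        let expected: Vec<u8> = chunk
            .iter()
            .copied()
            .filter(|&b| !is_member(&member, b))
            .collect();
        assert_eq!(&outbuf[..n], &expected[..]);
    }
}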
4725
4726/// Zero-copy delete for general bitset: scan for runs of kept bytes,
4727/// build IoSlice entries pointing directly into the source data.
4728/// No allocation for output data — just ~16 bytes per IoSlice entry.
4729/// Flushes in MAX_IOV-sized batches for efficient writev.
4730fn delete_bitset_zerocopy(
4731    data: &[u8],
4732    member: &[u8; 32],
4733    writer: &mut impl Write,
4734) -> io::Result<()> {
4735    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4736    let len = data.len();
4737    let mut i = 0;
4738    let mut run_start: Option<usize> = None;
4739
4740    while i < len {
4741        let b = unsafe { *data.get_unchecked(i) };
4742        if is_member(member, b) {
4743            // This byte should be deleted
4744            if let Some(rs) = run_start {
4745                iov.push(std::io::IoSlice::new(&data[rs..i]));
4746                run_start = None;
4747                if iov.len() >= MAX_IOV {
4748                    write_ioslices(writer, &iov)?;
4749                    iov.clear();
4750                }
4751            }
4752        } else {
4753            // This byte should be kept
4754            if run_start.is_none() {
4755                run_start = Some(i);
4756            }
4757        }
4758        i += 1;
4759    }
4760    // Flush final run
4761    if let Some(rs) = run_start {
4762        iov.push(std::io::IoSlice::new(&data[rs..]));
4763    }
4764    if !iov.is_empty() {
4765        write_ioslices(writer, &iov)?;
4766    }
4767    Ok(())
4768}
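
// Companion sketch for the zero-copy bitset delete; illustrative and not part
// of the original source. For any member set, the writev-based path should
// emit exactly the bytes the copying path above keeps.
#[cfg(test)]
mod delete_bitset_zerocopy_sketch {
    use super::*;

    #[test]
    fn zerocopy_matches_copying_delete() {
        let member = build_member_set(b"aeiou");
        let data = b"the quick brown fox jumps over the lazy dog";
        let mut copied = vec![0u8; data.len()];
        let kept = delete_chunk_bitset_into(data, &member, &mut copied);

        let mut streamed = Vec::new();
        delete_bitset_zerocopy(data, &member, &mut streamed).unwrap();
        assert_eq!(&streamed[..], &copied[..kept]);
    }
}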
4769
4770fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4771    // Streaming zero-copy delete using writev: build IoSlice batches of MAX_IOV
4772    // pointing to gaps between deleted characters, write each batch immediately.
4773    // Avoids allocating the full Vec<IoSlice> for all positions.
4774    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4775    let mut last = 0;
4776    for pos in memchr::memchr_iter(ch, data) {
4777        if pos > last {
4778            iov.push(std::io::IoSlice::new(&data[last..pos]));
4779            if iov.len() >= MAX_IOV {
4780                write_ioslices(writer, &iov)?;
4781                iov.clear();
4782            }
4783        }
4784        last = pos + 1;
4785    }
4786    if last < data.len() {
4787        iov.push(std::io::IoSlice::new(&data[last..]));
4788    }
4789    if !iov.is_empty() {
4790        write_ioslices(writer, &iov)?;
4791    }
4792    Ok(())
4793}
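
// Illustrative check for the memchr-driven single-byte delete above; a sketch
// that is not part of the original source. Gaps between occurrences are
// written verbatim and every occurrence of the byte is dropped.
#[cfg(test)]
mod delete_single_char_sketch {
    use super::*;

    #[test]
    fn removes_every_occurrence() {
        let mut out = Vec::new();
        delete_single_char_mmap(b'l', b"hello world", &mut out).unwrap();
        assert_eq!(out, b"heo word".to_vec());
    }
}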
4794
/// Zero-copy delete for two or three target bytes: memchr2/memchr3 locates the
/// delete positions and writev emits the kept runs between them. Callers pass
/// exactly 2 or 3 bytes; the `0` placeholders below are never used for matching.
4795fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4796    let c0 = chars[0];
4797    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4798    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4799    let is_three = chars.len() >= 3;
4800
4801    // Streaming zero-copy delete: batch IoSlice entries and write in groups of MAX_IOV.
4802    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4803    let mut last = 0;
4804
4805    macro_rules! process_pos {
4806        ($pos:expr) => {
4807            if $pos > last {
4808                iov.push(std::io::IoSlice::new(&data[last..$pos]));
4809                if iov.len() >= MAX_IOV {
4810                    write_ioslices(writer, &iov)?;
4811                    iov.clear();
4812                }
4813            }
4814            last = $pos + 1;
4815        };
4816    }
4817
4818    if is_three {
4819        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4820            process_pos!(pos);
4821        }
4822    } else {
4823        for pos in memchr::memchr2_iter(c0, c1, data) {
4824            process_pos!(pos);
4825        }
4826    }
4827    if last < data.len() {
4828        iov.push(std::io::IoSlice::new(&data[last..]));
4829    }
4830    if !iov.is_empty() {
4831        write_ioslices(writer, &iov)?;
4832    }
4833    Ok(())
4834}
4835
4836/// Delete + squeeze from mmap'd byte slice.
4837///
4838/// Allocates one full-size output buffer, applies delete then squeeze in a
4839/// single pass over the data, and issues a single write_all (no chunking).
4840pub fn delete_squeeze_mmap(
4841    delete_chars: &[u8],
4842    squeeze_chars: &[u8],
4843    data: &[u8],
4844    writer: &mut impl Write,
4845) -> io::Result<()> {
4846    let delete_set = build_member_set(delete_chars);
4847    let squeeze_set = build_member_set(squeeze_chars);
4848
4849    // Single-allocation delete+squeeze: full-size buffer, single write_all.
4850    let mut outbuf = alloc_uninit_vec(data.len());
4851    let mut last_squeezed: u16 = 256;
4852    let mut out_pos = 0;
4853
4854    for &b in data.iter() {
4855        if is_member(&delete_set, b) {
4856            continue;
4857        }
4858        if is_member(&squeeze_set, b) {
4859            if last_squeezed == b as u16 {
4860                continue;
4861            }
4862            last_squeezed = b as u16;
4863        } else {
4864            last_squeezed = 256;
4865        }
4866        unsafe {
4867            *outbuf.get_unchecked_mut(out_pos) = b;
4868        }
4869        out_pos += 1;
4870    }
4871    writer.write_all(&outbuf[..out_pos])
4872}
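
// Semantics sketch for the combined delete + squeeze pass above; illustrative,
// not part of the original source. Deletion is applied first, so squeezable
// bytes separated only by deleted bytes collapse into one, matching
// `tr -d '0-9' -s ' '` behavior.
#[cfg(test)]
mod delete_squeeze_sketch {
    use super::*;

    #[test]
    fn squeezes_across_deleted_bytes() {
        let mut out = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", b"a11  b 2 c", &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }
}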
4873
4874/// Squeeze from mmap'd byte slice.
4875///
4876/// Squeeze sets of 1-3 bytes dispatch to dedicated single/multi-byte fast paths.
4877/// For data >= PARALLEL_THRESHOLD (4MB): rayon parallel squeeze with boundary fixup.
4878/// Otherwise: squeeze into one full-size buffer and issue a single write_all.
4879pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4880    if squeeze_chars.len() == 1 {
4881        return squeeze_single_mmap(squeeze_chars[0], data, writer);
4882    }
4883    if squeeze_chars.len() == 2 {
4884        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
4885    }
4886    if squeeze_chars.len() == 3 {
4887        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
4888    }
4889
4890    let member = build_member_set(squeeze_chars);
4891
4892    // Parallel path: squeeze each chunk independently, then fix boundaries
4893    if data.len() >= PARALLEL_THRESHOLD {
4894        let n_threads = rayon::current_num_threads().max(1);
4895        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4896
4897        let results: Vec<Vec<u8>> = data
4898            .par_chunks(chunk_size)
4899            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
4900            .collect();
4901
4902        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
4903        // and chunk N+1 starts with same byte B, and B is in squeeze set,
4904        // skip the first byte(s) of chunk N+1 that equal B.
4905        // Collect slices for writev to minimize syscalls.
4906        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
4907        for (idx, result) in results.iter().enumerate() {
4908            if result.is_empty() {
4909                continue;
4910            }
4911            if idx > 0 {
4912                // Check boundary: does previous chunk end with same squeezable byte?
4913                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
4914                    if is_member(&member, prev_last) {
4915                        // Skip leading bytes in this chunk that equal prev_last
4916                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
4917                        if skip < result.len() {
4918                            slices.push(std::io::IoSlice::new(&result[skip..]));
4919                        }
4920                        continue;
4921                    }
4922                }
4923            }
4924            slices.push(std::io::IoSlice::new(result));
4925        }
4926        return write_ioslices(writer, &slices);
4927    }
4928
4929    // Single-allocation squeeze: full-size buffer, single write_all.
4930    let mut outbuf = alloc_uninit_vec(data.len());
4931    let len = data.len();
4932    let mut wp = 0;
4933    let mut i = 0;
4934    let mut last_squeezed: u16 = 256;
4935
4936    unsafe {
4937        let inp = data.as_ptr();
4938        let outp = outbuf.as_mut_ptr();
4939
4940        while i < len {
4941            let b = *inp.add(i);
4942            if is_member(&member, b) {
4943                if last_squeezed != b as u16 {
4944                    *outp.add(wp) = b;
4945                    wp += 1;
4946                    last_squeezed = b as u16;
4947                }
4948                i += 1;
4949                while i < len && *inp.add(i) == b {
4950                    i += 1;
4951                }
4952            } else {
4953                last_squeezed = 256;
4954                *outp.add(wp) = b;
4955                wp += 1;
4956                i += 1;
4957            }
4958        }
4959    }
4960    writer.write_all(&outbuf[..wp])
4961}
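
// Single-buffer path sketch for squeeze sets of four or more bytes;
// illustrative and not part of the original source. Each run of a squeezable
// byte collapses to one byte, and runs of different squeezable bytes remain
// distinct.
#[cfg(test)]
mod squeeze_bitset_sketch {
    use super::*;

    #[test]
    fn collapses_each_run_independently() {
        let mut out = Vec::new();
        squeeze_mmap(b" \t\n\r", b"a  b\t\tc\n\nd", &mut out).unwrap();
        assert_eq!(out, b"a b\tc\nd".to_vec());
    }
}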
4962
4963/// Squeeze a single chunk using bitset membership. Returns squeezed output.
4964fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
4965    let len = chunk.len();
4966    let mut out = Vec::with_capacity(len);
4967    let mut last_squeezed: u16 = 256;
4968    let mut i = 0;
4969
4970    unsafe {
        // SAFETY: u8 has no drop and no invalid bit patterns; every index below
        // the final wp is written before the closing set_len(wp) exposes it.
4971        out.set_len(len);
4972        let inp = chunk.as_ptr();
4973        let outp: *mut u8 = out.as_mut_ptr();
4974        let mut wp = 0;
4975
4976        while i < len {
4977            let b = *inp.add(i);
4978            if is_member(member, b) {
4979                if last_squeezed != b as u16 {
4980                    *outp.add(wp) = b;
4981                    wp += 1;
4982                    last_squeezed = b as u16;
4983                }
4984                i += 1;
4985                while i < len && *inp.add(i) == b {
4986                    i += 1;
4987                }
4988            } else {
4989                last_squeezed = 256;
4990                *outp.add(wp) = b;
4991                wp += 1;
4992                i += 1;
4993            }
4994        }
4995        out.set_len(wp);
4996    }
4997    out
4998}
4999
5000fn squeeze_multi_mmap<const N: usize>(
5001    chars: &[u8],
5002    data: &[u8],
5003    writer: &mut impl Write,
5004) -> io::Result<()> {
5005    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
5006    if data.len() >= PARALLEL_THRESHOLD {
5007        let member = build_member_set(chars);
5008        let n_threads = rayon::current_num_threads().max(1);
5009        let chunk_size = (data.len() / n_threads).max(32 * 1024);
5010
5011        let results: Vec<Vec<u8>> = data
5012            .par_chunks(chunk_size)
5013            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
5014            .collect();
5015
5016        // Build IoSlice list, fixing boundaries
5017        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
5018        for (idx, result) in results.iter().enumerate() {
5019            if result.is_empty() {
5020                continue;
5021            }
5022            if idx > 0 {
5023                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
5024                    if is_member(&member, prev_last) {
5025                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
5026                        if skip < result.len() {
5027                            slices.push(std::io::IoSlice::new(&result[skip..]));
5028                        }
5029                        continue;
5030                    }
5031                }
5032            }
5033            slices.push(std::io::IoSlice::new(result));
5034        }
5035        return write_ioslices(writer, &slices);
5036    }
5037
5038    // Zero-copy writev: build IoSlice entries pointing directly into the
5039    // original mmap'd data, keeping one byte per run of squeezable chars.
5040    // Each run contributes the gap before it plus a one-byte slice for the
5041    // run's first byte; the rest of the run is skipped. No data is copied.
5044    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
5045    let mut cursor = 0;
5046    let mut last_squeezed: u16 = 256;
5047
5048    macro_rules! find_next {
5049        ($data:expr) => {
5050            if N == 2 {
5051                memchr::memchr2(chars[0], chars[1], $data)
5052            } else {
5053                memchr::memchr3(chars[0], chars[1], chars[2], $data)
5054            }
5055        };
5056    }
5057
5058    while cursor < data.len() {
5059        match find_next!(&data[cursor..]) {
5060            Some(offset) => {
5061                let pos = cursor + offset;
5062                let b = data[pos];
5063                // Emit gap before squeeze point
5064                if pos > cursor {
5065                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
5066                    last_squeezed = 256;
5067                }
5068                // Emit single byte if not duplicate
5069                if last_squeezed != b as u16 {
5070                    // Point at the byte in the original data (zero-copy)
5071                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
5072                    last_squeezed = b as u16;
5073                }
5074                // Skip the run of same byte
5075                let mut skip = pos + 1;
5076                while skip < data.len() && data[skip] == b {
5077                    skip += 1;
5078                }
5079                cursor = skip;
5080                // Flush when approaching MAX_IOV
5081                if iov.len() >= MAX_IOV {
5082                    write_ioslices(writer, &iov)?;
5083                    iov.clear();
5084                }
5085            }
5086            None => {
5087                if cursor < data.len() {
5088                    iov.push(std::io::IoSlice::new(&data[cursor..]));
5089                }
5090                break;
5091            }
5092        }
5093    }
5094    if !iov.is_empty() {
5095        write_ioslices(writer, &iov)?;
5096    }
5097    Ok(())
5098}
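
// Zero-copy writev sketch for the two-byte squeeze path above; illustrative,
// not part of the original source. Runs of either byte collapse to their first
// byte, and every emitted IoSlice points into the original data.
#[cfg(test)]
mod squeeze_multi_sketch {
    use super::*;

    #[test]
    fn two_byte_squeeze_keeps_one_byte_per_run() {
        let mut out = Vec::new();
        squeeze_multi_mmap::<2>(&[b' ', b'\t'], b"a \t\t b", &mut out).unwrap();
        assert_eq!(out, b"a \t b".to_vec());
    }
}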
5099
5100fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
5101    if data.is_empty() {
5102        return Ok(());
5103    }
5104
5105    // Quick check: no consecutive pairs means no squeezing needed
5106    let pair = [ch, ch];
5107    if memchr::memmem::find(data, &pair).is_none() {
5108        return writer.write_all(data);
5109    }
5110
5111    // Zero-copy writev approach: build IoSlice entries pointing directly into
5112    // the original mmap'd data, skipping duplicate bytes in runs.
5113    // For `tr -s ' '` on 10MB with ~5K squeeze points:
5114    //   - ~10K IoSlice entries (one per gap + one per squeeze point)
5115    //   - ~10 writev syscalls (at 1024 entries per batch)
5116    //   - Zero data copy — kernel reads directly from mmap pages
5117    let finder = memchr::memmem::Finder::new(&pair);
5118    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
5119    let mut cursor = 0;
5120
5121    while cursor < data.len() {
5122        match finder.find(&data[cursor..]) {
5123            Some(offset) => {
5124                let pair_pos = cursor + offset;
5125                // Include everything up to and including the first byte of the pair
5126                let seg_end = pair_pos + 1;
5127                if seg_end > cursor {
5128                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
5129                }
5130                // Skip all remaining consecutive ch bytes (the run)
5131                let mut skip = seg_end;
5132                while skip < data.len() && data[skip] == ch {
5133                    skip += 1;
5134                }
5135                cursor = skip;
5136                // Flush when approaching MAX_IOV
5137                if iov.len() >= MAX_IOV {
5138                    write_ioslices(writer, &iov)?;
5139                    iov.clear();
5140                }
5141            }
5142            None => {
5143                // No more pairs — emit remainder
5144                if cursor < data.len() {
5145                    iov.push(std::io::IoSlice::new(&data[cursor..]));
5146                }
5147                break;
5148            }
5149        }
5150    }
5151
5152    if !iov.is_empty() {
5153        write_ioslices(writer, &iov)?;
5154    }
5155    Ok(())
5156}
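
// Sketch for the single-byte squeeze above; illustrative, not part of the
// original source. Inputs with no adjacent pair take the memmem fast path and
// are written through unchanged; otherwise each run collapses to one byte.
#[cfg(test)]
mod squeeze_single_sketch {
    use super::*;

    #[test]
    fn no_pair_fast_path_is_identity() {
        let mut out = Vec::new();
        squeeze_single_mmap(b' ', b"a b c", &mut out).unwrap();
        assert_eq!(out, b"a b c".to_vec());
    }

    #[test]
    fn runs_collapse_to_one_byte() {
        let mut out = Vec::new();
        squeeze_single_mmap(b' ', b"a  b   c ", &mut out).unwrap();
        assert_eq!(out, b"a b c ".to_vec());
    }
}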