Skip to main content

coreutils_rs/tr/
core.rs

1use std::io::{self, Read, Write};
2
3use rayon::prelude::*;
4
5/// Maximum IoSlice entries per write_vectored batch.
6/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
7const MAX_IOV: usize = 1024;
8
9/// Stream buffer: 8MB — larger buffers reduce syscall count for piped I/O.
10/// For 10MB piped input: 2 iterations (2 reads + 2 translates + 2 writes)
11/// instead of 3 at 4MB. The extra L3 spill is offset by fewer syscalls.
12/// With enlarged pipe buffers (8MB via F_SETPIPE_SZ), each read can return
13/// up to 8MB, so one iteration handles the bulk of the data.
14const STREAM_BUF: usize = 8 * 1024 * 1024;
15
16/// Minimum data size to engage rayon parallel processing for mmap paths.
17/// AVX2 translation runs at ~10 GB/s per core. For 10MB data:
18/// - Sequential: ~1ms translate
19/// - Parallel (4 cores): ~0.25ms translate + ~0.15ms rayon overhead = ~0.4ms
20/// Net savings: ~0.6ms per translate pass. Worth it for >= 4MB files where
21/// the multi-core speedup clearly exceeds rayon spawn+join overhead.
22const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
23
24/// 256-entry lookup table for byte compaction: for each 8-bit keep mask,
25/// stores the bit positions of set bits (indices of bytes to keep).
26/// Used by compact_8bytes to replace the serial trailing_zeros loop with
27/// unconditional indexed stores, eliminating the tzcnt→blsr dependency chain.
28/// Total size: 256 * 8 = 2KB — fits entirely in L1 cache.
29#[cfg(target_arch = "x86_64")]
30static COMPACT_LUT: [[u8; 8]; 256] = {
31    let mut lut = [[0u8; 8]; 256];
32    let mut mask: u16 = 0;
33    while mask < 256 {
34        let mut idx: usize = 0;
35        let mut bit: u8 = 0;
36        while bit < 8 {
37            if (mask >> bit) & 1 != 0 {
38                lut[mask as usize][idx] = bit;
39                idx += 1;
40            }
41            bit += 1;
42        }
43        mask += 1;
44    }
45    lut
46};
47
48/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
49/// Falls back to write_all per slice for partial writes.
50#[inline]
51fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
52    if slices.is_empty() {
53        return Ok(());
54    }
55    for batch in slices.chunks(MAX_IOV) {
56        let total: usize = batch.iter().map(|s| s.len()).sum();
57        match writer.write_vectored(batch) {
58            Ok(n) if n >= total => continue,
59            Ok(mut written) => {
60                // Partial write: fall back to write_all per remaining slice
61                for slice in batch {
62                    let slen = slice.len();
63                    if written >= slen {
64                        written -= slen;
65                        continue;
66                    }
67                    if written > 0 {
68                        writer.write_all(&slice[written..])?;
69                        written = 0;
70                    } else {
71                        writer.write_all(slice)?;
72                    }
73                }
74            }
75            Err(e) => return Err(e),
76        }
77    }
78    Ok(())
79}
80
81/// Allocate a Vec<u8> of given length without zero-initialization.
82/// Uses MADV_HUGEPAGE on Linux for buffers >= 2MB to reduce TLB misses.
83/// SAFETY: Caller must write all bytes before reading them.
84#[inline]
85#[allow(clippy::uninit_vec)]
86fn alloc_uninit_vec(len: usize) -> Vec<u8> {
87    let mut v = Vec::with_capacity(len);
88    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
89    unsafe {
90        v.set_len(len);
91    }
92    #[cfg(target_os = "linux")]
93    if len >= 2 * 1024 * 1024 {
94        unsafe {
95            libc::madvise(
96                v.as_mut_ptr() as *mut libc::c_void,
97                len,
98                libc::MADV_HUGEPAGE,
99            );
100        }
101    }
102    v
103}
104
105/// Build a 256-byte lookup table mapping set1[i] -> set2[i].
106#[inline]
107fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
108    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
109    let last = set2.last().copied();
110    for (i, &from) in set1.iter().enumerate() {
111        table[from as usize] = if i < set2.len() {
112            set2[i]
113        } else {
114            last.unwrap_or(from)
115        };
116    }
117    table
118}
119
120/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
121#[inline]
122fn build_member_set(chars: &[u8]) -> [u8; 32] {
123    let mut set = [0u8; 32];
124    for &ch in chars {
125        set[ch as usize >> 3] |= 1 << (ch & 7);
126    }
127    set
128}
129
130#[inline(always)]
131fn is_member(set: &[u8; 32], ch: u8) -> bool {
132    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
133}
134
135/// Cached SIMD capability level for x86_64.
136/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
137#[cfg(target_arch = "x86_64")]
138static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
139
140#[cfg(target_arch = "x86_64")]
141#[inline(always)]
142fn get_simd_level() -> u8 {
143    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
144    if level != 0 {
145        return level;
146    }
147    let detected = if is_x86_feature_detected!("avx2") {
148        3
149    } else if is_x86_feature_detected!("ssse3") {
150        2
151    } else {
152        1
153    };
154    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
155    detected
156}
157
158/// Count how many entries in the translate table are non-identity.
159#[cfg(target_arch = "x86_64")]
160#[inline]
161fn count_non_identity(table: &[u8; 256]) -> usize {
162    table
163        .iter()
164        .enumerate()
165        .filter(|&(i, &v)| v != i as u8)
166        .count()
167}
168
169/// Translate bytes in-place using a 256-byte lookup table.
170/// For sparse translations (few bytes change), uses SIMD skip-ahead:
171/// compare 32 bytes at a time against identity, skip unchanged chunks.
172/// For dense translations, uses full SIMD nibble decomposition.
173/// Falls back to 8x-unrolled scalar on non-x86_64 platforms.
174#[inline(always)]
175fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
176    #[cfg(target_arch = "x86_64")]
177    {
178        let level = get_simd_level();
179        if level >= 3 {
180            // For sparse translations (<=16 non-identity entries), the skip-ahead
181            // approach is faster: load 32 bytes, do a full nibble lookup, compare
182            // against input, skip store if identical. This avoids writing to pages
183            // that don't change (important for MAP_PRIVATE COW mmap).
184            let non_id = count_non_identity(table);
185            if non_id > 0 && non_id <= 16 {
186                unsafe { translate_inplace_avx2_sparse(data, table) };
187                return;
188            }
189            unsafe { translate_inplace_avx2_table(data, table) };
190            return;
191        }
192        if level >= 2 {
193            unsafe { translate_inplace_ssse3_table(data, table) };
194            return;
195        }
196    }
197    translate_inplace_scalar(data, table);
198}
199
200/// Sparse AVX2 translate: skip unchanged 32-byte chunks.
201/// For each chunk: perform full nibble lookup, compare result vs input.
202/// If identical (no bytes changed), skip the store entirely.
203/// This reduces memory bandwidth and avoids COW page faults for
204/// MAP_PRIVATE mmaps when most bytes are unchanged.
205#[cfg(target_arch = "x86_64")]
206#[target_feature(enable = "avx2")]
207unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
208    use std::arch::x86_64::*;
209
210    unsafe {
211        let len = data.len();
212        let ptr = data.as_mut_ptr();
213
214        // Pre-build 16 lookup vectors (same as full nibble decomposition)
215        let mut lut = [_mm256_setzero_si256(); 16];
216        for h in 0u8..16 {
217            let base = (h as usize) * 16;
218            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
219            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
220            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
221        }
222
223        let lo_mask = _mm256_set1_epi8(0x0F);
224
225        let mut i = 0;
226        while i + 32 <= len {
227            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
228            let lo_nibble = _mm256_and_si256(input, lo_mask);
229            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
230
231            let mut result = _mm256_setzero_si256();
232            macro_rules! do_nibble {
233                ($h:expr) => {
234                    let h_val = _mm256_set1_epi8($h as i8);
235                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
236                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
237                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
238                };
239            }
240            do_nibble!(0);
241            do_nibble!(1);
242            do_nibble!(2);
243            do_nibble!(3);
244            do_nibble!(4);
245            do_nibble!(5);
246            do_nibble!(6);
247            do_nibble!(7);
248            do_nibble!(8);
249            do_nibble!(9);
250            do_nibble!(10);
251            do_nibble!(11);
252            do_nibble!(12);
253            do_nibble!(13);
254            do_nibble!(14);
255            do_nibble!(15);
256
257            // Only store if result differs from input (skip unchanged chunks)
258            let diff = _mm256_xor_si256(input, result);
259            if _mm256_testz_si256(diff, diff) == 0 {
260                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
261            }
262            i += 32;
263        }
264
265        // Scalar tail
266        while i < len {
267            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
268            i += 1;
269        }
270    }
271}
272
273/// Scalar fallback: 8x-unrolled table lookup.
274#[cfg(not(target_arch = "aarch64"))]
275#[inline(always)]
276fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
277    let len = data.len();
278    let ptr = data.as_mut_ptr();
279    let mut i = 0;
280    unsafe {
281        while i + 8 <= len {
282            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
283            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
284            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
285            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
286            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
287            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
288            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
289            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
290            i += 8;
291        }
292        while i < len {
293            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
294            i += 1;
295        }
296    }
297}
298
299/// ARM64 NEON table lookup using nibble decomposition (same algorithm as x86 pshufb).
300/// Uses vqtbl1q_u8 for 16-byte table lookups, processes 16 bytes per iteration.
301#[cfg(target_arch = "aarch64")]
302#[inline(always)]
303fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
304    unsafe { translate_inplace_neon_table(data, table) };
305}
306
307#[cfg(target_arch = "aarch64")]
308#[target_feature(enable = "neon")]
309unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
310    use std::arch::aarch64::*;
311
312    unsafe {
313        let len = data.len();
314        let ptr = data.as_mut_ptr();
315
316        // Pre-build 16 NEON lookup vectors (one per high nibble)
317        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
318        for h in 0u8..16 {
319            let base = (h as usize) * 16;
320            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
321        }
322
323        let lo_mask = vdupq_n_u8(0x0F);
324        let mut i = 0;
325
326        while i + 16 <= len {
327            let input = vld1q_u8(ptr.add(i));
328            let lo_nibble = vandq_u8(input, lo_mask);
329            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
330
331            let mut result = vdupq_n_u8(0);
332            macro_rules! do_nibble {
333                ($h:expr) => {
334                    let h_val = vdupq_n_u8($h);
335                    let mask = vceqq_u8(hi_nibble, h_val);
336                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
337                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
338                };
339            }
340            do_nibble!(0);
341            do_nibble!(1);
342            do_nibble!(2);
343            do_nibble!(3);
344            do_nibble!(4);
345            do_nibble!(5);
346            do_nibble!(6);
347            do_nibble!(7);
348            do_nibble!(8);
349            do_nibble!(9);
350            do_nibble!(10);
351            do_nibble!(11);
352            do_nibble!(12);
353            do_nibble!(13);
354            do_nibble!(14);
355            do_nibble!(15);
356
357            vst1q_u8(ptr.add(i), result);
358            i += 16;
359        }
360
361        // Scalar tail
362        while i < len {
363            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
364            i += 1;
365        }
366    }
367}
368
369// ============================================================================
370// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
371// ============================================================================
372//
373// For an arbitrary 256-byte lookup table, we decompose each byte into
374// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
375// SIMD vectors, one for each high nibble value h (0..15), containing
376// the 16 table entries table[h*16+0..h*16+15]. Then for each input
377// vector we:
378//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
379//   2. Extract high nibble (shift right 4) -> used to select which table
380//   3. For each of the 16 high nibble values, create a mask where
381//      the high nibble equals that value, pshufb the corresponding
382//      table, and accumulate results
383//
384// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
385// With instruction-level parallelism, this achieves much higher throughput
386// than scalar table lookups which have serial data dependencies.
387
388#[cfg(target_arch = "x86_64")]
389#[target_feature(enable = "avx2")]
390unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
391    use std::arch::x86_64::*;
392
393    unsafe {
394        let len = data.len();
395        let ptr = data.as_mut_ptr();
396
397        // Pre-build 16 lookup vectors, one per high nibble value.
398        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
399        // 16 table entries for pshufb indexing by low nibble.
400        let mut lut = [_mm256_setzero_si256(); 16];
401        for h in 0u8..16 {
402            let base = (h as usize) * 16;
403            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
404            // Broadcast the 128-bit row to both lanes of the 256-bit vector
405            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
406            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
407        }
408
409        let lo_mask = _mm256_set1_epi8(0x0F);
410
411        let mut i = 0;
412
413        // 2x unrolled: process 64 bytes (2x32) per iteration for better ILP.
414        // The CPU can overlap load/compute of the second vector while the first
415        // is in the nibble decomposition pipeline.
416        while i + 64 <= len {
417            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
418            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
419
420            let lo0 = _mm256_and_si256(input0, lo_mask);
421            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
422            let lo1 = _mm256_and_si256(input1, lo_mask);
423            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
424
425            let mut r0 = _mm256_setzero_si256();
426            let mut r1 = _mm256_setzero_si256();
427
428            macro_rules! do_nibble2 {
429                ($h:expr) => {
430                    let h_val = _mm256_set1_epi8($h as i8);
431                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
432                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
433                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
434                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
435                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
436                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
437                };
438            }
439            do_nibble2!(0);
440            do_nibble2!(1);
441            do_nibble2!(2);
442            do_nibble2!(3);
443            do_nibble2!(4);
444            do_nibble2!(5);
445            do_nibble2!(6);
446            do_nibble2!(7);
447            do_nibble2!(8);
448            do_nibble2!(9);
449            do_nibble2!(10);
450            do_nibble2!(11);
451            do_nibble2!(12);
452            do_nibble2!(13);
453            do_nibble2!(14);
454            do_nibble2!(15);
455
456            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
457            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
458            i += 64;
459        }
460
461        // Remaining 32-byte chunk
462        if i + 32 <= len {
463            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
464            let lo_nibble = _mm256_and_si256(input, lo_mask);
465            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
466
467            let mut result = _mm256_setzero_si256();
468
469            macro_rules! do_nibble {
470                ($h:expr) => {
471                    let h_val = _mm256_set1_epi8($h as i8);
472                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
473                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
474                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
475                };
476            }
477            do_nibble!(0);
478            do_nibble!(1);
479            do_nibble!(2);
480            do_nibble!(3);
481            do_nibble!(4);
482            do_nibble!(5);
483            do_nibble!(6);
484            do_nibble!(7);
485            do_nibble!(8);
486            do_nibble!(9);
487            do_nibble!(10);
488            do_nibble!(11);
489            do_nibble!(12);
490            do_nibble!(13);
491            do_nibble!(14);
492            do_nibble!(15);
493
494            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
495            i += 32;
496        }
497
498        // SSE/SSSE3 tail for remaining 16-byte chunk
499        if i + 16 <= len {
500            let lo_mask128 = _mm_set1_epi8(0x0F);
501
502            let mut lut128 = [_mm_setzero_si128(); 16];
503            for h in 0u8..16 {
504                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
505            }
506
507            let input = _mm_loadu_si128(ptr.add(i) as *const _);
508            let lo_nib = _mm_and_si128(input, lo_mask128);
509            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
510
511            let mut res = _mm_setzero_si128();
512            macro_rules! do_nibble128 {
513                ($h:expr) => {
514                    let h_val = _mm_set1_epi8($h as i8);
515                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
516                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
517                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
518                };
519            }
520            do_nibble128!(0);
521            do_nibble128!(1);
522            do_nibble128!(2);
523            do_nibble128!(3);
524            do_nibble128!(4);
525            do_nibble128!(5);
526            do_nibble128!(6);
527            do_nibble128!(7);
528            do_nibble128!(8);
529            do_nibble128!(9);
530            do_nibble128!(10);
531            do_nibble128!(11);
532            do_nibble128!(12);
533            do_nibble128!(13);
534            do_nibble128!(14);
535            do_nibble128!(15);
536
537            _mm_storeu_si128(ptr.add(i) as *mut _, res);
538            i += 16;
539        }
540
541        // Scalar tail
542        while i < len {
543            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
544            i += 1;
545        }
546    }
547}
548
549#[cfg(target_arch = "x86_64")]
550#[target_feature(enable = "ssse3")]
551unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
552    use std::arch::x86_64::*;
553
554    unsafe {
555        let len = data.len();
556        let ptr = data.as_mut_ptr();
557
558        // Pre-build 16 lookup vectors for pshufb
559        let mut lut = [_mm_setzero_si128(); 16];
560        for h in 0u8..16 {
561            let base = (h as usize) * 16;
562            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
563            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
564        }
565
566        let lo_mask = _mm_set1_epi8(0x0F);
567
568        let mut i = 0;
569        while i + 16 <= len {
570            let input = _mm_loadu_si128(ptr.add(i) as *const _);
571            let lo_nibble = _mm_and_si128(input, lo_mask);
572            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
573
574            let mut result = _mm_setzero_si128();
575
576            macro_rules! do_nibble {
577                ($h:expr) => {
578                    let h_val = _mm_set1_epi8($h as i8);
579                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
580                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
581                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
582                };
583            }
584            do_nibble!(0);
585            do_nibble!(1);
586            do_nibble!(2);
587            do_nibble!(3);
588            do_nibble!(4);
589            do_nibble!(5);
590            do_nibble!(6);
591            do_nibble!(7);
592            do_nibble!(8);
593            do_nibble!(9);
594            do_nibble!(10);
595            do_nibble!(11);
596            do_nibble!(12);
597            do_nibble!(13);
598            do_nibble!(14);
599            do_nibble!(15);
600
601            _mm_storeu_si128(ptr.add(i) as *mut _, result);
602            i += 16;
603        }
604
605        // Scalar tail
606        while i < len {
607            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
608            i += 1;
609        }
610    }
611}
612
613/// Translate bytes from source to destination using a 256-byte lookup table.
614/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
615#[inline(always)]
616fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
617    debug_assert!(dst.len() >= src.len());
618    #[cfg(target_arch = "x86_64")]
619    {
620        let level = get_simd_level();
621        if level >= 3 {
622            // Use nontemporal stores when dst is 32-byte aligned (large Vec allocations)
623            if dst.as_ptr() as usize & 31 == 0 {
624                unsafe { translate_to_avx2_table_nt(src, dst, table) };
625            } else {
626                unsafe { translate_to_avx2_table(src, dst, table) };
627            }
628            return;
629        }
630        if level >= 2 {
631            unsafe { translate_to_ssse3_table(src, dst, table) };
632            return;
633        }
634    }
635    translate_to_scalar(src, dst, table);
636}
637
638/// Scalar fallback for translate_to.
639#[cfg(not(target_arch = "aarch64"))]
640#[inline(always)]
641fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
642    unsafe {
643        let sp = src.as_ptr();
644        let dp = dst.as_mut_ptr();
645        let len = src.len();
646        let mut i = 0;
647        while i + 8 <= len {
648            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
649            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
650            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
651            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
652            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
653            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
654            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
655            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
656            i += 8;
657        }
658        while i < len {
659            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
660            i += 1;
661        }
662    }
663}
664
665/// ARM64 NEON table-lookup translate_to using nibble decomposition.
666#[cfg(target_arch = "aarch64")]
667#[inline(always)]
668fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
669    unsafe { translate_to_neon_table(src, dst, table) };
670}
671
672#[cfg(target_arch = "aarch64")]
673#[target_feature(enable = "neon")]
674unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
675    use std::arch::aarch64::*;
676
677    unsafe {
678        let len = src.len();
679        let sp = src.as_ptr();
680        let dp = dst.as_mut_ptr();
681
682        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
683        for h in 0u8..16 {
684            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
685        }
686
687        let lo_mask = vdupq_n_u8(0x0F);
688        let mut i = 0;
689
690        while i + 16 <= len {
691            let input = vld1q_u8(sp.add(i));
692            let lo_nibble = vandq_u8(input, lo_mask);
693            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
694
695            let mut result = vdupq_n_u8(0);
696            macro_rules! do_nibble {
697                ($h:expr) => {
698                    let h_val = vdupq_n_u8($h);
699                    let mask = vceqq_u8(hi_nibble, h_val);
700                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
701                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
702                };
703            }
704            do_nibble!(0);
705            do_nibble!(1);
706            do_nibble!(2);
707            do_nibble!(3);
708            do_nibble!(4);
709            do_nibble!(5);
710            do_nibble!(6);
711            do_nibble!(7);
712            do_nibble!(8);
713            do_nibble!(9);
714            do_nibble!(10);
715            do_nibble!(11);
716            do_nibble!(12);
717            do_nibble!(13);
718            do_nibble!(14);
719            do_nibble!(15);
720
721            vst1q_u8(dp.add(i), result);
722            i += 16;
723        }
724
725        while i < len {
726            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
727            i += 1;
728        }
729    }
730}
731
732#[cfg(target_arch = "x86_64")]
733#[target_feature(enable = "avx2")]
734unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
735    use std::arch::x86_64::*;
736
737    unsafe {
738        let len = src.len();
739        let sp = src.as_ptr();
740        let dp = dst.as_mut_ptr();
741
742        // Pre-build 16 lookup vectors
743        let mut lut = [_mm256_setzero_si256(); 16];
744        for h in 0u8..16 {
745            let base = (h as usize) * 16;
746            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
747            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
748            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
749        }
750
751        let lo_mask = _mm256_set1_epi8(0x0F);
752
753        let mut i = 0;
754
755        // 2x unrolled: process 64 bytes per iteration for better ILP
756        while i + 64 <= len {
757            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
758            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
759
760            let lo0 = _mm256_and_si256(input0, lo_mask);
761            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
762            let lo1 = _mm256_and_si256(input1, lo_mask);
763            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
764
765            let mut r0 = _mm256_setzero_si256();
766            let mut r1 = _mm256_setzero_si256();
767
768            macro_rules! do_nibble2 {
769                ($h:expr) => {
770                    let h_val = _mm256_set1_epi8($h as i8);
771                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
772                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
773                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
774                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
775                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
776                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
777                };
778            }
779            do_nibble2!(0);
780            do_nibble2!(1);
781            do_nibble2!(2);
782            do_nibble2!(3);
783            do_nibble2!(4);
784            do_nibble2!(5);
785            do_nibble2!(6);
786            do_nibble2!(7);
787            do_nibble2!(8);
788            do_nibble2!(9);
789            do_nibble2!(10);
790            do_nibble2!(11);
791            do_nibble2!(12);
792            do_nibble2!(13);
793            do_nibble2!(14);
794            do_nibble2!(15);
795
796            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
797            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
798            i += 64;
799        }
800
801        // Remaining 32-byte chunk
802        if i + 32 <= len {
803            let input = _mm256_loadu_si256(sp.add(i) as *const _);
804            let lo_nibble = _mm256_and_si256(input, lo_mask);
805            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
806
807            let mut result = _mm256_setzero_si256();
808
809            macro_rules! do_nibble {
810                ($h:expr) => {
811                    let h_val = _mm256_set1_epi8($h as i8);
812                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
813                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
814                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
815                };
816            }
817            do_nibble!(0);
818            do_nibble!(1);
819            do_nibble!(2);
820            do_nibble!(3);
821            do_nibble!(4);
822            do_nibble!(5);
823            do_nibble!(6);
824            do_nibble!(7);
825            do_nibble!(8);
826            do_nibble!(9);
827            do_nibble!(10);
828            do_nibble!(11);
829            do_nibble!(12);
830            do_nibble!(13);
831            do_nibble!(14);
832            do_nibble!(15);
833
834            _mm256_storeu_si256(dp.add(i) as *mut _, result);
835            i += 32;
836        }
837
838        // SSSE3 tail for remaining 16-byte chunk
839        if i + 16 <= len {
840            let lo_mask128 = _mm_set1_epi8(0x0F);
841            let mut lut128 = [_mm_setzero_si128(); 16];
842            for h in 0u8..16 {
843                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
844            }
845
846            let input = _mm_loadu_si128(sp.add(i) as *const _);
847            let lo_nib = _mm_and_si128(input, lo_mask128);
848            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
849
850            let mut res = _mm_setzero_si128();
851            macro_rules! do_nibble128 {
852                ($h:expr) => {
853                    let h_val = _mm_set1_epi8($h as i8);
854                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
855                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
856                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
857                };
858            }
859            do_nibble128!(0);
860            do_nibble128!(1);
861            do_nibble128!(2);
862            do_nibble128!(3);
863            do_nibble128!(4);
864            do_nibble128!(5);
865            do_nibble128!(6);
866            do_nibble128!(7);
867            do_nibble128!(8);
868            do_nibble128!(9);
869            do_nibble128!(10);
870            do_nibble128!(11);
871            do_nibble128!(12);
872            do_nibble128!(13);
873            do_nibble128!(14);
874            do_nibble128!(15);
875
876            _mm_storeu_si128(dp.add(i) as *mut _, res);
877            i += 16;
878        }
879
880        // Scalar tail
881        while i < len {
882            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
883            i += 1;
884        }
885    }
886}
887
888/// Nontemporal variant of translate_to_avx2_table: uses _mm256_stream_si256 for stores.
889/// Avoids RFO cache traffic for the destination buffer in streaming translate operations.
890#[cfg(target_arch = "x86_64")]
891#[target_feature(enable = "avx2")]
892unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
893    use std::arch::x86_64::*;
894
895    unsafe {
896        let len = src.len();
897        let sp = src.as_ptr();
898        let dp = dst.as_mut_ptr();
899
900        // Pre-build 16 lookup vectors
901        let mut lut = [_mm256_setzero_si256(); 16];
902        for h in 0u8..16 {
903            let base = (h as usize) * 16;
904            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
905            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
906            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
907        }
908
909        let lo_mask = _mm256_set1_epi8(0x0F);
910        let mut i = 0;
911
912        // 2x unrolled with nontemporal stores
913        while i + 64 <= len {
914            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
915            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
916
917            let lo0 = _mm256_and_si256(input0, lo_mask);
918            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
919            let lo1 = _mm256_and_si256(input1, lo_mask);
920            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
921
922            let mut r0 = _mm256_setzero_si256();
923            let mut r1 = _mm256_setzero_si256();
924
925            macro_rules! do_nibble2 {
926                ($h:expr) => {
927                    let h_val = _mm256_set1_epi8($h as i8);
928                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
929                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
930                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
931                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
932                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
933                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
934                };
935            }
936            do_nibble2!(0);
937            do_nibble2!(1);
938            do_nibble2!(2);
939            do_nibble2!(3);
940            do_nibble2!(4);
941            do_nibble2!(5);
942            do_nibble2!(6);
943            do_nibble2!(7);
944            do_nibble2!(8);
945            do_nibble2!(9);
946            do_nibble2!(10);
947            do_nibble2!(11);
948            do_nibble2!(12);
949            do_nibble2!(13);
950            do_nibble2!(14);
951            do_nibble2!(15);
952
953            _mm256_stream_si256(dp.add(i) as *mut _, r0);
954            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
955            i += 64;
956        }
957
958        // Remaining 32-byte chunk
959        if i + 32 <= len {
960            let input = _mm256_loadu_si256(sp.add(i) as *const _);
961            let lo_nibble = _mm256_and_si256(input, lo_mask);
962            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
963
964            let mut result = _mm256_setzero_si256();
965            macro_rules! do_nibble {
966                ($h:expr) => {
967                    let h_val = _mm256_set1_epi8($h as i8);
968                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
969                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
970                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
971                };
972            }
973            do_nibble!(0);
974            do_nibble!(1);
975            do_nibble!(2);
976            do_nibble!(3);
977            do_nibble!(4);
978            do_nibble!(5);
979            do_nibble!(6);
980            do_nibble!(7);
981            do_nibble!(8);
982            do_nibble!(9);
983            do_nibble!(10);
984            do_nibble!(11);
985            do_nibble!(12);
986            do_nibble!(13);
987            do_nibble!(14);
988            do_nibble!(15);
989
990            _mm256_stream_si256(dp.add(i) as *mut _, result);
991            i += 32;
992        }
993
994        // SSSE3 tail for remaining 16-byte chunk (regular store)
995        if i + 16 <= len {
996            let lo_mask128 = _mm_set1_epi8(0x0F);
997            let mut lut128 = [_mm_setzero_si128(); 16];
998            for h in 0u8..16 {
999                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
1000            }
1001
1002            let input = _mm_loadu_si128(sp.add(i) as *const _);
1003            let lo_nib = _mm_and_si128(input, lo_mask128);
1004            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
1005
1006            let mut res = _mm_setzero_si128();
1007            macro_rules! do_nibble128 {
1008                ($h:expr) => {
1009                    let h_val = _mm_set1_epi8($h as i8);
1010                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1011                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1012                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1013                };
1014            }
1015            do_nibble128!(0);
1016            do_nibble128!(1);
1017            do_nibble128!(2);
1018            do_nibble128!(3);
1019            do_nibble128!(4);
1020            do_nibble128!(5);
1021            do_nibble128!(6);
1022            do_nibble128!(7);
1023            do_nibble128!(8);
1024            do_nibble128!(9);
1025            do_nibble128!(10);
1026            do_nibble128!(11);
1027            do_nibble128!(12);
1028            do_nibble128!(13);
1029            do_nibble128!(14);
1030            do_nibble128!(15);
1031
1032            _mm_storeu_si128(dp.add(i) as *mut _, res);
1033            i += 16;
1034        }
1035
1036        // Scalar tail
1037        while i < len {
1038            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1039            i += 1;
1040        }
1041
1042        // Fence: ensure nontemporal stores are visible before write() syscall
1043        _mm_sfence();
1044    }
1045}
1046
1047#[cfg(target_arch = "x86_64")]
1048#[target_feature(enable = "ssse3")]
1049unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1050    use std::arch::x86_64::*;
1051
1052    unsafe {
1053        let len = src.len();
1054        let sp = src.as_ptr();
1055        let dp = dst.as_mut_ptr();
1056
1057        let mut lut = [_mm_setzero_si128(); 16];
1058        for h in 0u8..16 {
1059            let base = (h as usize) * 16;
1060            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1061            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1062        }
1063
1064        let lo_mask = _mm_set1_epi8(0x0F);
1065
1066        let mut i = 0;
1067        while i + 16 <= len {
1068            let input = _mm_loadu_si128(sp.add(i) as *const _);
1069            let lo_nibble = _mm_and_si128(input, lo_mask);
1070            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1071
1072            let mut result = _mm_setzero_si128();
1073
1074            macro_rules! do_nibble {
1075                ($h:expr) => {
1076                    let h_val = _mm_set1_epi8($h as i8);
1077                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1078                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1079                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1080                };
1081            }
1082            do_nibble!(0);
1083            do_nibble!(1);
1084            do_nibble!(2);
1085            do_nibble!(3);
1086            do_nibble!(4);
1087            do_nibble!(5);
1088            do_nibble!(6);
1089            do_nibble!(7);
1090            do_nibble!(8);
1091            do_nibble!(9);
1092            do_nibble!(10);
1093            do_nibble!(11);
1094            do_nibble!(12);
1095            do_nibble!(13);
1096            do_nibble!(14);
1097            do_nibble!(15);
1098
1099            _mm_storeu_si128(dp.add(i) as *mut _, result);
1100            i += 16;
1101        }
1102
1103        // Scalar tail
1104        while i < len {
1105            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1106            i += 1;
1107        }
1108    }
1109}
1110
1111// ============================================================================
1112// SIMD range translation (x86_64)
1113// ============================================================================
1114
1115/// Detect if the translate table is a single contiguous range with constant offset.
1116/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
1117/// table[i] = i + offset for all i in [lo, hi].
1118#[inline]
1119fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1120    let mut lo: Option<u8> = None;
1121    let mut hi = 0u8;
1122    let mut offset = 0i16;
1123
1124    for i in 0..256 {
1125        if table[i] != i as u8 {
1126            let diff = table[i] as i16 - i as i16;
1127            match lo {
1128                None => {
1129                    lo = Some(i as u8);
1130                    hi = i as u8;
1131                    offset = diff;
1132                }
1133                Some(_) => {
1134                    if diff != offset || i as u8 != hi.wrapping_add(1) {
1135                        return None;
1136                    }
1137                    hi = i as u8;
1138                }
1139            }
1140        }
1141    }
1142
1143    lo.map(|l| (l, hi, offset as i8))
1144}
1145
1146/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
1147/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
1148/// a range maps to one replacement character.
1149/// Returns Some((lo, hi, replacement)) if the pattern matches.
1150#[inline]
1151fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1152    let mut lo: Option<u8> = None;
1153    let mut hi = 0u8;
1154    let mut replacement = 0u8;
1155
1156    for i in 0..256 {
1157        if table[i] != i as u8 {
1158            match lo {
1159                None => {
1160                    lo = Some(i as u8);
1161                    hi = i as u8;
1162                    replacement = table[i];
1163                }
1164                Some(_) => {
1165                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1166                        return None;
1167                    }
1168                    hi = i as u8;
1169                }
1170            }
1171        }
1172    }
1173
1174    lo.map(|l| (l, hi, replacement))
1175}
1176
1177/// SIMD-accelerated range-to-constant translation.
1178/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
1179/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
1180/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
1181#[cfg(target_arch = "x86_64")]
1182fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1183    if get_simd_level() >= 3 {
1184        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1185    } else {
1186        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1187    }
1188}
1189
1190#[cfg(target_arch = "x86_64")]
1191#[target_feature(enable = "avx2")]
1192unsafe fn translate_range_to_constant_avx2_inplace(
1193    data: &mut [u8],
1194    lo: u8,
1195    hi: u8,
1196    replacement: u8,
1197) {
1198    use std::arch::x86_64::*;
1199
1200    unsafe {
1201        let range = hi - lo;
1202        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1203        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1204        let repl_v = _mm256_set1_epi8(replacement as i8);
1205        let zero = _mm256_setzero_si256();
1206
1207        let len = data.len();
1208        let ptr = data.as_mut_ptr();
1209        let mut i = 0;
1210
1211        // 2x unrolled: process 64 bytes per iteration for better ILP
1212        while i + 64 <= len {
1213            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1214            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1215            let bi0 = _mm256_add_epi8(in0, bias_v);
1216            let bi1 = _mm256_add_epi8(in1, bias_v);
1217            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1218            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1219            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1220            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1221            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1222            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1223            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1224            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1225            i += 64;
1226        }
1227
1228        // Remaining 32-byte chunk
1229        if i + 32 <= len {
1230            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1231            let biased = _mm256_add_epi8(input, bias_v);
1232            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1233            let in_range = _mm256_cmpeq_epi8(gt, zero);
1234            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1235            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1236            i += 32;
1237        }
1238
1239        if i + 16 <= len {
1240            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1241            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1242            let repl_v128 = _mm_set1_epi8(replacement as i8);
1243            let zero128 = _mm_setzero_si128();
1244
1245            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1246            let biased = _mm_add_epi8(input, bias_v128);
1247            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1248            let in_range = _mm_cmpeq_epi8(gt, zero128);
1249            let result = _mm_blendv_epi8(input, repl_v128, in_range);
1250            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1251            i += 16;
1252        }
1253
1254        while i < len {
1255            let b = *ptr.add(i);
1256            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1257            i += 1;
1258        }
1259    }
1260}
1261
1262#[cfg(target_arch = "x86_64")]
1263#[target_feature(enable = "sse2")]
1264unsafe fn translate_range_to_constant_sse2_inplace(
1265    data: &mut [u8],
1266    lo: u8,
1267    hi: u8,
1268    replacement: u8,
1269) {
1270    use std::arch::x86_64::*;
1271
1272    unsafe {
1273        let range = hi - lo;
1274        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1275        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1276        let repl_v = _mm_set1_epi8(replacement as i8);
1277        let zero = _mm_setzero_si128();
1278
1279        let len = data.len();
1280        let ptr = data.as_mut_ptr();
1281        let mut i = 0;
1282
1283        while i + 16 <= len {
1284            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1285            let biased = _mm_add_epi8(input, bias_v);
1286            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1287            // in_range mask: 0xFF where in range, 0x00 where not
1288            let in_range = _mm_cmpeq_epi8(gt, zero);
1289            // SSE2 blendv: (repl & mask) | (input & ~mask)
1290            let result = _mm_or_si128(
1291                _mm_and_si128(in_range, repl_v),
1292                _mm_andnot_si128(in_range, input),
1293            );
1294            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1295            i += 16;
1296        }
1297
1298        while i < len {
1299            let b = *ptr.add(i);
1300            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1301            i += 1;
1302        }
1303    }
1304}
1305
1306#[cfg(target_arch = "aarch64")]
1307fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1308    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1309}
1310
1311#[cfg(target_arch = "aarch64")]
1312#[target_feature(enable = "neon")]
1313unsafe fn translate_range_to_constant_neon_inplace(
1314    data: &mut [u8],
1315    lo: u8,
1316    hi: u8,
1317    replacement: u8,
1318) {
1319    use std::arch::aarch64::*;
1320
1321    unsafe {
1322        let len = data.len();
1323        let ptr = data.as_mut_ptr();
1324        let lo_v = vdupq_n_u8(lo);
1325        let hi_v = vdupq_n_u8(hi);
1326        let repl_v = vdupq_n_u8(replacement);
1327        let mut i = 0;
1328
1329        while i + 32 <= len {
1330            let in0 = vld1q_u8(ptr.add(i));
1331            let in1 = vld1q_u8(ptr.add(i + 16));
1332            let ge0 = vcgeq_u8(in0, lo_v);
1333            let le0 = vcleq_u8(in0, hi_v);
1334            let mask0 = vandq_u8(ge0, le0);
1335            let ge1 = vcgeq_u8(in1, lo_v);
1336            let le1 = vcleq_u8(in1, hi_v);
1337            let mask1 = vandq_u8(ge1, le1);
1338            // bsl: select repl where mask, keep input where not
1339            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1340            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1341            i += 32;
1342        }
1343
1344        if i + 16 <= len {
1345            let input = vld1q_u8(ptr.add(i));
1346            let ge = vcgeq_u8(input, lo_v);
1347            let le = vcleq_u8(input, hi_v);
1348            let mask = vandq_u8(ge, le);
1349            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1350            i += 16;
1351        }
1352
1353        while i < len {
1354            let b = *ptr.add(i);
1355            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1356            i += 1;
1357        }
1358    }
1359}
1360
1361#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1362fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1363    for b in data.iter_mut() {
1364        if *b >= lo && *b <= hi {
1365            *b = replacement;
1366        }
1367    }
1368}
1369
1370/// SIMD range-to-constant translation from src to dst (no intermediate copy needed).
1371/// Reads from src, writes translated result to dst in a single pass.
1372#[cfg(target_arch = "x86_64")]
1373fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1374    if get_simd_level() >= 3 {
1375        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1376    } else {
1377        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1378    }
1379}
1380
1381#[cfg(target_arch = "aarch64")]
1382fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1383    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1384}
1385
1386#[cfg(target_arch = "aarch64")]
1387#[target_feature(enable = "neon")]
1388unsafe fn translate_range_to_constant_neon(
1389    src: &[u8],
1390    dst: &mut [u8],
1391    lo: u8,
1392    hi: u8,
1393    replacement: u8,
1394) {
1395    use std::arch::aarch64::*;
1396
1397    unsafe {
1398        let len = src.len();
1399        let sp = src.as_ptr();
1400        let dp = dst.as_mut_ptr();
1401        let lo_v = vdupq_n_u8(lo);
1402        let hi_v = vdupq_n_u8(hi);
1403        let repl_v = vdupq_n_u8(replacement);
1404        let mut i = 0;
1405
1406        while i + 32 <= len {
1407            let in0 = vld1q_u8(sp.add(i));
1408            let in1 = vld1q_u8(sp.add(i + 16));
1409            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1410            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1411            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1412            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1413            i += 32;
1414        }
1415
1416        if i + 16 <= len {
1417            let input = vld1q_u8(sp.add(i));
1418            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1419            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1420            i += 16;
1421        }
1422
1423        while i < len {
1424            let b = *sp.add(i);
1425            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1426            i += 1;
1427        }
1428    }
1429}
1430
1431#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1432fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1433    for (i, &b) in src.iter().enumerate() {
1434        unsafe {
1435            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1436        }
1437    }
1438}
1439
1440#[cfg(target_arch = "x86_64")]
1441#[target_feature(enable = "avx2")]
1442unsafe fn translate_range_to_constant_avx2(
1443    src: &[u8],
1444    dst: &mut [u8],
1445    lo: u8,
1446    hi: u8,
1447    replacement: u8,
1448) {
1449    use std::arch::x86_64::*;
1450    unsafe {
1451        let range = hi - lo;
1452        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1453        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1454        let repl_v = _mm256_set1_epi8(replacement as i8);
1455        let zero = _mm256_setzero_si256();
1456        let len = src.len();
1457        let sp = src.as_ptr();
1458        let dp = dst.as_mut_ptr();
1459        let mut i = 0;
1460        while i + 64 <= len {
1461            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1462            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1463            let bi0 = _mm256_add_epi8(in0, bias_v);
1464            let bi1 = _mm256_add_epi8(in1, bias_v);
1465            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1466            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1467            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1468            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1469            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1470            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1471            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1472            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1473            i += 64;
1474        }
1475        if i + 32 <= len {
1476            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1477            let biased = _mm256_add_epi8(input, bias_v);
1478            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1479            let in_range = _mm256_cmpeq_epi8(gt, zero);
1480            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1481            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1482            i += 32;
1483        }
1484        while i < len {
1485            let b = *sp.add(i);
1486            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1487            i += 1;
1488        }
1489    }
1490}
1491
1492#[cfg(target_arch = "x86_64")]
1493#[target_feature(enable = "sse2")]
1494unsafe fn translate_range_to_constant_sse2(
1495    src: &[u8],
1496    dst: &mut [u8],
1497    lo: u8,
1498    hi: u8,
1499    replacement: u8,
1500) {
1501    use std::arch::x86_64::*;
1502    unsafe {
1503        let range = hi - lo;
1504        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1505        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1506        let repl_v = _mm_set1_epi8(replacement as i8);
1507        let zero = _mm_setzero_si128();
1508        let len = src.len();
1509        let sp = src.as_ptr();
1510        let dp = dst.as_mut_ptr();
1511        let mut i = 0;
1512        while i + 16 <= len {
1513            let input = _mm_loadu_si128(sp.add(i) as *const _);
1514            let biased = _mm_add_epi8(input, bias_v);
1515            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1516            let in_range = _mm_cmpeq_epi8(gt, zero);
1517            let result = _mm_or_si128(
1518                _mm_and_si128(in_range, repl_v),
1519                _mm_andnot_si128(in_range, input),
1520            );
1521            _mm_storeu_si128(dp.add(i) as *mut _, result);
1522            i += 16;
1523        }
1524        while i < len {
1525            let b = *sp.add(i);
1526            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1527            i += 1;
1528        }
1529    }
1530}
1531
1532/// SIMD-accelerated range translation for mmap'd data.
1533/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
1534/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
1535/// When dst is 32-byte aligned (true for large Vec allocations from mmap), uses
1536/// nontemporal stores to bypass cache, avoiding read-for-ownership overhead and
1537/// reducing memory traffic by ~33% for streaming writes.
1538#[cfg(target_arch = "x86_64")]
1539fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1540    if get_simd_level() >= 3 {
1541        // Use nontemporal stores when dst is 32-byte aligned (typical for large allocs)
1542        if dst.as_ptr() as usize & 31 == 0 {
1543            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1544        } else {
1545            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1546        }
1547    } else {
1548        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1549    }
1550}
1551
1552#[cfg(target_arch = "x86_64")]
1553#[target_feature(enable = "avx2")]
1554unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1555    use std::arch::x86_64::*;
1556
1557    unsafe {
1558        let range = hi - lo;
1559        // Bias: shift range so lo maps to -128 (signed min).
1560        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
1561        // For input < lo: biased wraps to large positive (signed), > threshold.
1562        // For input > hi: biased > -128+range, > threshold.
1563        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1564        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1565        let offset_v = _mm256_set1_epi8(offset);
1566        let zero = _mm256_setzero_si256();
1567
1568        let len = src.len();
1569        let sp = src.as_ptr();
1570        let dp = dst.as_mut_ptr();
1571        let mut i = 0;
1572
1573        // 2x unrolled: process 64 bytes per iteration for better ILP.
1574        // Load/compute on the second vector while the first is in-flight.
1575        while i + 64 <= len {
1576            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1577            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1578            let bi0 = _mm256_add_epi8(in0, bias_v);
1579            let bi1 = _mm256_add_epi8(in1, bias_v);
1580            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1581            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1582            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1583            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1584            let om0 = _mm256_and_si256(m0, offset_v);
1585            let om1 = _mm256_and_si256(m1, offset_v);
1586            let r0 = _mm256_add_epi8(in0, om0);
1587            let r1 = _mm256_add_epi8(in1, om1);
1588            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1589            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1590            i += 64;
1591        }
1592
1593        // Remaining 32-byte chunk
1594        if i + 32 <= len {
1595            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1596            let biased = _mm256_add_epi8(input, bias_v);
1597            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1598            let mask = _mm256_cmpeq_epi8(gt, zero);
1599            let offset_masked = _mm256_and_si256(mask, offset_v);
1600            let result = _mm256_add_epi8(input, offset_masked);
1601            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1602            i += 32;
1603        }
1604
1605        // SSE2 tail for 16-byte remainder
1606        if i + 16 <= len {
1607            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1608            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1609            let offset_v128 = _mm_set1_epi8(offset);
1610            let zero128 = _mm_setzero_si128();
1611
1612            let input = _mm_loadu_si128(sp.add(i) as *const _);
1613            let biased = _mm_add_epi8(input, bias_v128);
1614            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1615            let mask = _mm_cmpeq_epi8(gt, zero128);
1616            let offset_masked = _mm_and_si128(mask, offset_v128);
1617            let result = _mm_add_epi8(input, offset_masked);
1618            _mm_storeu_si128(dp.add(i) as *mut _, result);
1619            i += 16;
1620        }
1621
1622        // Scalar tail
1623        while i < len {
1624            let b = *sp.add(i);
1625            *dp.add(i) = if b >= lo && b <= hi {
1626                b.wrapping_add(offset as u8)
1627            } else {
1628                b
1629            };
1630            i += 1;
1631        }
1632    }
1633}
1634
1635/// Nontemporal variant of translate_range_avx2: uses _mm256_stream_si256 for stores.
1636/// This bypasses the cache for writes, avoiding read-for-ownership (RFO) traffic on
1637/// the destination buffer. For streaming translate (src → dst, dst not read again),
1638/// this reduces memory traffic by ~33% (10MB input: 20MB vs 30MB total traffic).
1639/// Requires dst to be 32-byte aligned (guaranteed for large Vec/mmap allocations).
1640#[cfg(target_arch = "x86_64")]
1641#[target_feature(enable = "avx2")]
1642unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1643    use std::arch::x86_64::*;
1644
1645    unsafe {
1646        let range = hi - lo;
1647        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1648        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1649        let offset_v = _mm256_set1_epi8(offset);
1650        let zero = _mm256_setzero_si256();
1651
1652        let len = src.len();
1653        let sp = src.as_ptr();
1654        let dp = dst.as_mut_ptr();
1655        let mut i = 0;
1656
1657        // 2x unrolled with nontemporal stores
1658        while i + 64 <= len {
1659            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1660            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1661            let bi0 = _mm256_add_epi8(in0, bias_v);
1662            let bi1 = _mm256_add_epi8(in1, bias_v);
1663            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1664            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1665            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1666            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1667            let om0 = _mm256_and_si256(m0, offset_v);
1668            let om1 = _mm256_and_si256(m1, offset_v);
1669            let r0 = _mm256_add_epi8(in0, om0);
1670            let r1 = _mm256_add_epi8(in1, om1);
1671            _mm256_stream_si256(dp.add(i) as *mut _, r0);
1672            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1673            i += 64;
1674        }
1675
1676        // Remaining 32-byte chunk (still nontemporal if aligned)
1677        if i + 32 <= len {
1678            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1679            let biased = _mm256_add_epi8(input, bias_v);
1680            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1681            let mask = _mm256_cmpeq_epi8(gt, zero);
1682            let offset_masked = _mm256_and_si256(mask, offset_v);
1683            let result = _mm256_add_epi8(input, offset_masked);
1684            _mm256_stream_si256(dp.add(i) as *mut _, result);
1685            i += 32;
1686        }
1687
1688        // SSE2 tail for 16-byte remainder (regular store — only 16 bytes)
1689        if i + 16 <= len {
1690            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1691            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1692            let offset_v128 = _mm_set1_epi8(offset);
1693            let zero128 = _mm_setzero_si128();
1694
1695            let input = _mm_loadu_si128(sp.add(i) as *const _);
1696            let biased = _mm_add_epi8(input, bias_v128);
1697            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1698            let mask = _mm_cmpeq_epi8(gt, zero128);
1699            let offset_masked = _mm_and_si128(mask, offset_v128);
1700            let result = _mm_add_epi8(input, offset_masked);
1701            _mm_storeu_si128(dp.add(i) as *mut _, result);
1702            i += 16;
1703        }
1704
1705        // Scalar tail
1706        while i < len {
1707            let b = *sp.add(i);
1708            *dp.add(i) = if b >= lo && b <= hi {
1709                b.wrapping_add(offset as u8)
1710            } else {
1711                b
1712            };
1713            i += 1;
1714        }
1715
1716        // Fence: ensure nontemporal stores are visible before write() syscall
1717        _mm_sfence();
1718    }
1719}
1720
1721#[cfg(target_arch = "x86_64")]
1722#[target_feature(enable = "sse2")]
1723unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1724    use std::arch::x86_64::*;
1725
1726    unsafe {
1727        let range = hi - lo;
1728        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1729        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1730        let offset_v = _mm_set1_epi8(offset);
1731        let zero = _mm_setzero_si128();
1732
1733        let len = src.len();
1734        let mut i = 0;
1735
1736        while i + 16 <= len {
1737            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1738            let biased = _mm_add_epi8(input, bias_v);
1739            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1740            let mask = _mm_cmpeq_epi8(gt, zero);
1741            let offset_masked = _mm_and_si128(mask, offset_v);
1742            let result = _mm_add_epi8(input, offset_masked);
1743            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1744            i += 16;
1745        }
1746
1747        while i < len {
1748            let b = *src.get_unchecked(i);
1749            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1750                b.wrapping_add(offset as u8)
1751            } else {
1752                b
1753            };
1754            i += 1;
1755        }
1756    }
1757}
1758
1759/// ARM64 NEON-accelerated range translation.
1760/// Processes 16 bytes per iteration using vectorized range check + conditional add.
1761#[cfg(target_arch = "aarch64")]
1762fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1763    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1764}
1765
1766#[cfg(target_arch = "aarch64")]
1767#[target_feature(enable = "neon")]
1768unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1769    use std::arch::aarch64::*;
1770
1771    unsafe {
1772        let len = src.len();
1773        let sp = src.as_ptr();
1774        let dp = dst.as_mut_ptr();
1775        let lo_v = vdupq_n_u8(lo);
1776        let hi_v = vdupq_n_u8(hi);
1777        let offset_v = vdupq_n_s8(offset);
1778        let mut i = 0;
1779
1780        // 2x unrolled: process 32 bytes per iteration
1781        while i + 32 <= len {
1782            let in0 = vld1q_u8(sp.add(i));
1783            let in1 = vld1q_u8(sp.add(i + 16));
1784            // Range check: (b >= lo) & (b <= hi)
1785            let ge0 = vcgeq_u8(in0, lo_v);
1786            let le0 = vcleq_u8(in0, hi_v);
1787            let mask0 = vandq_u8(ge0, le0);
1788            let ge1 = vcgeq_u8(in1, lo_v);
1789            let le1 = vcleq_u8(in1, hi_v);
1790            let mask1 = vandq_u8(ge1, le1);
1791            // Conditional add: in + (offset & mask)
1792            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1793            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1794            let r0 = vaddq_u8(in0, off0);
1795            let r1 = vaddq_u8(in1, off1);
1796            vst1q_u8(dp.add(i), r0);
1797            vst1q_u8(dp.add(i + 16), r1);
1798            i += 32;
1799        }
1800
1801        if i + 16 <= len {
1802            let input = vld1q_u8(sp.add(i));
1803            let ge = vcgeq_u8(input, lo_v);
1804            let le = vcleq_u8(input, hi_v);
1805            let mask = vandq_u8(ge, le);
1806            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1807            vst1q_u8(dp.add(i), vaddq_u8(input, off));
1808            i += 16;
1809        }
1810
1811        while i < len {
1812            let b = *sp.add(i);
1813            *dp.add(i) = if b >= lo && b <= hi {
1814                b.wrapping_add(offset as u8)
1815            } else {
1816                b
1817            };
1818            i += 1;
1819        }
1820    }
1821}
1822
1823/// Scalar range translation fallback for non-x86_64, non-aarch64 platforms.
1824#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1825fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1826    let offset_u8 = offset as u8;
1827    let range = hi.wrapping_sub(lo);
1828    unsafe {
1829        let sp = src.as_ptr();
1830        let dp = dst.as_mut_ptr();
1831        let len = src.len();
1832        let mut i = 0;
1833        while i + 8 <= len {
1834            macro_rules! do_byte {
1835                ($off:expr) => {{
1836                    let b = *sp.add(i + $off);
1837                    let in_range = b.wrapping_sub(lo) <= range;
1838                    *dp.add(i + $off) = if in_range {
1839                        b.wrapping_add(offset_u8)
1840                    } else {
1841                        b
1842                    };
1843                }};
1844            }
1845            do_byte!(0);
1846            do_byte!(1);
1847            do_byte!(2);
1848            do_byte!(3);
1849            do_byte!(4);
1850            do_byte!(5);
1851            do_byte!(6);
1852            do_byte!(7);
1853            i += 8;
1854        }
1855        while i < len {
1856            let b = *sp.add(i);
1857            let in_range = b.wrapping_sub(lo) <= range;
1858            *dp.add(i) = if in_range {
1859                b.wrapping_add(offset_u8)
1860            } else {
1861                b
1862            };
1863            i += 1;
1864        }
1865    }
1866}
1867
1868// ============================================================================
1869// In-place SIMD range translation (saves one buffer allocation in streaming)
1870// ============================================================================
1871
1872/// In-place SIMD-accelerated range translation.
1873/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
1874/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
1875#[cfg(target_arch = "x86_64")]
1876fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1877    if get_simd_level() >= 3 {
1878        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1879    } else {
1880        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1881    }
1882}
1883
1884#[cfg(target_arch = "x86_64")]
1885#[target_feature(enable = "avx2")]
1886unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1887    use std::arch::x86_64::*;
1888
1889    unsafe {
1890        let range = hi - lo;
1891        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1892        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1893        let offset_v = _mm256_set1_epi8(offset);
1894        let zero = _mm256_setzero_si256();
1895
1896        let len = data.len();
1897        let ptr = data.as_mut_ptr();
1898        let mut i = 0;
1899
1900        // 2x unrolled: process 64 bytes per iteration for better ILP
1901        while i + 64 <= len {
1902            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1903            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1904            let bi0 = _mm256_add_epi8(in0, bias_v);
1905            let bi1 = _mm256_add_epi8(in1, bias_v);
1906            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1907            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1908            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1909            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1910            let om0 = _mm256_and_si256(m0, offset_v);
1911            let om1 = _mm256_and_si256(m1, offset_v);
1912            let r0 = _mm256_add_epi8(in0, om0);
1913            let r1 = _mm256_add_epi8(in1, om1);
1914            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1915            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1916            i += 64;
1917        }
1918
1919        // Remaining 32-byte chunk
1920        if i + 32 <= len {
1921            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1922            let biased = _mm256_add_epi8(input, bias_v);
1923            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1924            let mask = _mm256_cmpeq_epi8(gt, zero);
1925            let offset_masked = _mm256_and_si256(mask, offset_v);
1926            let result = _mm256_add_epi8(input, offset_masked);
1927            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1928            i += 32;
1929        }
1930
1931        if i + 16 <= len {
1932            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1933            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1934            let offset_v128 = _mm_set1_epi8(offset);
1935            let zero128 = _mm_setzero_si128();
1936
1937            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1938            let biased = _mm_add_epi8(input, bias_v128);
1939            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1940            let mask = _mm_cmpeq_epi8(gt, zero128);
1941            let offset_masked = _mm_and_si128(mask, offset_v128);
1942            let result = _mm_add_epi8(input, offset_masked);
1943            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1944            i += 16;
1945        }
1946
1947        while i < len {
1948            let b = *ptr.add(i);
1949            *ptr.add(i) = if b >= lo && b <= hi {
1950                b.wrapping_add(offset as u8)
1951            } else {
1952                b
1953            };
1954            i += 1;
1955        }
1956    }
1957}
1958
1959#[cfg(target_arch = "x86_64")]
1960#[target_feature(enable = "sse2")]
1961unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1962    use std::arch::x86_64::*;
1963
1964    unsafe {
1965        let range = hi - lo;
1966        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1967        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1968        let offset_v = _mm_set1_epi8(offset);
1969        let zero = _mm_setzero_si128();
1970
1971        let len = data.len();
1972        let ptr = data.as_mut_ptr();
1973        let mut i = 0;
1974
1975        while i + 16 <= len {
1976            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1977            let biased = _mm_add_epi8(input, bias_v);
1978            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1979            let mask = _mm_cmpeq_epi8(gt, zero);
1980            let offset_masked = _mm_and_si128(mask, offset_v);
1981            let result = _mm_add_epi8(input, offset_masked);
1982            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1983            i += 16;
1984        }
1985
1986        while i < len {
1987            let b = *ptr.add(i);
1988            *ptr.add(i) = if b >= lo && b <= hi {
1989                b.wrapping_add(offset as u8)
1990            } else {
1991                b
1992            };
1993            i += 1;
1994        }
1995    }
1996}
1997
1998#[cfg(target_arch = "aarch64")]
1999fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2000    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
2001}
2002
2003#[cfg(target_arch = "aarch64")]
2004#[target_feature(enable = "neon")]
2005unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2006    use std::arch::aarch64::*;
2007
2008    unsafe {
2009        let len = data.len();
2010        let ptr = data.as_mut_ptr();
2011        let lo_v = vdupq_n_u8(lo);
2012        let hi_v = vdupq_n_u8(hi);
2013        let offset_v = vdupq_n_s8(offset);
2014        let mut i = 0;
2015
2016        while i + 32 <= len {
2017            let in0 = vld1q_u8(ptr.add(i));
2018            let in1 = vld1q_u8(ptr.add(i + 16));
2019            let ge0 = vcgeq_u8(in0, lo_v);
2020            let le0 = vcleq_u8(in0, hi_v);
2021            let mask0 = vandq_u8(ge0, le0);
2022            let ge1 = vcgeq_u8(in1, lo_v);
2023            let le1 = vcleq_u8(in1, hi_v);
2024            let mask1 = vandq_u8(ge1, le1);
2025            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2026            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2027            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2028            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2029            i += 32;
2030        }
2031
2032        if i + 16 <= len {
2033            let input = vld1q_u8(ptr.add(i));
2034            let ge = vcgeq_u8(input, lo_v);
2035            let le = vcleq_u8(input, hi_v);
2036            let mask = vandq_u8(ge, le);
2037            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2038            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2039            i += 16;
2040        }
2041
2042        while i < len {
2043            let b = *ptr.add(i);
2044            if b >= lo && b <= hi {
2045                *ptr.add(i) = b.wrapping_add(offset as u8);
2046            }
2047            i += 1;
2048        }
2049    }
2050}
2051
2052#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2053fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2054    let offset_u8 = offset as u8;
2055    let range = hi.wrapping_sub(lo);
2056    for b in data.iter_mut() {
2057        if b.wrapping_sub(lo) <= range {
2058            *b = b.wrapping_add(offset_u8);
2059        }
2060    }
2061}
2062
2063// ============================================================================
2064// SIMD range deletion (x86_64)
2065// ============================================================================
2066
2067/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
2068/// Returns Some((lo, hi)) if so. This is true for common classes:
2069/// - `[:digit:]` = 0x30..=0x39
2070/// - `a-z` = 0x61..=0x7A
2071/// - `A-Z` = 0x41..=0x5A
2072#[inline]
2073fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2074    if chars.is_empty() {
2075        return None;
2076    }
2077    let mut lo = chars[0];
2078    let mut hi = chars[0];
2079    for &c in &chars[1..] {
2080        if c < lo {
2081            lo = c;
2082        }
2083        if c > hi {
2084            hi = c;
2085        }
2086    }
2087    // Check that the range size matches the number of chars (no gaps)
2088    // Cast to usize before +1 to avoid u8 overflow when hi=255, lo=0 (range=256)
2089    if (hi as usize - lo as usize + 1) == chars.len() {
2090        Some((lo, hi))
2091    } else {
2092        None
2093    }
2094}
2095
2096/// SIMD-accelerated delete for contiguous byte ranges.
2097/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
2098/// then compacts output by skipping matched bytes.
2099#[cfg(target_arch = "x86_64")]
2100fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2101    if get_simd_level() >= 3 {
2102        unsafe { delete_range_avx2(src, dst, lo, hi) }
2103    } else {
2104        unsafe { delete_range_sse2(src, dst, lo, hi) }
2105    }
2106}
2107
2108#[cfg(target_arch = "x86_64")]
2109#[target_feature(enable = "avx2")]
2110unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2111    use std::arch::x86_64::*;
2112
2113    unsafe {
2114        let range = hi - lo;
2115        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2116        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2117        let zero = _mm256_setzero_si256();
2118
2119        let len = src.len();
2120        let sp = src.as_ptr();
2121        let dp = dst.as_mut_ptr();
2122        let mut ri = 0;
2123        let mut wp = 0;
2124
2125        while ri + 32 <= len {
2126            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2127            let biased = _mm256_add_epi8(input, bias_v);
2128            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
2129            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2130            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
2131            let in_range = _mm256_cmpeq_epi8(gt, zero);
2132            // keep_mask bits: 1 = keep (NOT in range)
2133            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2134
2135            if keep_mask == 0xFFFFFFFF {
2136                // All 32 bytes are kept — bulk copy
2137                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2138                wp += 32;
2139            } else if keep_mask != 0 {
2140                // Partial keep — per-lane processing with all-keep fast paths.
2141                // For 4% delete rate, ~72% of 8-byte lanes are all-keep even
2142                // within partial 32-byte blocks. The per-lane check avoids
2143                // the LUT compact overhead for these clean lanes.
2144                let m0 = keep_mask as u8;
2145                let m1 = (keep_mask >> 8) as u8;
2146                let m2 = (keep_mask >> 16) as u8;
2147                let m3 = (keep_mask >> 24) as u8;
2148
2149                if m0 == 0xFF {
2150                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2151                } else if m0 != 0 {
2152                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2153                }
2154                let c0 = m0.count_ones() as usize;
2155
2156                if m1 == 0xFF {
2157                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2158                } else if m1 != 0 {
2159                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2160                }
2161                let c1 = m1.count_ones() as usize;
2162
2163                if m2 == 0xFF {
2164                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2165                } else if m2 != 0 {
2166                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2167                }
2168                let c2 = m2.count_ones() as usize;
2169
2170                if m3 == 0xFF {
2171                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2172                } else if m3 != 0 {
2173                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2174                }
2175                let c3 = m3.count_ones() as usize;
2176                wp += c0 + c1 + c2 + c3;
2177            }
2178            // else: keep_mask == 0 means all bytes deleted, skip entirely
2179            ri += 32;
2180        }
2181
2182        // SSE2 tail for 16-byte remainder
2183        if ri + 16 <= len {
2184            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2185            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2186            let zero128 = _mm_setzero_si128();
2187
2188            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2189            let biased = _mm_add_epi8(input, bias_v128);
2190            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2191            let in_range = _mm_cmpeq_epi8(gt, zero128);
2192            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2193
2194            if keep_mask == 0xFFFF {
2195                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2196                wp += 16;
2197            } else if keep_mask != 0 {
2198                let m0 = keep_mask as u8;
2199                let m1 = (keep_mask >> 8) as u8;
2200                if m0 == 0xFF {
2201                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2202                } else if m0 != 0 {
2203                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2204                }
2205                let c0 = m0.count_ones() as usize;
2206                if m1 == 0xFF {
2207                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2208                } else if m1 != 0 {
2209                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2210                }
2211                wp += c0 + m1.count_ones() as usize;
2212            }
2213            ri += 16;
2214        }
2215
2216        // Scalar tail — branchless: always store, advance wp only for kept bytes
2217        while ri < len {
2218            let b = *sp.add(ri);
2219            *dp.add(wp) = b;
2220            wp += (b < lo || b > hi) as usize;
2221            ri += 1;
2222        }
2223
2224        wp
2225    }
2226}
2227
2228/// Compact 8 source bytes into contiguous output bytes using a keep mask.
2229/// Each bit in `mask` indicates whether the corresponding byte should be kept.
2230/// Uses a precomputed LUT: for each 8-bit mask, the LUT stores indices of set bits.
2231/// Always performs 8 unconditional stores (extra stores past popcount are harmless
2232/// since the write pointer only advances by popcount, and subsequent lanes overwrite).
2233/// This eliminates the serial tzcnt→blsr dependency chain (~28 cycles) in favor of
2234/// independent indexed loads and stores (~15 cycles).
2235#[cfg(target_arch = "x86_64")]
2236#[inline(always)]
2237unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2238    unsafe {
2239        let idx = COMPACT_LUT.get_unchecked(mask as usize);
2240        *dst = *src.add(*idx.get_unchecked(0) as usize);
2241        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2242        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2243        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2244        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2245        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2246        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2247        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2248    }
2249}
2250
2251/// SSSE3 pshufb-based byte compaction. Loads 8 source bytes into an XMM register,
2252/// shuffles kept bytes to the front using COMPACT_LUT + _mm_shuffle_epi8, stores 8 bytes.
2253/// ~4x faster than scalar compact_8bytes: 1 pshufb vs 8 individual indexed byte copies.
2254/// Requires SSSE3; safe to call from AVX2 functions (which imply SSSE3).
2255#[cfg(target_arch = "x86_64")]
2256#[target_feature(enable = "ssse3")]
2257#[inline]
2258unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2259    use std::arch::x86_64::*;
2260    unsafe {
2261        let src_v = _mm_loadl_epi64(src as *const _);
2262        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2263        let out_v = _mm_shuffle_epi8(src_v, shuf);
2264        _mm_storel_epi64(dst as *mut _, out_v);
2265    }
2266}
2267
2268#[cfg(target_arch = "x86_64")]
2269#[target_feature(enable = "sse2")]
2270unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2271    use std::arch::x86_64::*;
2272
2273    unsafe {
2274        let range = hi - lo;
2275        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2276        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2277        let zero = _mm_setzero_si128();
2278
2279        let len = src.len();
2280        let sp = src.as_ptr();
2281        let dp = dst.as_mut_ptr();
2282        let mut ri = 0;
2283        let mut wp = 0;
2284
2285        while ri + 16 <= len {
2286            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2287            let biased = _mm_add_epi8(input, bias_v);
2288            let gt = _mm_cmpgt_epi8(biased, threshold_v);
2289            let in_range = _mm_cmpeq_epi8(gt, zero);
2290            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2291
2292            if keep_mask == 0xFFFF {
2293                // All 16 bytes kept — bulk copy
2294                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2295                wp += 16;
2296            } else if keep_mask != 0 {
2297                let m0 = keep_mask as u8;
2298                let m1 = (keep_mask >> 8) as u8;
2299                if m0 == 0xFF {
2300                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2301                } else if m0 != 0 {
2302                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2303                }
2304                let c0 = m0.count_ones() as usize;
2305                if m1 == 0xFF {
2306                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2307                } else if m1 != 0 {
2308                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2309                }
2310                wp += c0 + m1.count_ones() as usize;
2311            }
2312            ri += 16;
2313        }
2314
2315        // Scalar tail — branchless
2316        while ri < len {
2317            let b = *sp.add(ri);
2318            *dp.add(wp) = b;
2319            wp += (b < lo || b > hi) as usize;
2320            ri += 1;
2321        }
2322
2323        wp
2324    }
2325}
2326
2327/// Branchless range delete fallback for non-x86_64 (ARM64, etc.).
2328/// Unconditional store + conditional pointer advance eliminates branch
2329/// mispredictions. Unrolled 8x for better ILP on out-of-order cores.
2330#[cfg(not(target_arch = "x86_64"))]
2331fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2332    let len = src.len();
2333    let sp = src.as_ptr();
2334    let dp = dst.as_mut_ptr();
2335    let mut wp: usize = 0;
2336    let mut i: usize = 0;
2337
2338    // Unrolled branchless loop — 8 bytes per iteration
2339    while i + 8 <= len {
2340        unsafe {
2341            let b0 = *sp.add(i);
2342            *dp.add(wp) = b0;
2343            wp += (b0 < lo || b0 > hi) as usize;
2344            let b1 = *sp.add(i + 1);
2345            *dp.add(wp) = b1;
2346            wp += (b1 < lo || b1 > hi) as usize;
2347            let b2 = *sp.add(i + 2);
2348            *dp.add(wp) = b2;
2349            wp += (b2 < lo || b2 > hi) as usize;
2350            let b3 = *sp.add(i + 3);
2351            *dp.add(wp) = b3;
2352            wp += (b3 < lo || b3 > hi) as usize;
2353            let b4 = *sp.add(i + 4);
2354            *dp.add(wp) = b4;
2355            wp += (b4 < lo || b4 > hi) as usize;
2356            let b5 = *sp.add(i + 5);
2357            *dp.add(wp) = b5;
2358            wp += (b5 < lo || b5 > hi) as usize;
2359            let b6 = *sp.add(i + 6);
2360            *dp.add(wp) = b6;
2361            wp += (b6 < lo || b6 > hi) as usize;
2362            let b7 = *sp.add(i + 7);
2363            *dp.add(wp) = b7;
2364            wp += (b7 < lo || b7 > hi) as usize;
2365        }
2366        i += 8;
2367    }
2368
2369    // Scalar tail
2370    while i < len {
2371        unsafe {
2372            let b = *sp.add(i);
2373            *dp.add(wp) = b;
2374            wp += (b < lo || b > hi) as usize;
2375        }
2376        i += 1;
2377    }
2378
2379    wp
2380}
2381
2382/// Streaming delete for contiguous byte ranges using SIMD range detection.
2383/// Uses 4MB buffer to reduce syscalls (delete is compute-light, I/O bound).
2384/// When no bytes are deleted from a chunk (common for data with few matches),
2385/// writes directly from the source buffer to avoid the copy overhead.
2386fn delete_range_streaming(
2387    lo: u8,
2388    hi: u8,
2389    reader: &mut impl Read,
2390    writer: &mut impl Write,
2391) -> io::Result<()> {
2392    // Single-buffer in-place delete: eliminates the 16MB dst allocation
2393    // and its ~4000 page faults. For 10MB piped input, saves ~1.2ms.
2394    let mut buf = alloc_uninit_vec(STREAM_BUF);
2395    loop {
2396        let n = read_once(reader, &mut buf)?;
2397        if n == 0 {
2398            break;
2399        }
2400        let wp = delete_range_inplace(&mut buf, n, lo, hi);
2401        if wp > 0 {
2402            writer.write_all(&buf[..wp])?;
2403        }
2404    }
2405    Ok(())
2406}
2407
2408/// In-place range delete: SIMD scan for all-keep blocks + branchless scalar compaction.
2409/// Uses a single buffer — reads at position ri, writes at position wp (wp <= ri always).
2410#[inline]
2411fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2412    #[cfg(target_arch = "x86_64")]
2413    {
2414        let level = get_simd_level();
2415        if level >= 3 {
2416            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2417        }
2418    }
2419    // Scalar fallback: branchless in-place delete
2420    let ptr = buf.as_mut_ptr();
2421    let mut ri = 0;
2422    let mut wp = 0;
2423    unsafe {
2424        while ri + 8 <= n {
2425            let b0 = *ptr.add(ri);
2426            let b1 = *ptr.add(ri + 1);
2427            let b2 = *ptr.add(ri + 2);
2428            let b3 = *ptr.add(ri + 3);
2429            let b4 = *ptr.add(ri + 4);
2430            let b5 = *ptr.add(ri + 5);
2431            let b6 = *ptr.add(ri + 6);
2432            let b7 = *ptr.add(ri + 7);
2433            *ptr.add(wp) = b0;
2434            wp += (b0 < lo || b0 > hi) as usize;
2435            *ptr.add(wp) = b1;
2436            wp += (b1 < lo || b1 > hi) as usize;
2437            *ptr.add(wp) = b2;
2438            wp += (b2 < lo || b2 > hi) as usize;
2439            *ptr.add(wp) = b3;
2440            wp += (b3 < lo || b3 > hi) as usize;
2441            *ptr.add(wp) = b4;
2442            wp += (b4 < lo || b4 > hi) as usize;
2443            *ptr.add(wp) = b5;
2444            wp += (b5 < lo || b5 > hi) as usize;
2445            *ptr.add(wp) = b6;
2446            wp += (b6 < lo || b6 > hi) as usize;
2447            *ptr.add(wp) = b7;
2448            wp += (b7 < lo || b7 > hi) as usize;
2449            ri += 8;
2450        }
2451        while ri < n {
2452            let b = *ptr.add(ri);
2453            *ptr.add(wp) = b;
2454            wp += (b < lo || b > hi) as usize;
2455            ri += 1;
2456        }
2457    }
2458    wp
2459}
2460
2461/// AVX2 in-place range delete: scan 32 bytes at a time, skip all-keep blocks,
2462/// branchless scalar compaction for mixed blocks.
2463#[cfg(target_arch = "x86_64")]
2464#[target_feature(enable = "avx2")]
2465unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2466    use std::arch::x86_64::*;
2467
2468    unsafe {
2469        let range = hi - lo;
2470        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2471        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2472        let zero = _mm256_setzero_si256();
2473
2474        let ptr = buf.as_mut_ptr();
2475        let mut ri = 0;
2476        let mut wp = 0;
2477
2478        while ri + 32 <= n {
2479            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2480            let biased = _mm256_add_epi8(input, bias_v);
2481            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2482            let in_range = _mm256_cmpeq_epi8(gt, zero);
2483            let del_mask = _mm256_movemask_epi8(in_range) as u32;
2484
2485            if del_mask == 0 {
2486                // All 32 bytes kept
2487                if wp != ri {
2488                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2489                }
2490                wp += 32;
2491            } else if del_mask != 0xFFFFFFFF {
2492                // Mixed block: pshufb-based 8-byte compaction.
2493                // Process 4 × 8-byte sub-chunks using COMPACT_LUT + pshufb.
2494                // Each sub-chunk: load 8 bytes into register (safe for overlap),
2495                // shuffle kept bytes to front, store. 4 SIMD ops vs 32 scalar.
2496                let keep_mask = !del_mask;
2497                let m0 = keep_mask as u8;
2498                let m1 = (keep_mask >> 8) as u8;
2499                let m2 = (keep_mask >> 16) as u8;
2500                let m3 = (keep_mask >> 24) as u8;
2501
2502                let c0 = m0.count_ones() as usize;
2503                let c1 = m1.count_ones() as usize;
2504                let c2 = m2.count_ones() as usize;
2505                let c3 = m3.count_ones() as usize;
2506
2507                // Sub-chunk 0: bytes 0-7
2508                if m0 == 0xFF {
2509                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2510                } else if m0 != 0 {
2511                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2512                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2513                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2514                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2515                }
2516
2517                // Sub-chunk 1: bytes 8-15
2518                if m1 == 0xFF {
2519                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2520                } else if m1 != 0 {
2521                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2522                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2523                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2524                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2525                }
2526
2527                // Sub-chunk 2: bytes 16-23
2528                if m2 == 0xFF {
2529                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2530                } else if m2 != 0 {
2531                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2532                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2533                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2534                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2535                }
2536
2537                // Sub-chunk 3: bytes 24-31
2538                if m3 == 0xFF {
2539                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2540                } else if m3 != 0 {
2541                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2542                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2543                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2544                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2545                }
2546
2547                wp += c0 + c1 + c2 + c3;
2548            }
2549            // del_mask == 0xFFFFFFFF: all deleted, skip entirely
2550            ri += 32;
2551        }
2552
2553        // Scalar tail
2554        while ri < n {
2555            let b = *ptr.add(ri);
2556            *ptr.add(wp) = b;
2557            wp += (b < lo || b > hi) as usize;
2558            ri += 1;
2559        }
2560
2561        wp
2562    }
2563}
2564
2565// ============================================================================
2566// Streaming functions (Read + Write)
2567// ============================================================================
2568
2569pub fn translate(
2570    set1: &[u8],
2571    set2: &[u8],
2572    reader: &mut impl Read,
2573    writer: &mut impl Write,
2574) -> io::Result<()> {
2575    let table = build_translate_table(set1, set2);
2576
2577    // Check for identity table — pure passthrough (no transformation needed)
2578    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2579    if is_identity {
2580        return passthrough_stream(reader, writer);
2581    }
2582
2583    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
2584    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2585        return translate_range_stream(lo, hi, offset, reader, writer);
2586    }
2587
2588    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
2589    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
2590    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2591        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2592    }
2593
2594    // General case: IN-PLACE translation on a SINGLE 16MB buffer.
2595    // This halves memory bandwidth vs the old separate src/dst approach:
2596    // - Old: read into src, translate from src→dst (read + write), write dst = 12MB bandwidth
2597    // - New: read into buf, translate in-place (read+write), write buf = 8MB bandwidth
2598    // The 8x-unrolled in-place translate avoids store-to-load forwarding stalls
2599    // because consecutive reads are 8 bytes apart (sequential), not aliased.
2600    // Using 16MB buffer = 1 read for 10MB input, minimizing syscall count.
2601    // SAFETY: all bytes are written by read_once before being translated.
2602    let mut buf = alloc_uninit_vec(STREAM_BUF);
2603    loop {
2604        let n = read_once(reader, &mut buf)?;
2605        if n == 0 {
2606            break;
2607        }
2608        if n >= PARALLEL_THRESHOLD {
2609            // Parallel in-place translate for large chunks (~4x faster on 4+ cores).
2610            let nt = rayon::current_num_threads().max(1);
2611            let cs = (n / nt).max(32 * 1024);
2612            buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2613                translate_inplace(chunk, &table);
2614            });
2615        } else {
2616            translate_inplace(&mut buf[..n], &table);
2617        }
2618        writer.write_all(&buf[..n])?;
2619    }
2620    Ok(())
2621}
2622
2623/// Streaming SIMD range translation — single buffer, in-place transform.
2624/// Uses 16MB uninit buffer for fewer syscalls (translate is compute-light).
2625/// For chunks >= PARALLEL_THRESHOLD, uses rayon par_chunks_mut for multi-core.
2626fn translate_range_stream(
2627    lo: u8,
2628    hi: u8,
2629    offset: i8,
2630    reader: &mut impl Read,
2631    writer: &mut impl Write,
2632) -> io::Result<()> {
2633    let mut buf = alloc_uninit_vec(STREAM_BUF);
2634    loop {
2635        let n = read_once(reader, &mut buf)?;
2636        if n == 0 {
2637            break;
2638        }
2639        if n >= PARALLEL_THRESHOLD {
2640            let nt = rayon::current_num_threads().max(1);
2641            let cs = (n / nt).max(32 * 1024);
2642            buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2643                translate_range_simd_inplace(chunk, lo, hi, offset);
2644            });
2645        } else {
2646            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2647        }
2648        writer.write_all(&buf[..n])?;
2649    }
2650    Ok(())
2651}
2652
2653/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
2654/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
2655/// For chunks >= PARALLEL_THRESHOLD, uses rayon par_chunks_mut for multi-core.
2656fn translate_range_to_constant_stream(
2657    lo: u8,
2658    hi: u8,
2659    replacement: u8,
2660    reader: &mut impl Read,
2661    writer: &mut impl Write,
2662) -> io::Result<()> {
2663    let mut buf = alloc_uninit_vec(STREAM_BUF);
2664    loop {
2665        let n = read_once(reader, &mut buf)?;
2666        if n == 0 {
2667            break;
2668        }
2669        if n >= PARALLEL_THRESHOLD {
2670            let nt = rayon::current_num_threads().max(1);
2671            let cs = (n / nt).max(32 * 1024);
2672            buf[..n].par_chunks_mut(cs).for_each(|chunk| {
2673                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
2674            });
2675        } else {
2676            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2677        }
2678        writer.write_all(&buf[..n])?;
2679    }
2680    Ok(())
2681}
2682
2683/// Pure passthrough: copy stdin to stdout without transformation.
2684/// Uses a single 16MB uninit buffer with direct read/write, no processing overhead.
2685fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2686    let mut buf = alloc_uninit_vec(STREAM_BUF);
2687    loop {
2688        let n = read_once(reader, &mut buf)?;
2689        if n == 0 {
2690            break;
2691        }
2692        writer.write_all(&buf[..n])?;
2693    }
2694    Ok(())
2695}
2696
2697/// Single-read for pipelining: process data immediately after first read()
2698/// instead of blocking to fill the entire buffer. This enables cat|ftr
2699/// pipelining: while ftr processes the first chunk, cat continues writing
2700/// to the pipe. For 10MB piped input with 8MB pipe buffer, this saves
2701/// ~0.5-1ms by overlapping cat's final writes with ftr's processing.
2702#[inline]
2703fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2704    loop {
2705        match reader.read(buf) {
2706            Ok(n) => return Ok(n),
2707            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2708            Err(e) => return Err(e),
2709        }
2710    }
2711}
2712
2713pub fn translate_squeeze(
2714    set1: &[u8],
2715    set2: &[u8],
2716    reader: &mut impl Read,
2717    writer: &mut impl Write,
2718) -> io::Result<()> {
2719    let table = build_translate_table(set1, set2);
2720    let squeeze_set = build_member_set(set2);
2721
2722    // For single-char squeeze set with range-to-constant translation, use
2723    // fused approach: translate via SIMD, then use memmem to find squeeze points.
2724    if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2725        let squeeze_ch = set2.last().copied().unwrap_or(0);
2726        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2727    }
2728
2729    // Two-pass optimization for range translations:
2730    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
2731    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
2732    let range_info = detect_range_offset(&table);
2733    let range_const_info = if range_info.is_none() {
2734        detect_range_to_constant(&table)
2735    } else {
2736        None
2737    };
2738
2739    let mut buf = alloc_uninit_vec(STREAM_BUF);
2740    let mut last_squeezed: u16 = 256;
2741
2742    loop {
2743        let n = read_once(reader, &mut buf)?;
2744        if n == 0 {
2745            break;
2746        }
2747        // Pass 1: translate
2748        if let Some((lo, hi, offset)) = range_info {
2749            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2750        } else if let Some((lo, hi, replacement)) = range_const_info {
2751            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2752        } else {
2753            translate_inplace(&mut buf[..n], &table);
2754        }
2755        // Pass 2: squeeze in-place using 8x-unrolled loop
2756        let mut wp = 0;
2757        unsafe {
2758            let ptr = buf.as_mut_ptr();
2759            let mut i = 0;
2760            while i + 8 <= n {
2761                macro_rules! squeeze_byte {
2762                    ($off:expr) => {
2763                        let b = *ptr.add(i + $off);
2764                        if is_member(&squeeze_set, b) {
2765                            if last_squeezed != b as u16 {
2766                                last_squeezed = b as u16;
2767                                *ptr.add(wp) = b;
2768                                wp += 1;
2769                            }
2770                        } else {
2771                            last_squeezed = 256;
2772                            *ptr.add(wp) = b;
2773                            wp += 1;
2774                        }
2775                    };
2776                }
2777                squeeze_byte!(0);
2778                squeeze_byte!(1);
2779                squeeze_byte!(2);
2780                squeeze_byte!(3);
2781                squeeze_byte!(4);
2782                squeeze_byte!(5);
2783                squeeze_byte!(6);
2784                squeeze_byte!(7);
2785                i += 8;
2786            }
2787            while i < n {
2788                let b = *ptr.add(i);
2789                if is_member(&squeeze_set, b) {
2790                    if last_squeezed == b as u16 {
2791                        i += 1;
2792                        continue;
2793                    }
2794                    last_squeezed = b as u16;
2795                } else {
2796                    last_squeezed = 256;
2797                }
2798                *ptr.add(wp) = b;
2799                wp += 1;
2800                i += 1;
2801            }
2802        }
2803        writer.write_all(&buf[..wp])?;
2804    }
2805    Ok(())
2806}
2807
2808/// Optimized translate+squeeze for single squeeze character.
2809/// After SIMD translation, uses memmem to find consecutive pairs
2810/// and compacts in-place with a single write_all per chunk.
2811fn translate_squeeze_single_ch(
2812    table: &[u8; 256],
2813    squeeze_ch: u8,
2814    _squeeze_set: &[u8; 32],
2815    reader: &mut impl Read,
2816    writer: &mut impl Write,
2817) -> io::Result<()> {
2818    let range_info = detect_range_offset(table);
2819    let range_const_info = if range_info.is_none() {
2820        detect_range_to_constant(table)
2821    } else {
2822        None
2823    };
2824
2825    let pair = [squeeze_ch, squeeze_ch];
2826    let finder = memchr::memmem::Finder::new(&pair);
2827    let mut buf = alloc_uninit_vec(STREAM_BUF);
2828    let mut was_squeeze_char = false;
2829
2830    loop {
2831        let n = read_once(reader, &mut buf)?;
2832        if n == 0 {
2833            break;
2834        }
2835        // Pass 1: SIMD translate in-place
2836        if let Some((lo, hi, offset)) = range_info {
2837            translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2838        } else if let Some((lo, hi, replacement)) = range_const_info {
2839            translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2840        } else {
2841            translate_inplace(&mut buf[..n], table);
2842        }
2843
2844        // Pass 2: in-place squeeze compaction
2845        let mut i = 0;
2846
2847        // Handle carry-over from previous chunk
2848        if was_squeeze_char {
2849            while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2850                i += 1;
2851            }
2852            if i >= n {
2853                continue;
2854            }
2855        }
2856
2857        let ptr = buf.as_mut_ptr();
2858        let mut wp = 0usize;
2859
2860        loop {
2861            match finder.find(&buf[i..n]) {
2862                Some(offset) => {
2863                    let seg_end = i + offset + 1;
2864                    let gap = seg_end - i;
2865                    if gap > 0 {
2866                        if wp != i {
2867                            unsafe {
2868                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2869                            }
2870                        }
2871                        wp += gap;
2872                    }
2873                    i = seg_end;
2874                    while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2875                        i += 1;
2876                    }
2877                    if i >= n {
2878                        was_squeeze_char = true;
2879                        break;
2880                    }
2881                }
2882                None => {
2883                    let rem = n - i;
2884                    if rem > 0 {
2885                        if wp != i {
2886                            unsafe {
2887                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2888                            }
2889                        }
2890                        wp += rem;
2891                    }
2892                    was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2893                    break;
2894                }
2895            }
2896        }
2897
2898        if wp > 0 {
2899            writer.write_all(&buf[..wp])?;
2900        }
2901    }
2902    Ok(())
2903}
2904
2905pub fn delete(
2906    delete_chars: &[u8],
2907    reader: &mut impl Read,
2908    writer: &mut impl Write,
2909) -> io::Result<()> {
2910    if delete_chars.len() == 1 {
2911        return delete_single_streaming(delete_chars[0], reader, writer);
2912    }
2913    if delete_chars.len() <= 3 {
2914        return delete_multi_streaming(delete_chars, reader, writer);
2915    }
2916
2917    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
2918    // use vectorized range comparison instead of scalar bitset lookup.
2919    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
2920    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2921        return delete_range_streaming(lo, hi, reader, writer);
2922    }
2923
2924    let member = build_member_set(delete_chars);
2925    let mut buf = alloc_uninit_vec(STREAM_BUF);
2926    // Separate output buffer for SIMD compaction — keeps source data intact
2927    // while compact_8bytes_simd writes to a different location.
2928    let mut outbuf = alloc_uninit_vec(STREAM_BUF);
2929
2930    loop {
2931        let n = read_once(reader, &mut buf)?;
2932        if n == 0 {
2933            break;
2934        }
2935        #[cfg(target_arch = "x86_64")]
2936        let wp = if get_simd_level() >= 3 {
2937            unsafe { delete_bitset_avx2_stream(&buf[..n], &mut outbuf, &member) }
2938        } else {
2939            delete_bitset_scalar(&buf[..n], &mut outbuf, &member)
2940        };
2941        #[cfg(not(target_arch = "x86_64"))]
2942        let wp = delete_bitset_scalar(&buf[..n], &mut outbuf, &member);
2943        if wp > 0 {
2944            writer.write_all(&outbuf[..wp])?;
2945        }
2946    }
2947    Ok(())
2948}
2949
2950/// Scalar bitset delete: write kept bytes to output buffer.
2951#[inline]
2952fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
2953    let n = src.len();
2954    let mut wp = 0;
2955    unsafe {
2956        let sp = src.as_ptr();
2957        let dp = dst.as_mut_ptr();
2958        let mut i = 0;
2959        while i + 8 <= n {
2960            let b0 = *sp.add(i);
2961            let b1 = *sp.add(i + 1);
2962            let b2 = *sp.add(i + 2);
2963            let b3 = *sp.add(i + 3);
2964            let b4 = *sp.add(i + 4);
2965            let b5 = *sp.add(i + 5);
2966            let b6 = *sp.add(i + 6);
2967            let b7 = *sp.add(i + 7);
2968            *dp.add(wp) = b0;
2969            wp += !is_member(member, b0) as usize;
2970            *dp.add(wp) = b1;
2971            wp += !is_member(member, b1) as usize;
2972            *dp.add(wp) = b2;
2973            wp += !is_member(member, b2) as usize;
2974            *dp.add(wp) = b3;
2975            wp += !is_member(member, b3) as usize;
2976            *dp.add(wp) = b4;
2977            wp += !is_member(member, b4) as usize;
2978            *dp.add(wp) = b5;
2979            wp += !is_member(member, b5) as usize;
2980            *dp.add(wp) = b6;
2981            wp += !is_member(member, b6) as usize;
2982            *dp.add(wp) = b7;
2983            wp += !is_member(member, b7) as usize;
2984            i += 8;
2985        }
2986        while i < n {
2987            let b = *sp.add(i);
2988            *dp.add(wp) = b;
2989            wp += !is_member(member, b) as usize;
2990            i += 1;
2991        }
2992    }
2993    wp
2994}
2995
2996/// AVX2 bitset delete for streaming: uses SIMD to check 32 bytes against the
2997/// membership bitset at once, then compact_8bytes_simd to pack kept bytes.
2998#[cfg(target_arch = "x86_64")]
2999#[target_feature(enable = "avx2")]
3000unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3001    use std::arch::x86_64::*;
3002
3003    unsafe {
3004        let n = src.len();
3005        let sp = src.as_ptr();
3006        let dp = dst.as_mut_ptr();
3007        let mut ri = 0;
3008        let mut wp = 0;
3009
3010        // Load the 256-bit membership bitset into an AVX2 register.
3011        // Byte i of member_v has bits set for characters in [i*8..i*8+7].
3012        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3013
3014        // For each input byte B, we check: member[B >> 3] & (1 << (B & 7))
3015        // Using SIMD: extract byte index (B >> 3) and bit position (B & 7).
3016        let mask7 = _mm256_set1_epi8(7);
3017        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3018
3019        // Lookup table for (1 << (x & 7)) — pshufb gives per-byte shift
3020        // that _mm256_sllv_epi32 can't do (it works on 32-bit lanes).
3021        let bit_table = _mm256_setr_epi8(
3022            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3023            0, 0, 0, 0, 0, 0, 0, 0,
3024        );
3025
3026        while ri + 32 <= n {
3027            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3028
3029            // byte_idx = input >> 3 (which byte of the 32-byte member set)
3030            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3031            // bit_pos = input & 7 (which bit within that byte)
3032            let bit_pos = _mm256_and_si256(input, mask7);
3033            // bit_mask = 1 << bit_pos (per-byte via shuffle lookup)
3034            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3035
3036            // member_byte = shuffle member_v by byte_idx (pshufb)
3037            // But pshufb only works within 128-bit lanes. We need cross-lane.
3038            // Since member is 32 bytes and byte_idx can be 0-31, we need
3039            // a different approach. Use two pshufb + blend:
3040            // lo_half = pshufb(member[0..15], byte_idx)
3041            // hi_half = pshufb(member[16..31], byte_idx - 16)
3042            // select = byte_idx >= 16
3043            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3044            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3045            let lo_mask = _mm256_set1_epi8(0x0F);
3046            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3047            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3048            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3049            // select hi when byte_idx >= 16 (bit 4 set)
3050            let use_hi = _mm256_slli_epi16(byte_idx, 3); // shift bit 4 to bit 7
3051            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3052
3053            // Check: (member_byte & bit_mask) != 0 → byte is in delete set
3054            let test = _mm256_and_si256(member_byte, bit_mask);
3055            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3056            // keep_mask: bit set = byte should be KEPT (not in delete set)
3057            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3058
3059            if keep_mask == 0xFFFFFFFF {
3060                // All 32 bytes kept — bulk copy
3061                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3062                wp += 32;
3063            } else if keep_mask != 0 {
3064                // Partial keep — compact 8 bytes at a time
3065                let m0 = keep_mask as u8;
3066                let m1 = (keep_mask >> 8) as u8;
3067                let m2 = (keep_mask >> 16) as u8;
3068                let m3 = (keep_mask >> 24) as u8;
3069
3070                if m0 == 0xFF {
3071                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3072                } else if m0 != 0 {
3073                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3074                }
3075                let c0 = m0.count_ones() as usize;
3076
3077                if m1 == 0xFF {
3078                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3079                } else if m1 != 0 {
3080                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3081                }
3082                let c1 = m1.count_ones() as usize;
3083
3084                if m2 == 0xFF {
3085                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3086                } else if m2 != 0 {
3087                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3088                }
3089                let c2 = m2.count_ones() as usize;
3090
3091                if m3 == 0xFF {
3092                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3093                } else if m3 != 0 {
3094                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3095                }
3096                let c3 = m3.count_ones() as usize;
3097                wp += c0 + c1 + c2 + c3;
3098            }
3099            // else: all 32 bytes deleted, wp unchanged
3100            ri += 32;
3101        }
3102
3103        // Scalar tail
3104        while ri < n {
3105            let b = *sp.add(ri);
3106            *dp.add(wp) = b;
3107            wp += !is_member(member, b) as usize;
3108            ri += 1;
3109        }
3110
3111        wp
3112    }
3113}
3114
3115fn delete_single_streaming(
3116    ch: u8,
3117    reader: &mut impl Read,
3118    writer: &mut impl Write,
3119) -> io::Result<()> {
3120    // Single-buffer in-place delete: memchr finds delete positions,
3121    // gap-copy backward in the same buffer. Saves 16MB dst allocation.
3122    let mut buf = alloc_uninit_vec(STREAM_BUF);
3123    loop {
3124        let n = read_once(reader, &mut buf)?;
3125        if n == 0 {
3126            break;
3127        }
3128        let mut wp = 0;
3129        let mut i = 0;
3130        while i < n {
3131            match memchr::memchr(ch, &buf[i..n]) {
3132                Some(offset) => {
3133                    if offset > 0 {
3134                        if wp != i {
3135                            unsafe {
3136                                std::ptr::copy(
3137                                    buf.as_ptr().add(i),
3138                                    buf.as_mut_ptr().add(wp),
3139                                    offset,
3140                                );
3141                            }
3142                        }
3143                        wp += offset;
3144                    }
3145                    i += offset + 1;
3146                }
3147                None => {
3148                    let run_len = n - i;
3149                    if run_len > 0 {
3150                        if wp != i {
3151                            unsafe {
3152                                std::ptr::copy(
3153                                    buf.as_ptr().add(i),
3154                                    buf.as_mut_ptr().add(wp),
3155                                    run_len,
3156                                );
3157                            }
3158                        }
3159                        wp += run_len;
3160                    }
3161                    break;
3162                }
3163            }
3164        }
3165        if wp > 0 {
3166            writer.write_all(&buf[..wp])?;
3167        }
3168    }
3169    Ok(())
3170}
3171
3172fn delete_multi_streaming(
3173    chars: &[u8],
3174    reader: &mut impl Read,
3175    writer: &mut impl Write,
3176) -> io::Result<()> {
3177    // Single-buffer in-place delete: memchr2/memchr3 finds delete positions,
3178    // gap-copy backward in the same buffer. Saves 16MB dst allocation.
3179    let mut buf = alloc_uninit_vec(STREAM_BUF);
3180    loop {
3181        let n = read_once(reader, &mut buf)?;
3182        if n == 0 {
3183            break;
3184        }
3185        let mut wp = 0;
3186        let mut i = 0;
3187        while i < n {
3188            let found = if chars.len() == 2 {
3189                memchr::memchr2(chars[0], chars[1], &buf[i..n])
3190            } else {
3191                memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3192            };
3193            match found {
3194                Some(offset) => {
3195                    if offset > 0 {
3196                        if wp != i {
3197                            unsafe {
3198                                std::ptr::copy(
3199                                    buf.as_ptr().add(i),
3200                                    buf.as_mut_ptr().add(wp),
3201                                    offset,
3202                                );
3203                            }
3204                        }
3205                        wp += offset;
3206                    }
3207                    i += offset + 1;
3208                }
3209                None => {
3210                    let run_len = n - i;
3211                    if run_len > 0 {
3212                        if wp != i {
3213                            unsafe {
3214                                std::ptr::copy(
3215                                    buf.as_ptr().add(i),
3216                                    buf.as_mut_ptr().add(wp),
3217                                    run_len,
3218                                );
3219                            }
3220                        }
3221                        wp += run_len;
3222                    }
3223                    break;
3224                }
3225            }
3226        }
3227        if wp > 0 {
3228            writer.write_all(&buf[..wp])?;
3229        }
3230    }
3231    Ok(())
3232}
3233
3234pub fn delete_squeeze(
3235    delete_chars: &[u8],
3236    squeeze_chars: &[u8],
3237    reader: &mut impl Read,
3238    writer: &mut impl Write,
3239) -> io::Result<()> {
3240    let delete_set = build_member_set(delete_chars);
3241    let squeeze_set = build_member_set(squeeze_chars);
3242    let mut buf = alloc_uninit_vec(STREAM_BUF);
3243    let mut last_squeezed: u16 = 256;
3244
3245    loop {
3246        let n = read_once(reader, &mut buf)?;
3247        if n == 0 {
3248            break;
3249        }
3250        // Fused delete+squeeze: 8x-unrolled inner loop for better ILP.
3251        // Each byte is checked against delete set first (skip if member),
3252        // then squeeze set (deduplicate consecutive members).
3253        let mut wp = 0;
3254        unsafe {
3255            let ptr = buf.as_mut_ptr();
3256            let mut i = 0;
3257            while i + 8 <= n {
3258                macro_rules! process_byte {
3259                    ($off:expr) => {
3260                        let b = *ptr.add(i + $off);
3261                        if !is_member(&delete_set, b) {
3262                            if is_member(&squeeze_set, b) {
3263                                if last_squeezed != b as u16 {
3264                                    last_squeezed = b as u16;
3265                                    *ptr.add(wp) = b;
3266                                    wp += 1;
3267                                }
3268                            } else {
3269                                last_squeezed = 256;
3270                                *ptr.add(wp) = b;
3271                                wp += 1;
3272                            }
3273                        }
3274                    };
3275                }
3276                process_byte!(0);
3277                process_byte!(1);
3278                process_byte!(2);
3279                process_byte!(3);
3280                process_byte!(4);
3281                process_byte!(5);
3282                process_byte!(6);
3283                process_byte!(7);
3284                i += 8;
3285            }
3286            while i < n {
3287                let b = *ptr.add(i);
3288                if !is_member(&delete_set, b) {
3289                    if is_member(&squeeze_set, b) {
3290                        if last_squeezed != b as u16 {
3291                            last_squeezed = b as u16;
3292                            *ptr.add(wp) = b;
3293                            wp += 1;
3294                        }
3295                    } else {
3296                        last_squeezed = 256;
3297                        *ptr.add(wp) = b;
3298                        wp += 1;
3299                    }
3300                }
3301                i += 1;
3302            }
3303        }
3304        writer.write_all(&buf[..wp])?;
3305    }
3306    Ok(())
3307}
3308
3309pub fn squeeze(
3310    squeeze_chars: &[u8],
3311    reader: &mut impl Read,
3312    writer: &mut impl Write,
3313) -> io::Result<()> {
3314    if squeeze_chars.len() == 1 {
3315        return squeeze_single_stream(squeeze_chars[0], reader, writer);
3316    }
3317
3318    // For 2-3 squeeze chars, use memchr2/memchr3-based gap-copy
3319    // which gives SIMD-accelerated scanning instead of byte-at-a-time.
3320    if squeeze_chars.len() <= 3 {
3321        return squeeze_multi_stream(squeeze_chars, reader, writer);
3322    }
3323
3324    let member = build_member_set(squeeze_chars);
3325    let mut buf = alloc_uninit_vec(STREAM_BUF);
3326    let mut last_squeezed: u16 = 256;
3327
3328    loop {
3329        let n = read_once(reader, &mut buf)?;
3330        if n == 0 {
3331            break;
3332        }
3333        let mut wp = 0;
3334        unsafe {
3335            let ptr = buf.as_mut_ptr();
3336            for i in 0..n {
3337                let b = *ptr.add(i);
3338                if is_member(&member, b) {
3339                    if last_squeezed == b as u16 {
3340                        continue;
3341                    }
3342                    last_squeezed = b as u16;
3343                } else {
3344                    last_squeezed = 256;
3345                }
3346                *ptr.add(wp) = b;
3347                wp += 1;
3348            }
3349        }
3350        writer.write_all(&buf[..wp])?;
3351    }
3352    Ok(())
3353}
3354
3355/// Streaming squeeze for 2-3 chars using memchr2/memchr3 SIMD scanning.
3356/// Builds writev IoSlice entries pointing into the read buffer, skipping
3357/// duplicate runs of squeezable characters. Zero-copy between squeeze points.
3358fn squeeze_multi_stream(
3359    chars: &[u8],
3360    reader: &mut impl Read,
3361    writer: &mut impl Write,
3362) -> io::Result<()> {
3363    let c0 = chars[0];
3364    let c1 = chars[1];
3365    let c2 = if chars.len() >= 3 {
3366        Some(chars[2])
3367    } else {
3368        None
3369    };
3370    let single_byte = [0u8; 1]; // used for the kept single byte
3371    let _ = single_byte;
3372
3373    let mut buf = alloc_uninit_vec(STREAM_BUF);
3374    let mut last_squeezed: u16 = 256;
3375
3376    loop {
3377        let n = read_once(reader, &mut buf)?;
3378        if n == 0 {
3379            break;
3380        }
3381
3382        // In-place compaction using memchr2/memchr3 gap-copy.
3383        // For each squeezable char found, copy the gap before it,
3384        // then emit one byte (if not a squeeze duplicate) and skip the run.
3385        let ptr = buf.as_mut_ptr();
3386        let mut wp = 0usize;
3387        let mut cursor = 0usize;
3388
3389        macro_rules! find_next {
3390            ($start:expr) => {
3391                if let Some(c) = c2 {
3392                    memchr::memchr3(c0, c1, c, &buf[$start..n])
3393                } else {
3394                    memchr::memchr2(c0, c1, &buf[$start..n])
3395                }
3396            };
3397        }
3398
3399        while cursor < n {
3400            match find_next!(cursor) {
3401                Some(offset) => {
3402                    let pos = cursor + offset;
3403                    let b = unsafe { *ptr.add(pos) };
3404
3405                    // Copy gap before squeeze point
3406                    let gap = pos - cursor;
3407                    if gap > 0 {
3408                        if wp != cursor {
3409                            unsafe {
3410                                std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3411                            }
3412                        }
3413                        wp += gap;
3414                        last_squeezed = 256;
3415                    }
3416
3417                    // Emit single byte if not duplicate
3418                    if last_squeezed != b as u16 {
3419                        unsafe { *ptr.add(wp) = b };
3420                        wp += 1;
3421                        last_squeezed = b as u16;
3422                    }
3423
3424                    // Skip the run of same byte
3425                    cursor = pos + 1;
3426                    while cursor < n && unsafe { *ptr.add(cursor) } == b {
3427                        cursor += 1;
3428                    }
3429                }
3430                None => {
3431                    // No more squeeze chars — copy remainder
3432                    let rem = n - cursor;
3433                    if rem > 0 {
3434                        if wp != cursor {
3435                            unsafe {
3436                                std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3437                            }
3438                        }
3439                        wp += rem;
3440                        last_squeezed = 256;
3441                    }
3442                    break;
3443                }
3444            }
3445        }
3446
3447        writer.write_all(&buf[..wp])?;
3448    }
3449    Ok(())
3450}
3451
3452fn squeeze_single_stream(
3453    ch: u8,
3454    reader: &mut impl Read,
3455    writer: &mut impl Write,
3456) -> io::Result<()> {
3457    // In-place compaction: memmem finds consecutive pairs, then gap-copy
3458    // in the same buffer to remove duplicates. Single write_all per chunk
3459    // eliminates writev overhead (saves ~5-10 syscalls for 10MB input).
3460    let pair = [ch, ch];
3461    let finder = memchr::memmem::Finder::new(&pair);
3462    let mut buf = alloc_uninit_vec(STREAM_BUF);
3463    let mut was_squeeze_char = false;
3464
3465    loop {
3466        let n = read_once(reader, &mut buf)?;
3467        if n == 0 {
3468            break;
3469        }
3470
3471        let mut i = 0;
3472
3473        // Handle carry-over: if previous chunk ended with squeeze char,
3474        // skip leading occurrences of that char in this chunk.
3475        if was_squeeze_char {
3476            while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3477                i += 1;
3478            }
3479            if i >= n {
3480                continue;
3481            }
3482        }
3483
3484        // In-place compaction: scan for consecutive pairs and remove duplicates.
3485        let ptr = buf.as_mut_ptr();
3486        let mut wp = 0usize;
3487
3488        loop {
3489            match finder.find(&buf[i..n]) {
3490                Some(offset) => {
3491                    // Copy everything up to and including the first char of the pair
3492                    let seg_end = i + offset + 1;
3493                    let gap = seg_end - i;
3494                    if gap > 0 {
3495                        if wp != i {
3496                            unsafe {
3497                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3498                            }
3499                        }
3500                        wp += gap;
3501                    }
3502                    i = seg_end;
3503                    // Skip all remaining consecutive ch bytes (the run)
3504                    while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3505                        i += 1;
3506                    }
3507                    if i >= n {
3508                        was_squeeze_char = true;
3509                        break;
3510                    }
3511                }
3512                None => {
3513                    // No more consecutive pairs — copy remainder
3514                    let rem = n - i;
3515                    if rem > 0 {
3516                        if wp != i {
3517                            unsafe {
3518                                std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3519                            }
3520                        }
3521                        wp += rem;
3522                    }
3523                    was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3524                    break;
3525                }
3526            }
3527        }
3528
3529        if wp > 0 {
3530            writer.write_all(&buf[..wp])?;
3531        }
3532    }
3533    Ok(())
3534}
3535
3536// ============================================================================
3537// Batch in-place functions (owned data from piped stdin)
3538// ============================================================================
3539
3540/// Translate bytes in-place on an owned buffer, then write.
3541/// For piped stdin where we own the data, this avoids the separate output buffer
3542/// allocation needed by translate_mmap. Uses parallel in-place SIMD for large data.
3543pub fn translate_owned(
3544    set1: &[u8],
3545    set2: &[u8],
3546    data: &mut [u8],
3547    writer: &mut impl Write,
3548) -> io::Result<()> {
3549    let table = build_translate_table(set1, set2);
3550
3551    // Identity table — pure passthrough
3552    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3553    if is_identity {
3554        return writer.write_all(data);
3555    }
3556
3557    // SIMD range fast path (in-place)
3558    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3559        if data.len() >= PARALLEL_THRESHOLD {
3560            let n_threads = rayon::current_num_threads().max(1);
3561            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3562            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3563                translate_range_simd_inplace(chunk, lo, hi, offset);
3564            });
3565        } else {
3566            translate_range_simd_inplace(data, lo, hi, offset);
3567        }
3568        return writer.write_all(data);
3569    }
3570
3571    // SIMD range-to-constant fast path (in-place)
3572    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3573        if data.len() >= PARALLEL_THRESHOLD {
3574            let n_threads = rayon::current_num_threads().max(1);
3575            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3576            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3577                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement);
3578            });
3579        } else {
3580            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3581        }
3582        return writer.write_all(data);
3583    }
3584
3585    // General table lookup (in-place)
3586    if data.len() >= PARALLEL_THRESHOLD {
3587        let n_threads = rayon::current_num_threads().max(1);
3588        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3589        data.par_chunks_mut(chunk_size).for_each(|chunk| {
3590            translate_inplace(chunk, &table);
3591        });
3592    } else {
3593        translate_inplace(data, &table);
3594    }
3595    writer.write_all(data)
3596}
3597
3598// ============================================================================
3599// Mmap-based functions (zero-copy input from byte slice)
3600// ============================================================================
3601
3602/// Maximum data size for single-allocation translate approach.
3603/// Translate bytes from an mmap'd byte slice.
3604/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
3605/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
3606/// Falls back to scalar 256-byte table lookup for general translations.
3607///
3608/// For data >= 2MB: uses rayon parallel processing across multiple cores.
3609/// For data <= 16MB: single allocation + single write_all (1 syscall).
3610/// For data > 16MB: chunked approach to limit memory (N syscalls where N = data/4MB).
3611pub fn translate_mmap(
3612    set1: &[u8],
3613    set2: &[u8],
3614    data: &[u8],
3615    writer: &mut impl Write,
3616) -> io::Result<()> {
3617    let table = build_translate_table(set1, set2);
3618
3619    // Check if table is identity — pure passthrough
3620    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3621    if is_identity {
3622        return writer.write_all(data);
3623    }
3624
3625    // Try SIMD fast path for single-range constant-offset translations
3626    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3627        return translate_mmap_range(data, writer, lo, hi, offset);
3628    }
3629
3630    // Try SIMD fast path for range-to-constant translations
3631    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3632        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3633    }
3634
3635    // General case: table lookup (with parallel processing for large data)
3636    translate_mmap_table(data, writer, &table)
3637}
3638
3639/// SIMD range translate for mmap data, with rayon parallel processing.
3640fn translate_mmap_range(
3641    data: &[u8],
3642    writer: &mut impl Write,
3643    lo: u8,
3644    hi: u8,
3645    offset: i8,
3646) -> io::Result<()> {
3647    // Parallel path: split data into chunks, translate each in parallel
3648    if data.len() >= PARALLEL_THRESHOLD {
3649        let mut buf = alloc_uninit_vec(data.len());
3650        let n_threads = rayon::current_num_threads().max(1);
3651        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3652
3653        // Process chunks in parallel: each thread writes to its slice of buf
3654        data.par_chunks(chunk_size)
3655            .zip(buf.par_chunks_mut(chunk_size))
3656            .for_each(|(src_chunk, dst_chunk)| {
3657                translate_range_simd(src_chunk, &mut dst_chunk[..src_chunk.len()], lo, hi, offset);
3658            });
3659
3660        return writer.write_all(&buf);
3661    }
3662
3663    // Chunked SIMD translate: 256KB buffer fits in L2 cache.
3664    const CHUNK: usize = 256 * 1024;
3665    let buf_size = data.len().min(CHUNK);
3666    let mut buf = alloc_uninit_vec(buf_size);
3667    for chunk in data.chunks(CHUNK) {
3668        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3669        writer.write_all(&buf[..chunk.len()])?;
3670    }
3671    Ok(())
3672}
3673
3674/// SIMD range-to-constant translate for mmap data.
3675/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
3676fn translate_mmap_range_to_constant(
3677    data: &[u8],
3678    writer: &mut impl Write,
3679    lo: u8,
3680    hi: u8,
3681    replacement: u8,
3682) -> io::Result<()> {
3683    // For mmap data (read-only), copy to buffer and translate in-place
3684    if data.len() >= PARALLEL_THRESHOLD {
3685        let mut buf = alloc_uninit_vec(data.len());
3686        let n_threads = rayon::current_num_threads().max(1);
3687        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3688
3689        // Copy + translate in parallel
3690        data.par_chunks(chunk_size)
3691            .zip(buf.par_chunks_mut(chunk_size))
3692            .for_each(|(src_chunk, dst_chunk)| {
3693                dst_chunk[..src_chunk.len()].copy_from_slice(src_chunk);
3694                translate_range_to_constant_simd_inplace(
3695                    &mut dst_chunk[..src_chunk.len()],
3696                    lo,
3697                    hi,
3698                    replacement,
3699                );
3700            });
3701
3702        return writer.write_all(&buf);
3703    }
3704
3705    // Chunked translate: 256KB buffer fits in L2 cache.
3706    const CHUNK: usize = 256 * 1024;
3707    let buf_size = data.len().min(CHUNK);
3708    let mut buf = alloc_uninit_vec(buf_size);
3709    for chunk in data.chunks(CHUNK) {
3710        buf[..chunk.len()].copy_from_slice(chunk);
3711        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3712        writer.write_all(&buf[..chunk.len()])?;
3713    }
3714    Ok(())
3715}
3716
3717/// General table-lookup translate for mmap data, with rayon parallel processing.
3718fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3719    // Parallel path: split data into chunks, translate each in parallel
3720    if data.len() >= PARALLEL_THRESHOLD {
3721        let mut buf = alloc_uninit_vec(data.len());
3722        let n_threads = rayon::current_num_threads().max(1);
3723        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3724
3725        data.par_chunks(chunk_size)
3726            .zip(buf.par_chunks_mut(chunk_size))
3727            .for_each(|(src_chunk, dst_chunk)| {
3728                translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], table);
3729            });
3730
3731        return writer.write_all(&buf);
3732    }
3733
3734    // Chunked translate: 256KB buffer fits in L2 cache.
3735    const CHUNK: usize = 256 * 1024;
3736    let buf_size = data.len().min(CHUNK);
3737    let mut buf = alloc_uninit_vec(buf_size);
3738    for chunk in data.chunks(CHUNK) {
3739        translate_to(chunk, &mut buf[..chunk.len()], table);
3740        writer.write_all(&buf[..chunk.len()])?;
3741    }
3742    Ok(())
3743}
3744
3745/// Translate bytes in-place on a mutable buffer (e.g., MAP_PRIVATE mmap).
3746/// Eliminates the output buffer allocation entirely — the kernel's COW
3747/// semantics mean only modified pages are physically copied.
3748///
3749/// For data >= PARALLEL_THRESHOLD: rayon parallel in-place translate.
3750/// Otherwise: single-threaded in-place translate.
3751pub fn translate_mmap_inplace(
3752    set1: &[u8],
3753    set2: &[u8],
3754    data: &mut [u8],
3755    writer: &mut impl Write,
3756) -> io::Result<()> {
3757    let table = build_translate_table(set1, set2);
3758
3759    // Check if table is identity — pure passthrough
3760    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3761    if is_identity {
3762        return writer.write_all(data);
3763    }
3764
3765    // For data that's being translated in a MAP_PRIVATE mmap, every modified page
3766    // triggers a COW fault. For small-to-medium files where most bytes change,
3767    // reading from mmap (read-only) + writing to a separate heap buffer is faster
3768    // because it avoids COW faults entirely. The output buffer is fresh memory
3769    // (no COW), and the input mmap stays read-only (MADV_SEQUENTIAL).
3770    // Threshold: 64MB. For benchmark-sized files (10MB), avoid COW entirely.
3771    const SEPARATE_BUF_THRESHOLD: usize = 64 * 1024 * 1024;
3772
3773    if data.len() < SEPARATE_BUF_THRESHOLD {
3774        return translate_to_separate_buf(data, &table, writer);
3775    }
3776
3777    // Try SIMD fast path for single-range constant-offset translations (e.g., a-z -> A-Z)
3778    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3779        if data.len() >= PARALLEL_THRESHOLD {
3780            let n_threads = rayon::current_num_threads().max(1);
3781            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3782            data.par_chunks_mut(chunk_size)
3783                .for_each(|chunk| translate_range_simd_inplace(chunk, lo, hi, offset));
3784        } else {
3785            translate_range_simd_inplace(data, lo, hi, offset);
3786        }
3787        return writer.write_all(data);
3788    }
3789
3790    // Try SIMD fast path for range-to-constant translations
3791    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3792        if data.len() >= PARALLEL_THRESHOLD {
3793            let n_threads = rayon::current_num_threads().max(1);
3794            let chunk_size = (data.len() / n_threads).max(32 * 1024);
3795            data.par_chunks_mut(chunk_size).for_each(|chunk| {
3796                translate_range_to_constant_simd_inplace(chunk, lo, hi, replacement)
3797            });
3798        } else {
3799            translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3800        }
3801        return writer.write_all(data);
3802    }
3803
3804    // General case: in-place table lookup
3805    if data.len() >= PARALLEL_THRESHOLD {
3806        let n_threads = rayon::current_num_threads().max(1);
3807        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3808        data.par_chunks_mut(chunk_size)
3809            .for_each(|chunk| translate_inplace(chunk, &table));
3810    } else {
3811        translate_inplace(data, &table);
3812    }
3813    writer.write_all(data)
3814}
3815
3816/// Translate from read-only source to a separate output buffer, avoiding COW faults.
3817/// Uses the appropriate SIMD path (range offset, range-to-constant, or general nibble).
3818///
3819/// For data >= PARALLEL_THRESHOLD: parallel chunked translate into full-size buffer.
3820/// For smaller data: single full-size allocation + single write_all for minimum
3821/// syscall overhead. At 10MB, the allocation is cheap and a single write() is faster
3822/// than multiple 4MB chunked writes.
3823fn translate_to_separate_buf(
3824    data: &[u8],
3825    table: &[u8; 256],
3826    writer: &mut impl Write,
3827) -> io::Result<()> {
3828    let range_info = detect_range_offset(table);
3829    let const_info = if range_info.is_none() {
3830        detect_range_to_constant(table)
3831    } else {
3832        None
3833    };
3834
3835    if data.len() >= PARALLEL_THRESHOLD {
3836        // Parallel path: full-size output buffer, parallel translate, single write.
3837        let mut out_buf = alloc_uninit_vec(data.len());
3838        let n_threads = rayon::current_num_threads().max(1);
3839        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3840
3841        if let Some((lo, hi, offset)) = range_info {
3842            data.par_chunks(chunk_size)
3843                .zip(out_buf.par_chunks_mut(chunk_size))
3844                .for_each(|(src, dst)| {
3845                    translate_range_simd(src, &mut dst[..src.len()], lo, hi, offset);
3846                });
3847        } else if let Some((lo, hi, replacement)) = const_info {
3848            data.par_chunks(chunk_size)
3849                .zip(out_buf.par_chunks_mut(chunk_size))
3850                .for_each(|(src, dst)| {
3851                    translate_range_to_constant_simd(
3852                        src,
3853                        &mut dst[..src.len()],
3854                        lo,
3855                        hi,
3856                        replacement,
3857                    );
3858                });
3859        } else {
3860            data.par_chunks(chunk_size)
3861                .zip(out_buf.par_chunks_mut(chunk_size))
3862                .for_each(|(src, dst)| {
3863                    translate_to(src, &mut dst[..src.len()], table);
3864                });
3865        }
3866        return writer.write_all(&out_buf);
3867    }
3868
3869    // Single-allocation translate: full-size output buffer, single translate, single write.
3870    // For 10MB data, this does 1 write() instead of 40 chunked writes, eliminating
3871    // 39 write() syscalls. SIMD translate streams through src and dst sequentially,
3872    // so the L2 cache argument for 256KB chunks doesn't apply (src data doesn't fit
3873    // in L2 anyway). The reduced syscall overhead more than compensates.
3874    let mut out_buf = alloc_uninit_vec(data.len());
3875    if let Some((lo, hi, offset)) = range_info {
3876        translate_range_simd(data, &mut out_buf, lo, hi, offset);
3877    } else if let Some((lo, hi, replacement)) = const_info {
3878        translate_range_to_constant_simd(data, &mut out_buf, lo, hi, replacement);
3879    } else {
3880        translate_to(data, &mut out_buf, table);
3881    }
3882    writer.write_all(&out_buf)
3883}
3884
3885/// Translate from a read-only mmap (or any byte slice) to a separate output buffer.
3886/// Avoids MAP_PRIVATE COW page faults by reading from the original data and
3887/// writing to a freshly allocated heap buffer.
3888pub fn translate_mmap_readonly(
3889    set1: &[u8],
3890    set2: &[u8],
3891    data: &[u8],
3892    writer: &mut impl Write,
3893) -> io::Result<()> {
3894    let table = build_translate_table(set1, set2);
3895
3896    // Check if table is identity — pure passthrough
3897    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3898    if is_identity {
3899        return writer.write_all(data);
3900    }
3901
3902    translate_to_separate_buf(data, &table, writer)
3903}
3904
3905/// Translate + squeeze from mmap'd byte slice.
3906///
3907/// For data >= 2MB: two-phase approach: parallel translate, then sequential squeeze.
3908/// For data <= 16MB: single-pass translate+squeeze into one buffer, one write syscall.
3909/// For data > 16MB: chunked approach to limit memory.
3910pub fn translate_squeeze_mmap(
3911    set1: &[u8],
3912    set2: &[u8],
3913    data: &[u8],
3914    writer: &mut impl Write,
3915) -> io::Result<()> {
3916    let table = build_translate_table(set1, set2);
3917    let squeeze_set = build_member_set(set2);
3918
3919    // For large data: two-phase approach
3920    // Phase 1: parallel translate into buffer
3921    // Phase 2: sequential squeeze IN-PLACE on the translated buffer
3922    //          (squeeze only removes bytes, never grows, so no second allocation needed)
3923    if data.len() >= PARALLEL_THRESHOLD {
3924        // Phase 1: parallel translate
3925        let mut translated = alloc_uninit_vec(data.len());
3926        let range_info = detect_range_offset(&table);
3927        let n_threads = rayon::current_num_threads().max(1);
3928        let chunk_size = (data.len() / n_threads).max(32 * 1024);
3929
3930        if let Some((lo, hi, offset)) = range_info {
3931            data.par_chunks(chunk_size)
3932                .zip(translated.par_chunks_mut(chunk_size))
3933                .for_each(|(src_chunk, dst_chunk)| {
3934                    translate_range_simd(
3935                        src_chunk,
3936                        &mut dst_chunk[..src_chunk.len()],
3937                        lo,
3938                        hi,
3939                        offset,
3940                    );
3941                });
3942        } else {
3943            data.par_chunks(chunk_size)
3944                .zip(translated.par_chunks_mut(chunk_size))
3945                .for_each(|(src_chunk, dst_chunk)| {
3946                    translate_to(src_chunk, &mut dst_chunk[..src_chunk.len()], &table);
3947                });
3948        }
3949
3950        // Phase 2: squeeze in-place on the translated buffer.
3951        // Since squeeze only removes bytes (never grows), we can read ahead and
3952        // compact into the same buffer, saving a full data.len() heap allocation.
3953        let mut last_squeezed: u16 = 256;
3954        let len = translated.len();
3955        let mut wp = 0;
3956        unsafe {
3957            let ptr = translated.as_mut_ptr();
3958            let mut i = 0;
3959            while i < len {
3960                let b = *ptr.add(i);
3961                if is_member(&squeeze_set, b) {
3962                    if last_squeezed == b as u16 {
3963                        i += 1;
3964                        continue;
3965                    }
3966                    last_squeezed = b as u16;
3967                } else {
3968                    last_squeezed = 256;
3969                }
3970                *ptr.add(wp) = b;
3971                wp += 1;
3972                i += 1;
3973            }
3974        }
3975        return writer.write_all(&translated[..wp]);
3976    }
3977
3978    // Single-allocation translate+squeeze: full-size buffer, single write_all.
3979    // For 10MB data, this does 1 write() instead of ~40 chunked writes.
3980    let mut buf = alloc_uninit_vec(data.len());
3981    translate_to(data, &mut buf, &table);
3982    let mut last_squeezed: u16 = 256;
3983    let mut wp = 0;
3984    unsafe {
3985        let ptr = buf.as_mut_ptr();
3986        for i in 0..data.len() {
3987            let b = *ptr.add(i);
3988            if is_member(&squeeze_set, b) {
3989                if last_squeezed == b as u16 {
3990                    continue;
3991                }
3992                last_squeezed = b as u16;
3993            } else {
3994                last_squeezed = 256;
3995            }
3996            *ptr.add(wp) = b;
3997            wp += 1;
3998        }
3999    }
4000    writer.write_all(&buf[..wp])
4001}
4002
4003/// Delete from mmap'd byte slice.
4004///
4005/// For data >= 2MB: uses rayon parallel processing across multiple cores.
4006/// For data <= 16MB: delete into one buffer, one write syscall.
4007/// For data > 16MB: chunked approach to limit memory.
4008pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4009    if delete_chars.len() == 1 {
4010        return delete_single_char_mmap(delete_chars[0], data, writer);
4011    }
4012    if delete_chars.len() <= 3 {
4013        return delete_multi_memchr_mmap(delete_chars, data, writer);
4014    }
4015
4016    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
4017    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4018        return delete_range_mmap(data, writer, lo, hi);
4019    }
4020
4021    let member = build_member_set(delete_chars);
4022
4023    // Heuristic: estimate total delete positions. Zero-copy writev is only efficient
4024    // when all gaps fit in a single writev call (< MAX_IOV/2 entries). With uniform
4025    // distribution, each delete creates an IoSlice entry. For many deletes (> 512),
4026    // multiple writev calls are needed, and the compact approach is faster.
4027    let sample_size = data.len().min(1024);
4028    let sample_deletes = data[..sample_size]
4029        .iter()
4030        .filter(|&&b| is_member(&member, b))
4031        .count();
4032    let estimated_deletes = if sample_size > 0 {
4033        data.len() * sample_deletes / sample_size
4034    } else {
4035        data.len()
4036    };
4037
4038    if estimated_deletes < MAX_IOV / 2 {
4039        return delete_bitset_zerocopy(data, &member, writer);
4040    }
4041
4042    // Dense delete: parallel compact with writev (avoids scatter-gather copy)
4043    if data.len() >= PARALLEL_THRESHOLD {
4044        let n_threads = rayon::current_num_threads().max(1);
4045        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4046
4047        let mut outbuf = alloc_uninit_vec(data.len());
4048        let chunk_lens: Vec<usize> = data
4049            .par_chunks(chunk_size)
4050            .zip(outbuf.par_chunks_mut(chunk_size))
4051            .map(|(src_chunk, dst_chunk)| delete_chunk_bitset_into(src_chunk, &member, dst_chunk))
4052            .collect();
4053
4054        // Use writev to write each chunk at its original position, avoiding
4055        // the O(N) scatter-gather memmove. With ~4 threads, that's 4 IoSlice
4056        // entries — far below MAX_IOV.
4057        let slices: Vec<std::io::IoSlice> = chunk_lens
4058            .iter()
4059            .enumerate()
4060            .filter(|&(_, &len)| len > 0)
4061            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4062            .collect();
4063        return write_ioslices(writer, &slices);
4064    }
4065
4066    // Streaming compact: 256KB output buffer reduces page fault overhead.
4067    // For 10MB data: ~64 page faults instead of ~2500, with ~40 write_all calls.
4068    const COMPACT_BUF: usize = 256 * 1024;
4069    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4070
4071    for chunk in data.chunks(COMPACT_BUF) {
4072        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4073        if out_pos > 0 {
4074            writer.write_all(&outbuf[..out_pos])?;
4075        }
4076    }
4077    Ok(())
4078}
4079
4080/// SIMD range delete for mmap data.
4081/// Uses a density heuristic: for sparse deletes (< 15%), uses zero-copy writev
4082/// directly from mmap data (no output buffer allocation). For dense deletes,
4083/// uses SIMD compact into a pre-allocated buffer.
4084fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4085    // Sample first 1024 bytes to estimate delete density
4086    let sample_size = data.len().min(1024);
4087    let sample_deletes = data[..sample_size]
4088        .iter()
4089        .filter(|&&b| b >= lo && b <= hi)
4090        .count();
4091    // Estimate expected number of delete positions (IoSlice entries for zero-copy).
4092    // Each delete creates an IoSlice entry. With MAX_IOV=1024 per writev,
4093    // if estimated_deletes > MAX_IOV/2, the writev overhead from multiple syscalls
4094    // exceeds the compact approach cost. Only use zero-copy when all gaps fit in
4095    // a single writev call.
4096    let estimated_deletes = if sample_size > 0 {
4097        data.len() * sample_deletes / sample_size
4098    } else {
4099        data.len()
4100    };
4101    if estimated_deletes < MAX_IOV / 2 {
4102        return delete_range_mmap_zerocopy(data, writer, lo, hi);
4103    }
4104
4105    // Dense deletes: parallel compact with writev (avoids scatter-gather copy)
4106    if data.len() >= PARALLEL_THRESHOLD {
4107        let n_threads = rayon::current_num_threads().max(1);
4108        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4109
4110        let mut outbuf = alloc_uninit_vec(data.len());
4111        let chunk_lens: Vec<usize> = data
4112            .par_chunks(chunk_size)
4113            .zip(outbuf.par_chunks_mut(chunk_size))
4114            .map(|(src_chunk, dst_chunk)| delete_range_chunk(src_chunk, dst_chunk, lo, hi))
4115            .collect();
4116
4117        // Use writev to write each chunk at its original position, avoiding
4118        // the O(N) scatter-gather memmove.
4119        let slices: Vec<std::io::IoSlice> = chunk_lens
4120            .iter()
4121            .enumerate()
4122            .filter(|&(_, &len)| len > 0)
4123            .map(|(i, &len)| std::io::IoSlice::new(&outbuf[i * chunk_size..i * chunk_size + len]))
4124            .collect();
4125        return write_ioslices(writer, &slices);
4126    }
4127
4128    // Streaming compact: use 256KB output buffer instead of full data.len() buffer.
4129    // This reduces page fault overhead from ~2500 faults (10MB) to ~64 faults (256KB).
4130    // The extra write_all calls (~40 for 10MB) are negligible cost.
4131    const COMPACT_BUF: usize = 256 * 1024;
4132    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4133
4134    #[cfg(target_arch = "x86_64")]
4135    {
4136        let mut wp = 0;
4137        let level = get_simd_level();
4138        let len = data.len();
4139        let sp = data.as_ptr();
4140        let dp = outbuf.as_mut_ptr();
4141        let mut ri = 0;
4142
4143        if level >= 3 {
4144            use std::arch::x86_64::*;
4145            let range = hi - lo;
4146            let bias_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8) };
4147            let threshold_v = unsafe { _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8) };
4148            let zero = unsafe { _mm256_setzero_si256() };
4149
4150            while ri + 32 <= len {
4151                // Flush when output buffer is nearly full
4152                if wp + 32 > COMPACT_BUF {
4153                    writer.write_all(&outbuf[..wp])?;
4154                    wp = 0;
4155                }
4156
4157                let input = unsafe { _mm256_loadu_si256(sp.add(ri) as *const _) };
4158                let biased = unsafe { _mm256_add_epi8(input, bias_v) };
4159                let gt = unsafe { _mm256_cmpgt_epi8(biased, threshold_v) };
4160                let in_range = unsafe { _mm256_cmpeq_epi8(gt, zero) };
4161                let keep_mask = !(unsafe { _mm256_movemask_epi8(in_range) } as u32);
4162
4163                if keep_mask == 0xFFFFFFFF {
4164                    unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32) };
4165                    wp += 32;
4166                } else if keep_mask != 0 {
4167                    let m0 = keep_mask as u8;
4168                    let m1 = (keep_mask >> 8) as u8;
4169                    let m2 = (keep_mask >> 16) as u8;
4170                    let m3 = (keep_mask >> 24) as u8;
4171
4172                    if m0 == 0xFF {
4173                        unsafe { std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8) };
4174                    } else if m0 != 0 {
4175                        unsafe { compact_8bytes_simd(sp.add(ri), dp.add(wp), m0) };
4176                    }
4177                    let c0 = m0.count_ones() as usize;
4178
4179                    if m1 == 0xFF {
4180                        unsafe {
4181                            std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8)
4182                        };
4183                    } else if m1 != 0 {
4184                        unsafe { compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1) };
4185                    }
4186                    let c1 = m1.count_ones() as usize;
4187
4188                    if m2 == 0xFF {
4189                        unsafe {
4190                            std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8)
4191                        };
4192                    } else if m2 != 0 {
4193                        unsafe { compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2) };
4194                    }
4195                    let c2 = m2.count_ones() as usize;
4196
4197                    if m3 == 0xFF {
4198                        unsafe {
4199                            std::ptr::copy_nonoverlapping(
4200                                sp.add(ri + 24),
4201                                dp.add(wp + c0 + c1 + c2),
4202                                8,
4203                            )
4204                        };
4205                    } else if m3 != 0 {
4206                        unsafe {
4207                            compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3)
4208                        };
4209                    }
4210                    let c3 = m3.count_ones() as usize;
4211                    wp += c0 + c1 + c2 + c3;
4212                }
4213                ri += 32;
4214            }
4215        }
4216
4217        // Scalar tail
4218        while ri < len {
4219            if wp + 1 > COMPACT_BUF {
4220                writer.write_all(&outbuf[..wp])?;
4221                wp = 0;
4222            }
4223            let b = unsafe { *sp.add(ri) };
4224            unsafe { *dp.add(wp) = b };
4225            wp += (b < lo || b > hi) as usize;
4226            ri += 1;
4227        }
4228
4229        if wp > 0 {
4230            writer.write_all(&outbuf[..wp])?;
4231        }
4232        return Ok(());
4233    }
4234
4235    #[cfg(not(target_arch = "x86_64"))]
4236    {
4237        // Non-x86 fallback: chunk the source and process with delete_range_chunk
4238        for chunk in data.chunks(COMPACT_BUF) {
4239            let clen = delete_range_chunk(chunk, &mut outbuf, lo, hi);
4240            if clen > 0 {
4241                writer.write_all(&outbuf[..clen])?;
4242            }
4243        }
4244        return Ok(());
4245    }
4246
4247    #[allow(unreachable_code)]
4248    Ok(())
4249}
4250
4251/// Zero-copy range delete for mmap data: SIMD-scans for bytes in [lo..=hi],
4252/// builds IoSlice entries pointing to the gaps between deleted ranges in the
4253/// original mmap data, and writes using writev. No output buffer allocation.
4254/// For 10MB text with 4% digits: ~1.5ms vs ~4ms for the compact approach.
4255fn delete_range_mmap_zerocopy(
4256    data: &[u8],
4257    writer: &mut impl Write,
4258    lo: u8,
4259    hi: u8,
4260) -> io::Result<()> {
4261    #[cfg(target_arch = "x86_64")]
4262    {
4263        if get_simd_level() >= 3 {
4264            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4265        }
4266        if get_simd_level() >= 2 {
4267            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4268        }
4269    }
4270
4271    #[cfg(target_arch = "aarch64")]
4272    {
4273        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4274    }
4275
4276    // Scalar fallback: byte-by-byte scan with IoSlice batching
4277    #[allow(unreachable_code)]
4278    delete_range_zerocopy_scalar(data, writer, lo, hi)
4279}
4280
4281/// Scalar zero-copy range delete: byte-by-byte scan with IoSlice batching.
4282/// Used as fallback when SIMD is unavailable.
4283fn delete_range_zerocopy_scalar(
4284    data: &[u8],
4285    writer: &mut impl Write,
4286    lo: u8,
4287    hi: u8,
4288) -> io::Result<()> {
4289    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4290    let len = data.len();
4291    let mut run_start: usize = 0;
4292    let mut i: usize = 0;
4293
4294    while i < len {
4295        let b = unsafe { *data.get_unchecked(i) };
4296        if b >= lo && b <= hi {
4297            if i > run_start {
4298                iov.push(std::io::IoSlice::new(&data[run_start..i]));
4299                if iov.len() >= MAX_IOV {
4300                    write_ioslices(writer, &iov)?;
4301                    iov.clear();
4302                }
4303            }
4304            run_start = i + 1;
4305        }
4306        i += 1;
4307    }
4308    if run_start < len {
4309        iov.push(std::io::IoSlice::new(&data[run_start..]));
4310    }
4311    if !iov.is_empty() {
4312        write_ioslices(writer, &iov)?;
4313    }
4314    Ok(())
4315}
4316
4317/// AVX2 zero-copy range delete: scans 32 bytes at a time using SIMD range
4318/// comparison, then iterates only the delete positions from the bitmask.
4319/// Blocks with no deletes (common for sparse data) skip with zero per-byte work.
4320#[cfg(target_arch = "x86_64")]
4321#[target_feature(enable = "avx2")]
4322unsafe fn delete_range_zerocopy_avx2(
4323    data: &[u8],
4324    writer: &mut impl Write,
4325    lo: u8,
4326    hi: u8,
4327) -> io::Result<()> {
4328    use std::arch::x86_64::*;
4329
4330    unsafe {
4331        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4332        let len = data.len();
4333        let mut run_start: usize = 0;
4334        let mut ri: usize = 0;
4335
4336        let range = hi - lo;
4337        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4338        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4339        let zero = _mm256_setzero_si256();
4340
4341        while ri + 32 <= len {
4342            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4343            let biased = _mm256_add_epi8(input, bias_v);
4344            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4345            let in_range = _mm256_cmpeq_epi8(gt, zero);
4346            let del_mask = _mm256_movemask_epi8(in_range) as u32;
4347
4348            if del_mask == 0 {
4349                // No bytes to delete — run continues
4350                ri += 32;
4351                continue;
4352            }
4353
4354            // Process each deleted byte position from the bitmask
4355            let mut m = del_mask;
4356            while m != 0 {
4357                let bit = m.trailing_zeros() as usize;
4358                let abs_pos = ri + bit;
4359                if abs_pos > run_start {
4360                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4361                    if iov.len() >= MAX_IOV {
4362                        write_ioslices(writer, &iov)?;
4363                        iov.clear();
4364                    }
4365                }
4366                run_start = abs_pos + 1;
4367                m &= m - 1; // clear lowest set bit (blsr)
4368            }
4369
4370            ri += 32;
4371        }
4372
4373        // Scalar tail
4374        while ri < len {
4375            let b = *data.get_unchecked(ri);
4376            if b >= lo && b <= hi {
4377                if ri > run_start {
4378                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4379                    if iov.len() >= MAX_IOV {
4380                        write_ioslices(writer, &iov)?;
4381                        iov.clear();
4382                    }
4383                }
4384                run_start = ri + 1;
4385            }
4386            ri += 1;
4387        }
4388
4389        if run_start < len {
4390            iov.push(std::io::IoSlice::new(&data[run_start..]));
4391        }
4392        if !iov.is_empty() {
4393            write_ioslices(writer, &iov)?;
4394        }
4395        Ok(())
4396    }
4397}
4398
4399/// SSE2 zero-copy range delete: same approach as AVX2 but with 16-byte blocks.
4400#[cfg(target_arch = "x86_64")]
4401#[target_feature(enable = "sse2")]
4402unsafe fn delete_range_zerocopy_sse2(
4403    data: &[u8],
4404    writer: &mut impl Write,
4405    lo: u8,
4406    hi: u8,
4407) -> io::Result<()> {
4408    use std::arch::x86_64::*;
4409
4410    unsafe {
4411        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4412        let len = data.len();
4413        let mut run_start: usize = 0;
4414        let mut ri: usize = 0;
4415
4416        let range = hi - lo;
4417        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4418        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4419        let zero = _mm_setzero_si128();
4420
4421        while ri + 16 <= len {
4422            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4423            let biased = _mm_add_epi8(input, bias_v);
4424            let gt = _mm_cmpgt_epi8(biased, threshold_v);
4425            let in_range = _mm_cmpeq_epi8(gt, zero);
4426            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4427
4428            if del_mask == 0 {
4429                ri += 16;
4430                continue;
4431            }
4432
4433            let mut m = del_mask;
4434            while m != 0 {
4435                let bit = m.trailing_zeros() as usize;
4436                let abs_pos = ri + bit;
4437                if abs_pos > run_start {
4438                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4439                    if iov.len() >= MAX_IOV {
4440                        write_ioslices(writer, &iov)?;
4441                        iov.clear();
4442                    }
4443                }
4444                run_start = abs_pos + 1;
4445                m &= m - 1;
4446            }
4447
4448            ri += 16;
4449        }
4450
4451        while ri < len {
4452            let b = *data.get_unchecked(ri);
4453            if b >= lo && b <= hi {
4454                if ri > run_start {
4455                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4456                    if iov.len() >= MAX_IOV {
4457                        write_ioslices(writer, &iov)?;
4458                        iov.clear();
4459                    }
4460                }
4461                run_start = ri + 1;
4462            }
4463            ri += 1;
4464        }
4465
4466        if run_start < len {
4467            iov.push(std::io::IoSlice::new(&data[run_start..]));
4468        }
4469        if !iov.is_empty() {
4470            write_ioslices(writer, &iov)?;
4471        }
4472        Ok(())
4473    }
4474}
4475
4476/// NEON zero-copy range delete for aarch64: scans 16 bytes at a time using
4477/// NEON unsigned comparison, creates bitmask via pairwise narrowing, then
4478/// iterates delete positions from the bitmask.
4479#[cfg(target_arch = "aarch64")]
4480#[target_feature(enable = "neon")]
4481unsafe fn delete_range_zerocopy_neon(
4482    data: &[u8],
4483    writer: &mut impl Write,
4484    lo: u8,
4485    hi: u8,
4486) -> io::Result<()> {
4487    use std::arch::aarch64::*;
4488
4489    unsafe {
4490        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4491        let len = data.len();
4492        let mut run_start: usize = 0;
4493        let mut ri: usize = 0;
4494
4495        let lo_v = vdupq_n_u8(lo);
4496        let hi_v = vdupq_n_u8(hi);
4497        // Bit position mask for extracting bitmask from comparison results
4498        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4499        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4500
4501        while ri + 16 <= len {
4502            let input = vld1q_u8(data.as_ptr().add(ri));
4503            // in_range = 0xFF where lo <= byte <= hi
4504            let ge_lo = vcgeq_u8(input, lo_v);
4505            let le_hi = vcleq_u8(input, hi_v);
4506            let in_range = vandq_u8(ge_lo, le_hi);
4507
4508            // Create 16-bit bitmask: reduce 16 bytes to 2 bytes
4509            let bits = vandq_u8(in_range, bit_mask_v);
4510            let pair = vpaddlq_u8(bits); // u8→u16 pairwise add
4511            let quad = vpaddlq_u16(pair); // u16→u32
4512            let octet = vpaddlq_u32(quad); // u32→u64
4513            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4514            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4515            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4516
4517            if del_mask == 0 {
4518                // No bytes to delete — run continues
4519                ri += 16;
4520                continue;
4521            }
4522
4523            // Process each deleted byte position
4524            let mut m = del_mask;
4525            while m != 0 {
4526                let bit = m.trailing_zeros() as usize;
4527                let abs_pos = ri + bit;
4528                if abs_pos > run_start {
4529                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4530                    if iov.len() >= MAX_IOV {
4531                        write_ioslices(writer, &iov)?;
4532                        iov.clear();
4533                    }
4534                }
4535                run_start = abs_pos + 1;
4536                m &= m - 1;
4537            }
4538
4539            ri += 16;
4540        }
4541
4542        // Scalar tail
4543        while ri < len {
4544            let b = *data.get_unchecked(ri);
4545            if b >= lo && b <= hi {
4546                if ri > run_start {
4547                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4548                    if iov.len() >= MAX_IOV {
4549                        write_ioslices(writer, &iov)?;
4550                        iov.clear();
4551                    }
4552                }
4553                run_start = ri + 1;
4554            }
4555            ri += 1;
4556        }
4557
4558        if run_start < len {
4559            iov.push(std::io::IoSlice::new(&data[run_start..]));
4560        }
4561        if !iov.is_empty() {
4562            write_ioslices(writer, &iov)?;
4563        }
4564        Ok(())
4565    }
4566}
4567
4568/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
4569/// Returns number of bytes written.
4570#[inline]
4571fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4572    let len = chunk.len();
4573    let mut out_pos = 0;
4574    let mut i = 0;
4575
4576    while i + 8 <= len {
4577        unsafe {
4578            let b0 = *chunk.get_unchecked(i);
4579            let b1 = *chunk.get_unchecked(i + 1);
4580            let b2 = *chunk.get_unchecked(i + 2);
4581            let b3 = *chunk.get_unchecked(i + 3);
4582            let b4 = *chunk.get_unchecked(i + 4);
4583            let b5 = *chunk.get_unchecked(i + 5);
4584            let b6 = *chunk.get_unchecked(i + 6);
4585            let b7 = *chunk.get_unchecked(i + 7);
4586
4587            *outbuf.get_unchecked_mut(out_pos) = b0;
4588            out_pos += !is_member(member, b0) as usize;
4589            *outbuf.get_unchecked_mut(out_pos) = b1;
4590            out_pos += !is_member(member, b1) as usize;
4591            *outbuf.get_unchecked_mut(out_pos) = b2;
4592            out_pos += !is_member(member, b2) as usize;
4593            *outbuf.get_unchecked_mut(out_pos) = b3;
4594            out_pos += !is_member(member, b3) as usize;
4595            *outbuf.get_unchecked_mut(out_pos) = b4;
4596            out_pos += !is_member(member, b4) as usize;
4597            *outbuf.get_unchecked_mut(out_pos) = b5;
4598            out_pos += !is_member(member, b5) as usize;
4599            *outbuf.get_unchecked_mut(out_pos) = b6;
4600            out_pos += !is_member(member, b6) as usize;
4601            *outbuf.get_unchecked_mut(out_pos) = b7;
4602            out_pos += !is_member(member, b7) as usize;
4603        }
4604        i += 8;
4605    }
4606
4607    while i < len {
4608        unsafe {
4609            let b = *chunk.get_unchecked(i);
4610            *outbuf.get_unchecked_mut(out_pos) = b;
4611            out_pos += !is_member(member, b) as usize;
4612        }
4613        i += 1;
4614    }
4615
4616    out_pos
4617}
4618
4619/// Zero-copy delete for general bitset: scan for runs of kept bytes,
4620/// build IoSlice entries pointing directly into the source data.
4621/// No allocation for output data — just ~16 bytes per IoSlice entry.
4622/// Flushes in MAX_IOV-sized batches for efficient writev.
4623fn delete_bitset_zerocopy(
4624    data: &[u8],
4625    member: &[u8; 32],
4626    writer: &mut impl Write,
4627) -> io::Result<()> {
4628    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4629    let len = data.len();
4630    let mut i = 0;
4631    let mut run_start: Option<usize> = None;
4632
4633    while i < len {
4634        let b = unsafe { *data.get_unchecked(i) };
4635        if is_member(member, b) {
4636            // This byte should be deleted
4637            if let Some(rs) = run_start {
4638                iov.push(std::io::IoSlice::new(&data[rs..i]));
4639                run_start = None;
4640                if iov.len() >= MAX_IOV {
4641                    write_ioslices(writer, &iov)?;
4642                    iov.clear();
4643                }
4644            }
4645        } else {
4646            // This byte should be kept
4647            if run_start.is_none() {
4648                run_start = Some(i);
4649            }
4650        }
4651        i += 1;
4652    }
4653    // Flush final run
4654    if let Some(rs) = run_start {
4655        iov.push(std::io::IoSlice::new(&data[rs..]));
4656    }
4657    if !iov.is_empty() {
4658        write_ioslices(writer, &iov)?;
4659    }
4660    Ok(())
4661}
4662
4663fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4664    // Streaming zero-copy delete using writev: build IoSlice batches of MAX_IOV
4665    // pointing to gaps between deleted characters, write each batch immediately.
4666    // Avoids allocating the full Vec<IoSlice> for all positions.
4667    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4668    let mut last = 0;
4669    for pos in memchr::memchr_iter(ch, data) {
4670        if pos > last {
4671            iov.push(std::io::IoSlice::new(&data[last..pos]));
4672            if iov.len() >= MAX_IOV {
4673                write_ioslices(writer, &iov)?;
4674                iov.clear();
4675            }
4676        }
4677        last = pos + 1;
4678    }
4679    if last < data.len() {
4680        iov.push(std::io::IoSlice::new(&data[last..]));
4681    }
4682    if !iov.is_empty() {
4683        write_ioslices(writer, &iov)?;
4684    }
4685    Ok(())
4686}
4687
4688fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4689    let c0 = chars[0];
4690    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4691    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4692    let is_three = chars.len() >= 3;
4693
4694    // Streaming zero-copy delete: batch IoSlice entries and write in groups of MAX_IOV.
4695    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4696    let mut last = 0;
4697
4698    macro_rules! process_pos {
4699        ($pos:expr) => {
4700            if $pos > last {
4701                iov.push(std::io::IoSlice::new(&data[last..$pos]));
4702                if iov.len() >= MAX_IOV {
4703                    write_ioslices(writer, &iov)?;
4704                    iov.clear();
4705                }
4706            }
4707            last = $pos + 1;
4708        };
4709    }
4710
4711    if is_three {
4712        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4713            process_pos!(pos);
4714        }
4715    } else {
4716        for pos in memchr::memchr2_iter(c0, c1, data) {
4717            process_pos!(pos);
4718        }
4719    }
4720    if last < data.len() {
4721        iov.push(std::io::IoSlice::new(&data[last..]));
4722    }
4723    if !iov.is_empty() {
4724        write_ioslices(writer, &iov)?;
4725    }
4726    Ok(())
4727}
4728
4729/// Delete + squeeze from mmap'd byte slice.
4730///
4731/// For data <= 16MB: delete+squeeze into one buffer, one write syscall.
4732/// For data > 16MB: chunked approach to limit memory.
4733pub fn delete_squeeze_mmap(
4734    delete_chars: &[u8],
4735    squeeze_chars: &[u8],
4736    data: &[u8],
4737    writer: &mut impl Write,
4738) -> io::Result<()> {
4739    let delete_set = build_member_set(delete_chars);
4740    let squeeze_set = build_member_set(squeeze_chars);
4741
4742    // Single-allocation delete+squeeze: full-size buffer, single write_all.
4743    let mut outbuf = alloc_uninit_vec(data.len());
4744    let mut last_squeezed: u16 = 256;
4745    let mut out_pos = 0;
4746
4747    for &b in data.iter() {
4748        if is_member(&delete_set, b) {
4749            continue;
4750        }
4751        if is_member(&squeeze_set, b) {
4752            if last_squeezed == b as u16 {
4753                continue;
4754            }
4755            last_squeezed = b as u16;
4756        } else {
4757            last_squeezed = 256;
4758        }
4759        unsafe {
4760            *outbuf.get_unchecked_mut(out_pos) = b;
4761        }
4762        out_pos += 1;
4763    }
4764    writer.write_all(&outbuf[..out_pos])
4765}
4766
4767/// Squeeze from mmap'd byte slice.
4768///
4769/// For data >= 2MB: uses rayon parallel processing with boundary fixup.
4770/// For data <= 16MB: squeeze into one buffer, one write syscall.
4771/// For data > 16MB: chunked approach to limit memory.
4772pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4773    if squeeze_chars.len() == 1 {
4774        return squeeze_single_mmap(squeeze_chars[0], data, writer);
4775    }
4776    if squeeze_chars.len() == 2 {
4777        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
4778    }
4779    if squeeze_chars.len() == 3 {
4780        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
4781    }
4782
4783    let member = build_member_set(squeeze_chars);
4784
4785    // Parallel path: squeeze each chunk independently, then fix boundaries
4786    if data.len() >= PARALLEL_THRESHOLD {
4787        let n_threads = rayon::current_num_threads().max(1);
4788        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4789
4790        let results: Vec<Vec<u8>> = data
4791            .par_chunks(chunk_size)
4792            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
4793            .collect();
4794
4795        // Build IoSlice list, fixing boundaries: if chunk N ends with byte B
4796        // and chunk N+1 starts with same byte B, and B is in squeeze set,
4797        // skip the first byte(s) of chunk N+1 that equal B.
4798        // Collect slices for writev to minimize syscalls.
4799        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
4800        for (idx, result) in results.iter().enumerate() {
4801            if result.is_empty() {
4802                continue;
4803            }
4804            if idx > 0 {
4805                // Check boundary: does previous chunk end with same squeezable byte?
4806                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
4807                    if is_member(&member, prev_last) {
4808                        // Skip leading bytes in this chunk that equal prev_last
4809                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
4810                        if skip < result.len() {
4811                            slices.push(std::io::IoSlice::new(&result[skip..]));
4812                        }
4813                        continue;
4814                    }
4815                }
4816            }
4817            slices.push(std::io::IoSlice::new(result));
4818        }
4819        return write_ioslices(writer, &slices);
4820    }
4821
4822    // Single-allocation squeeze: full-size buffer, single write_all.
4823    let mut outbuf = alloc_uninit_vec(data.len());
4824    let len = data.len();
4825    let mut wp = 0;
4826    let mut i = 0;
4827    let mut last_squeezed: u16 = 256;
4828
4829    unsafe {
4830        let inp = data.as_ptr();
4831        let outp = outbuf.as_mut_ptr();
4832
4833        while i < len {
4834            let b = *inp.add(i);
4835            if is_member(&member, b) {
4836                if last_squeezed != b as u16 {
4837                    *outp.add(wp) = b;
4838                    wp += 1;
4839                    last_squeezed = b as u16;
4840                }
4841                i += 1;
4842                while i < len && *inp.add(i) == b {
4843                    i += 1;
4844                }
4845            } else {
4846                last_squeezed = 256;
4847                *outp.add(wp) = b;
4848                wp += 1;
4849                i += 1;
4850            }
4851        }
4852    }
4853    writer.write_all(&outbuf[..wp])
4854}
4855
4856/// Squeeze a single chunk using bitset membership. Returns squeezed output.
4857fn squeeze_chunk_bitset(chunk: &[u8], member: &[u8; 32]) -> Vec<u8> {
4858    let len = chunk.len();
4859    let mut out = Vec::with_capacity(len);
4860    let mut last_squeezed: u16 = 256;
4861    let mut i = 0;
4862
4863    unsafe {
4864        out.set_len(len);
4865        let inp = chunk.as_ptr();
4866        let outp: *mut u8 = out.as_mut_ptr();
4867        let mut wp = 0;
4868
4869        while i < len {
4870            let b = *inp.add(i);
4871            if is_member(member, b) {
4872                if last_squeezed != b as u16 {
4873                    *outp.add(wp) = b;
4874                    wp += 1;
4875                    last_squeezed = b as u16;
4876                }
4877                i += 1;
4878                while i < len && *inp.add(i) == b {
4879                    i += 1;
4880                }
4881            } else {
4882                last_squeezed = 256;
4883                *outp.add(wp) = b;
4884                wp += 1;
4885                i += 1;
4886            }
4887        }
4888        out.set_len(wp);
4889    }
4890    out
4891}
4892
4893fn squeeze_multi_mmap<const N: usize>(
4894    chars: &[u8],
4895    data: &[u8],
4896    writer: &mut impl Write,
4897) -> io::Result<()> {
4898    // Parallel path for large data: squeeze each chunk, fix boundaries with writev
4899    if data.len() >= PARALLEL_THRESHOLD {
4900        let member = build_member_set(chars);
4901        let n_threads = rayon::current_num_threads().max(1);
4902        let chunk_size = (data.len() / n_threads).max(32 * 1024);
4903
4904        let results: Vec<Vec<u8>> = data
4905            .par_chunks(chunk_size)
4906            .map(|chunk| squeeze_chunk_bitset(chunk, &member))
4907            .collect();
4908
4909        // Build IoSlice list, fixing boundaries
4910        let mut slices: Vec<std::io::IoSlice> = Vec::with_capacity(results.len());
4911        for (idx, result) in results.iter().enumerate() {
4912            if result.is_empty() {
4913                continue;
4914            }
4915            if idx > 0 {
4916                if let Some(&prev_last) = results[..idx].iter().rev().find_map(|r| r.last()) {
4917                    if is_member(&member, prev_last) {
4918                        let skip = result.iter().take_while(|&&b| b == prev_last).count();
4919                        if skip < result.len() {
4920                            slices.push(std::io::IoSlice::new(&result[skip..]));
4921                        }
4922                        continue;
4923                    }
4924                }
4925            }
4926            slices.push(std::io::IoSlice::new(result));
4927        }
4928        return write_ioslices(writer, &slices);
4929    }
4930
4931    // Zero-copy writev: build IoSlice entries pointing directly into
4932    // the original mmap'd data, keeping one byte per run of squeezable chars.
4933    // Each IoSlice points at the gap between squeeze points (inclusive of
4934    // the first byte of a run) — no data is copied.
4935    let single = [chars[0]; 1]; // scratch for emitting single squeeze byte
4936    let _ = single;
4937    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
4938    let mut cursor = 0;
4939    let mut last_squeezed: u16 = 256;
4940
4941    macro_rules! find_next {
4942        ($data:expr) => {
4943            if N == 2 {
4944                memchr::memchr2(chars[0], chars[1], $data)
4945            } else {
4946                memchr::memchr3(chars[0], chars[1], chars[2], $data)
4947            }
4948        };
4949    }
4950
4951    while cursor < data.len() {
4952        match find_next!(&data[cursor..]) {
4953            Some(offset) => {
4954                let pos = cursor + offset;
4955                let b = data[pos];
4956                // Emit gap before squeeze point
4957                if pos > cursor {
4958                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
4959                    last_squeezed = 256;
4960                }
4961                // Emit single byte if not duplicate
4962                if last_squeezed != b as u16 {
4963                    // Point at the byte in the original data (zero-copy)
4964                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
4965                    last_squeezed = b as u16;
4966                }
4967                // Skip the run of same byte
4968                let mut skip = pos + 1;
4969                while skip < data.len() && data[skip] == b {
4970                    skip += 1;
4971                }
4972                cursor = skip;
4973                // Flush when approaching MAX_IOV
4974                if iov.len() >= MAX_IOV {
4975                    write_ioslices(writer, &iov)?;
4976                    iov.clear();
4977                }
4978            }
4979            None => {
4980                if cursor < data.len() {
4981                    iov.push(std::io::IoSlice::new(&data[cursor..]));
4982                }
4983                break;
4984            }
4985        }
4986    }
4987    if !iov.is_empty() {
4988        write_ioslices(writer, &iov)?;
4989    }
4990    Ok(())
4991}
4992
4993fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4994    if data.is_empty() {
4995        return Ok(());
4996    }
4997
4998    // Quick check: no consecutive pairs means no squeezing needed
4999    let pair = [ch, ch];
5000    if memchr::memmem::find(data, &pair).is_none() {
5001        return writer.write_all(data);
5002    }
5003
5004    // Zero-copy writev approach: build IoSlice entries pointing directly into
5005    // the original mmap'd data, skipping duplicate bytes in runs.
5006    // For `tr -s ' '` on 10MB with ~5K squeeze points:
5007    //   - ~10K IoSlice entries (one per gap + one per squeeze point)
5008    //   - ~10 writev syscalls (at 1024 entries per batch)
5009    //   - Zero data copy — kernel reads directly from mmap pages
5010    let finder = memchr::memmem::Finder::new(&pair);
5011    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
5012    let mut cursor = 0;
5013
5014    while cursor < data.len() {
5015        match finder.find(&data[cursor..]) {
5016            Some(offset) => {
5017                let pair_pos = cursor + offset;
5018                // Include everything up to and including the first byte of the pair
5019                let seg_end = pair_pos + 1;
5020                if seg_end > cursor {
5021                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
5022                }
5023                // Skip all remaining consecutive ch bytes (the run)
5024                let mut skip = seg_end;
5025                while skip < data.len() && data[skip] == ch {
5026                    skip += 1;
5027                }
5028                cursor = skip;
5029                // Flush when approaching MAX_IOV
5030                if iov.len() >= MAX_IOV {
5031                    write_ioslices(writer, &iov)?;
5032                    iov.clear();
5033                }
5034            }
5035            None => {
5036                // No more pairs — emit remainder
5037                if cursor < data.len() {
5038                    iov.push(std::io::IoSlice::new(&data[cursor..]));
5039                }
5040                break;
5041            }
5042        }
5043    }
5044
5045    if !iov.is_empty() {
5046        write_ioslices(writer, &iov)?;
5047    }
5048    Ok(())
5049}