
coreutils_rs/tr/core.rs

1use std::io::{self, Read, Write};
2
3/// Maximum IoSlice entries per write_vectored batch.
4/// Linux UIO_MAXIOV is 1024; we use that as our batch limit.
5const MAX_IOV: usize = 1024;
6
7/// Stream buffer: 2MB — amortises read()/write() syscall overhead (4x fewer calls vs 512KB).
8const STREAM_BUF: usize = 2 * 1024 * 1024;
9
10/// Maximum data size for a single full-size output allocation.
11/// Files larger than this fall back to the chunked approach to avoid OOM.
12const SINGLE_ALLOC_LIMIT: usize = 512 * 1024 * 1024;
13
14/// 256-entry lookup table for byte compaction: for each 8-bit keep mask,
15/// stores the bit positions of set bits (indices of bytes to keep).
16/// Used by compact_8bytes to replace the serial trailing_zeros loop with
17/// unconditional indexed stores, eliminating the tzcnt→blsr dependency chain.
18/// Total size: 256 * 8 = 2KB — fits entirely in L1 cache.
19#[cfg(target_arch = "x86_64")]
20static COMPACT_LUT: [[u8; 8]; 256] = {
21    let mut lut = [[0u8; 8]; 256];
22    let mut mask: u16 = 0;
23    while mask < 256 {
24        let mut idx: usize = 0;
25        let mut bit: u8 = 0;
26        while bit < 8 {
27            if (mask >> bit) & 1 != 0 {
28                lut[mask as usize][idx] = bit;
29                idx += 1;
30            }
31            bit += 1;
32        }
33        mask += 1;
34    }
35    lut
36};
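// A minimal sketch (not the crate's compact_8bytes; the name below is
// hypothetical) of how COMPACT_LUT turns an 8-bit keep mask into eight
// unconditional indexed stores. Slots past the popcount hold index 0 and are
// treated as don't-cares by the caller.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
fn compact_8bytes_sketch(src: &[u8; 8], keep_mask: u8, out: &mut [u8; 8]) -> usize {
    let idx = &COMPACT_LUT[keep_mask as usize];
    for j in 0..8 {
        // Unconditional store: no branch, no tzcnt/blsr dependency chain.
        out[j] = src[idx[j] as usize];
    }
    keep_mask.count_ones() as usize
}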
37
38/// Write multiple IoSlice buffers using write_vectored, batching into MAX_IOV-sized groups.
39/// On a partial write, falls back to write_all for the slices not yet fully written.
40#[inline]
41fn write_ioslices(writer: &mut impl Write, slices: &[std::io::IoSlice]) -> io::Result<()> {
42    if slices.is_empty() {
43        return Ok(());
44    }
45    for batch in slices.chunks(MAX_IOV) {
46        let total: usize = batch.iter().map(|s| s.len()).sum();
47        match writer.write_vectored(batch) {
48            Ok(n) if n >= total => continue,
49            Ok(mut written) => {
50                // Partial write: fall back to write_all per remaining slice
51                for slice in batch {
52                    let slen = slice.len();
53                    if written >= slen {
54                        written -= slen;
55                        continue;
56                    }
57                    if written > 0 {
58                        writer.write_all(&slice[written..])?;
59                        written = 0;
60                    } else {
61                        writer.write_all(slice)?;
62                    }
63                }
64            }
65            Err(e) => return Err(e),
66        }
67    }
68    Ok(())
69}
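// Hypothetical call-site sketch (the real call sites live elsewhere in this
// file): gather borrowed segments into IoSlices and hand them to the batched
// writer above.
#[allow(dead_code)]
fn write_segments(writer: &mut impl Write, segments: &[&[u8]]) -> io::Result<()> {
    let slices: Vec<io::IoSlice> = segments.iter().map(|&s| io::IoSlice::new(s)).collect();
    write_ioslices(writer, &slices)
}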
70
71/// Allocate a Vec<u8> of given length without zero-initialization.
72/// Uses MADV_HUGEPAGE on Linux for buffers >= 2MB to reduce TLB misses.
73/// SAFETY: Caller must write all bytes before reading them.
74#[inline]
75#[allow(clippy::uninit_vec)]
76fn alloc_uninit_vec(len: usize) -> Vec<u8> {
77    let mut v = Vec::with_capacity(len);
78    // SAFETY: u8 has no drop, no invalid bit patterns; caller will overwrite before reading
79    unsafe {
80        v.set_len(len);
81    }
82    #[cfg(target_os = "linux")]
83    if len >= 2 * 1024 * 1024 {
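        // Best-effort hint only: the return value is ignored, and madvise can
        // fail (e.g. EINVAL when the allocation does not start on a page boundary).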
84        unsafe {
85            libc::madvise(
86                v.as_mut_ptr() as *mut libc::c_void,
87                len,
88                libc::MADV_HUGEPAGE,
89            );
90        }
91    }
92    v
93}
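// Illustrative use of the contract above (an assumed pattern, not necessarily a
// real call site): every byte of the uninitialised buffer is overwritten by
// translate_to before the Vec is ever read.
#[allow(dead_code)]
fn translate_into_fresh_buf(src: &[u8], table: &[u8; 256]) -> Vec<u8> {
    let mut out = alloc_uninit_vec(src.len());
    translate_to(src, &mut out, table); // writes out[0..src.len()] in full
    out
}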
94
95/// Build a 256-byte lookup table mapping set1[i] -> set2[i]; when set2 is shorter,
/// the excess set1 bytes map to set2's last byte.
96#[inline]
97fn build_translate_table(set1: &[u8], set2: &[u8]) -> [u8; 256] {
98    let mut table: [u8; 256] = std::array::from_fn(|i| i as u8);
99    let last = set2.last().copied();
100    for (i, &from) in set1.iter().enumerate() {
101        table[from as usize] = if i < set2.len() {
102            set2[i]
103        } else {
104            last.unwrap_or(from)
105        };
106    }
107    table
108}
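// Small worked example of the table semantics, including the
// "pad set2 with its last byte" behaviour encoded above.
#[cfg(test)]
mod translate_table_example {
    use super::build_translate_table;

    #[test]
    fn maps_listed_bytes_and_pads_with_last() {
        let t = build_translate_table(b"abc", b"xyz");
        assert_eq!(t[b'a' as usize], b'x');
        assert_eq!(t[b'd' as usize], b'd'); // unlisted bytes stay identity
        let t2 = build_translate_table(b"abc", b"Z");
        assert_eq!(t2[b'b' as usize], b'Z'); // set2 exhausted: last byte reused
    }
}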
109
110/// Build a 256-bit (32-byte) membership set for O(1) byte lookup.
111#[inline]
112fn build_member_set(chars: &[u8]) -> [u8; 32] {
113    let mut set = [0u8; 32];
114    for &ch in chars {
115        set[ch as usize >> 3] |= 1 << (ch & 7);
116    }
117    set
118}
119
120#[inline(always)]
121fn is_member(set: &[u8; 32], ch: u8) -> bool {
122    unsafe { (*set.get_unchecked(ch as usize >> 3) & (1 << (ch & 7))) != 0 }
123}
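// Bit-layout check for the 256-bit membership set: byte b is stored at bit
// (b & 7) of set[b >> 3].
#[cfg(test)]
mod member_set_example {
    use super::{build_member_set, is_member};

    #[test]
    fn membership_matches_input_bytes() {
        let set = build_member_set(b" \t\n");
        assert!(is_member(&set, b' '));
        assert!(is_member(&set, b'\n'));
        assert!(!is_member(&set, b'x'));
    }
}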
124
125/// Cached SIMD capability level for x86_64.
126/// 0 = unchecked, 1 = scalar only, 2 = SSSE3, 3 = AVX2
127#[cfg(target_arch = "x86_64")]
128static SIMD_LEVEL: std::sync::atomic::AtomicU8 = std::sync::atomic::AtomicU8::new(0);
129
130#[cfg(target_arch = "x86_64")]
131#[inline(always)]
132fn get_simd_level() -> u8 {
133    let level = SIMD_LEVEL.load(std::sync::atomic::Ordering::Relaxed);
134    if level != 0 {
135        return level;
136    }
137    let detected = if is_x86_feature_detected!("avx2") {
138        3
139    } else if is_x86_feature_detected!("ssse3") {
140        2
141    } else {
142        1
143    };
144    SIMD_LEVEL.store(detected, std::sync::atomic::Ordering::Relaxed);
145    detected
146}
147
148/// Count how many entries in the translate table are non-identity.
149#[cfg(target_arch = "x86_64")]
150#[inline]
151fn count_non_identity(table: &[u8; 256]) -> usize {
152    table
153        .iter()
154        .enumerate()
155        .filter(|&(i, &v)| v != i as u8)
156        .count()
157}
158
159/// Translate bytes in-place using a 256-byte lookup table.
160/// For sparse translations (few bytes change), uses SIMD skip-ahead:
161/// compare 32 bytes at a time against identity, skip unchanged chunks.
162/// For dense translations, uses full SIMD nibble decomposition.
163/// Falls back to 8x-unrolled scalar on non-x86_64 platforms.
164#[inline(always)]
165fn translate_inplace(data: &mut [u8], table: &[u8; 256]) {
166    #[cfg(target_arch = "x86_64")]
167    {
168        let level = get_simd_level();
169        if level >= 3 {
170            // For sparse translations (<=16 non-identity entries), the skip-ahead
171            // approach is faster: load 32 bytes, do a full nibble lookup, compare
172            // against input, skip store if identical. This avoids writing to pages
173            // that don't change (important for MAP_PRIVATE COW mmap).
174            let non_id = count_non_identity(table);
175            if non_id > 0 && non_id <= 16 {
176                unsafe { translate_inplace_avx2_sparse(data, table) };
177                return;
178            }
179            unsafe { translate_inplace_avx2_table(data, table) };
180            return;
181        }
182        if level >= 2 {
183            unsafe { translate_inplace_ssse3_table(data, table) };
184            return;
185        }
186    }
187    translate_inplace_scalar(data, table);
188}
189
190/// Sparse AVX2 translate: skip unchanged 32-byte chunks.
191/// For each chunk: perform full nibble lookup, compare result vs input.
192/// If identical (no bytes changed), skip the store entirely.
193/// This reduces memory bandwidth and avoids COW page faults for
194/// MAP_PRIVATE mmaps when most bytes are unchanged.
195#[cfg(target_arch = "x86_64")]
196#[target_feature(enable = "avx2")]
197unsafe fn translate_inplace_avx2_sparse(data: &mut [u8], table: &[u8; 256]) {
198    use std::arch::x86_64::*;
199
200    unsafe {
201        let len = data.len();
202        let ptr = data.as_mut_ptr();
203
204        // Pre-build 16 lookup vectors (same as full nibble decomposition)
205        let mut lut = [_mm256_setzero_si256(); 16];
206        for h in 0u8..16 {
207            let base = (h as usize) * 16;
208            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
209            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
210            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
211        }
212
213        let lo_mask = _mm256_set1_epi8(0x0F);
214
215        let mut i = 0;
216        while i + 32 <= len {
217            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
218            let lo_nibble = _mm256_and_si256(input, lo_mask);
219            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
220
221            let mut result = _mm256_setzero_si256();
222            macro_rules! do_nibble {
223                ($h:expr) => {
224                    let h_val = _mm256_set1_epi8($h as i8);
225                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
226                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
227                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
228                };
229            }
230            do_nibble!(0);
231            do_nibble!(1);
232            do_nibble!(2);
233            do_nibble!(3);
234            do_nibble!(4);
235            do_nibble!(5);
236            do_nibble!(6);
237            do_nibble!(7);
238            do_nibble!(8);
239            do_nibble!(9);
240            do_nibble!(10);
241            do_nibble!(11);
242            do_nibble!(12);
243            do_nibble!(13);
244            do_nibble!(14);
245            do_nibble!(15);
246
247            // Only store if result differs from input (skip unchanged chunks)
248            let diff = _mm256_xor_si256(input, result);
249            if _mm256_testz_si256(diff, diff) == 0 {
250                _mm256_storeu_si256(ptr.add(i) as *mut _, result);
251            }
252            i += 32;
253        }
254
255        // Scalar tail
256        while i < len {
257            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
258            i += 1;
259        }
260    }
261}
262
263/// Scalar fallback: 8x-unrolled table lookup.
264#[cfg(not(target_arch = "aarch64"))]
265#[inline(always)]
266fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
267    let len = data.len();
268    let ptr = data.as_mut_ptr();
269    let mut i = 0;
270    unsafe {
271        while i + 8 <= len {
272            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
273            *ptr.add(i + 1) = *table.get_unchecked(*ptr.add(i + 1) as usize);
274            *ptr.add(i + 2) = *table.get_unchecked(*ptr.add(i + 2) as usize);
275            *ptr.add(i + 3) = *table.get_unchecked(*ptr.add(i + 3) as usize);
276            *ptr.add(i + 4) = *table.get_unchecked(*ptr.add(i + 4) as usize);
277            *ptr.add(i + 5) = *table.get_unchecked(*ptr.add(i + 5) as usize);
278            *ptr.add(i + 6) = *table.get_unchecked(*ptr.add(i + 6) as usize);
279            *ptr.add(i + 7) = *table.get_unchecked(*ptr.add(i + 7) as usize);
280            i += 8;
281        }
282        while i < len {
283            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
284            i += 1;
285        }
286    }
287}
288
289/// ARM64 NEON table lookup using nibble decomposition (same algorithm as x86 pshufb).
290/// Uses vqtbl1q_u8 for 16-byte table lookups, processes 16 bytes per iteration.
291#[cfg(target_arch = "aarch64")]
292#[inline(always)]
293fn translate_inplace_scalar(data: &mut [u8], table: &[u8; 256]) {
294    unsafe { translate_inplace_neon_table(data, table) };
295}
296
297#[cfg(target_arch = "aarch64")]
298#[target_feature(enable = "neon")]
299unsafe fn translate_inplace_neon_table(data: &mut [u8], table: &[u8; 256]) {
300    use std::arch::aarch64::*;
301
302    unsafe {
303        let len = data.len();
304        let ptr = data.as_mut_ptr();
305
306        // Pre-build 16 NEON lookup vectors (one per high nibble)
307        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
308        for h in 0u8..16 {
309            let base = (h as usize) * 16;
310            lut[h as usize] = vld1q_u8(table.as_ptr().add(base));
311        }
312
313        let lo_mask = vdupq_n_u8(0x0F);
314        let mut i = 0;
315
316        while i + 16 <= len {
317            let input = vld1q_u8(ptr.add(i));
318            let lo_nibble = vandq_u8(input, lo_mask);
319            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
320
321            let mut result = vdupq_n_u8(0);
322            macro_rules! do_nibble {
323                ($h:expr) => {
324                    let h_val = vdupq_n_u8($h);
325                    let mask = vceqq_u8(hi_nibble, h_val);
326                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
327                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
328                };
329            }
330            do_nibble!(0);
331            do_nibble!(1);
332            do_nibble!(2);
333            do_nibble!(3);
334            do_nibble!(4);
335            do_nibble!(5);
336            do_nibble!(6);
337            do_nibble!(7);
338            do_nibble!(8);
339            do_nibble!(9);
340            do_nibble!(10);
341            do_nibble!(11);
342            do_nibble!(12);
343            do_nibble!(13);
344            do_nibble!(14);
345            do_nibble!(15);
346
347            vst1q_u8(ptr.add(i), result);
348            i += 16;
349        }
350
351        // Scalar tail
352        while i < len {
353            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
354            i += 1;
355        }
356    }
357}
358
359// ============================================================================
360// SIMD arbitrary table lookup using pshufb nibble decomposition (x86_64)
361// ============================================================================
362//
363// For an arbitrary 256-byte lookup table, we decompose each byte into
364// high nibble (bits 7-4) and low nibble (bits 3-0). We pre-build 16
365// SIMD vectors, one for each high nibble value h (0..15), containing
366// the 16 table entries table[h*16+0..h*16+15]. Then for each input
367// vector we:
368//   1. Extract low nibble (AND 0x0F) -> used as pshufb index
369//   2. Extract high nibble (shift right 4) -> used to select which table
370//   3. For each of the 16 high nibble values, create a mask where
371//      the high nibble equals that value, pshufb the corresponding
372//      table, and accumulate results
373//
374// AVX2 processes 32 bytes/iteration; SSSE3 processes 16 bytes/iteration.
375// With instruction-level parallelism, this achieves much higher throughput
376// than scalar table lookups which have serial data dependencies.
377
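// Scalar model of the decomposition described above (illustration only, not
// used by the SIMD paths): for each byte exactly one of the 16 high-nibble
// masks is all-ones, so the OR-accumulation reduces to
// table[hi * 16 + lo] == table[byte].
#[allow(dead_code)]
fn nibble_lookup_model(byte: u8, table: &[u8; 256]) -> u8 {
    let lo = byte & 0x0F;
    let hi = byte >> 4;
    let mut result = 0u8;
    for h in 0u8..16 {
        let mask = if hi == h { 0xFF } else { 0x00 };
        let looked_up = table[(h as usize) * 16 + lo as usize];
        result |= mask & looked_up;
    }
    result
}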
378#[cfg(target_arch = "x86_64")]
379#[target_feature(enable = "avx2")]
380unsafe fn translate_inplace_avx2_table(data: &mut [u8], table: &[u8; 256]) {
381    use std::arch::x86_64::*;
382
383    unsafe {
384        let len = data.len();
385        let ptr = data.as_mut_ptr();
386
387        // Pre-build 16 lookup vectors, one per high nibble value.
388        // Each vector holds 32 bytes = 2x 128-bit lanes, each lane has the same
389        // 16 table entries for pshufb indexing by low nibble.
390        let mut lut = [_mm256_setzero_si256(); 16];
391        for h in 0u8..16 {
392            let base = (h as usize) * 16;
393            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
394            // Broadcast the 128-bit row to both lanes of the 256-bit vector
395            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
396            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
397        }
398
399        let lo_mask = _mm256_set1_epi8(0x0F);
400
401        let mut i = 0;
402
403        // 2x unrolled: process 64 bytes (2x32) per iteration for better ILP.
404        // The CPU can overlap load/compute of the second vector while the first
405        // is in the nibble decomposition pipeline.
406        while i + 64 <= len {
407            let input0 = _mm256_loadu_si256(ptr.add(i) as *const _);
408            let input1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
409
410            let lo0 = _mm256_and_si256(input0, lo_mask);
411            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
412            let lo1 = _mm256_and_si256(input1, lo_mask);
413            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
414
415            let mut r0 = _mm256_setzero_si256();
416            let mut r1 = _mm256_setzero_si256();
417
418            macro_rules! do_nibble2 {
419                ($h:expr) => {
420                    let h_val = _mm256_set1_epi8($h as i8);
421                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
422                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
423                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
424                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
425                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
426                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
427                };
428            }
429            do_nibble2!(0);
430            do_nibble2!(1);
431            do_nibble2!(2);
432            do_nibble2!(3);
433            do_nibble2!(4);
434            do_nibble2!(5);
435            do_nibble2!(6);
436            do_nibble2!(7);
437            do_nibble2!(8);
438            do_nibble2!(9);
439            do_nibble2!(10);
440            do_nibble2!(11);
441            do_nibble2!(12);
442            do_nibble2!(13);
443            do_nibble2!(14);
444            do_nibble2!(15);
445
446            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
447            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
448            i += 64;
449        }
450
451        // Remaining 32-byte chunk
452        if i + 32 <= len {
453            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
454            let lo_nibble = _mm256_and_si256(input, lo_mask);
455            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
456
457            let mut result = _mm256_setzero_si256();
458
459            macro_rules! do_nibble {
460                ($h:expr) => {
461                    let h_val = _mm256_set1_epi8($h as i8);
462                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
463                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
464                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
465                };
466            }
467            do_nibble!(0);
468            do_nibble!(1);
469            do_nibble!(2);
470            do_nibble!(3);
471            do_nibble!(4);
472            do_nibble!(5);
473            do_nibble!(6);
474            do_nibble!(7);
475            do_nibble!(8);
476            do_nibble!(9);
477            do_nibble!(10);
478            do_nibble!(11);
479            do_nibble!(12);
480            do_nibble!(13);
481            do_nibble!(14);
482            do_nibble!(15);
483
484            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
485            i += 32;
486        }
487
488        // SSE/SSSE3 tail for remaining 16-byte chunk
489        if i + 16 <= len {
490            let lo_mask128 = _mm_set1_epi8(0x0F);
491
492            let mut lut128 = [_mm_setzero_si128(); 16];
493            for h in 0u8..16 {
494                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
495            }
496
497            let input = _mm_loadu_si128(ptr.add(i) as *const _);
498            let lo_nib = _mm_and_si128(input, lo_mask128);
499            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
500
501            let mut res = _mm_setzero_si128();
502            macro_rules! do_nibble128 {
503                ($h:expr) => {
504                    let h_val = _mm_set1_epi8($h as i8);
505                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
506                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
507                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
508                };
509            }
510            do_nibble128!(0);
511            do_nibble128!(1);
512            do_nibble128!(2);
513            do_nibble128!(3);
514            do_nibble128!(4);
515            do_nibble128!(5);
516            do_nibble128!(6);
517            do_nibble128!(7);
518            do_nibble128!(8);
519            do_nibble128!(9);
520            do_nibble128!(10);
521            do_nibble128!(11);
522            do_nibble128!(12);
523            do_nibble128!(13);
524            do_nibble128!(14);
525            do_nibble128!(15);
526
527            _mm_storeu_si128(ptr.add(i) as *mut _, res);
528            i += 16;
529        }
530
531        // Scalar tail
532        while i < len {
533            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
534            i += 1;
535        }
536    }
537}
538
539#[cfg(target_arch = "x86_64")]
540#[target_feature(enable = "ssse3")]
541unsafe fn translate_inplace_ssse3_table(data: &mut [u8], table: &[u8; 256]) {
542    use std::arch::x86_64::*;
543
544    unsafe {
545        let len = data.len();
546        let ptr = data.as_mut_ptr();
547
548        // Pre-build 16 lookup vectors for pshufb
549        let mut lut = [_mm_setzero_si128(); 16];
550        for h in 0u8..16 {
551            let base = (h as usize) * 16;
552            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
553            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
554        }
555
556        let lo_mask = _mm_set1_epi8(0x0F);
557
558        let mut i = 0;
559        while i + 16 <= len {
560            let input = _mm_loadu_si128(ptr.add(i) as *const _);
561            let lo_nibble = _mm_and_si128(input, lo_mask);
562            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
563
564            let mut result = _mm_setzero_si128();
565
566            macro_rules! do_nibble {
567                ($h:expr) => {
568                    let h_val = _mm_set1_epi8($h as i8);
569                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
570                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
571                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
572                };
573            }
574            do_nibble!(0);
575            do_nibble!(1);
576            do_nibble!(2);
577            do_nibble!(3);
578            do_nibble!(4);
579            do_nibble!(5);
580            do_nibble!(6);
581            do_nibble!(7);
582            do_nibble!(8);
583            do_nibble!(9);
584            do_nibble!(10);
585            do_nibble!(11);
586            do_nibble!(12);
587            do_nibble!(13);
588            do_nibble!(14);
589            do_nibble!(15);
590
591            _mm_storeu_si128(ptr.add(i) as *mut _, result);
592            i += 16;
593        }
594
595        // Scalar tail
596        while i < len {
597            *ptr.add(i) = *table.get_unchecked(*ptr.add(i) as usize);
598            i += 1;
599        }
600    }
601}
602
603/// Translate bytes from source to destination using a 256-byte lookup table.
604/// On x86_64 with SSSE3+, uses SIMD pshufb-based nibble decomposition.
605#[inline(always)]
606fn translate_to(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
607    debug_assert!(dst.len() >= src.len());
608    #[cfg(target_arch = "x86_64")]
609    {
610        let level = get_simd_level();
611        if level >= 3 {
612            // Use nontemporal stores when dst is 32-byte aligned (large Vec allocations)
613            if dst.as_ptr() as usize & 31 == 0 {
614                unsafe { translate_to_avx2_table_nt(src, dst, table) };
615            } else {
616                unsafe { translate_to_avx2_table(src, dst, table) };
617            }
618            return;
619        }
620        if level >= 2 {
621            unsafe { translate_to_ssse3_table(src, dst, table) };
622            return;
623        }
624    }
625    translate_to_scalar(src, dst, table);
626}
627
628/// Scalar fallback for translate_to.
629#[cfg(not(target_arch = "aarch64"))]
630#[inline(always)]
631fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
632    unsafe {
633        let sp = src.as_ptr();
634        let dp = dst.as_mut_ptr();
635        let len = src.len();
636        let mut i = 0;
637        while i + 8 <= len {
638            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
639            *dp.add(i + 1) = *table.get_unchecked(*sp.add(i + 1) as usize);
640            *dp.add(i + 2) = *table.get_unchecked(*sp.add(i + 2) as usize);
641            *dp.add(i + 3) = *table.get_unchecked(*sp.add(i + 3) as usize);
642            *dp.add(i + 4) = *table.get_unchecked(*sp.add(i + 4) as usize);
643            *dp.add(i + 5) = *table.get_unchecked(*sp.add(i + 5) as usize);
644            *dp.add(i + 6) = *table.get_unchecked(*sp.add(i + 6) as usize);
645            *dp.add(i + 7) = *table.get_unchecked(*sp.add(i + 7) as usize);
646            i += 8;
647        }
648        while i < len {
649            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
650            i += 1;
651        }
652    }
653}
654
655/// ARM64 NEON table-lookup translate_to using nibble decomposition.
656#[cfg(target_arch = "aarch64")]
657#[inline(always)]
658fn translate_to_scalar(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
659    unsafe { translate_to_neon_table(src, dst, table) };
660}
661
662#[cfg(target_arch = "aarch64")]
663#[target_feature(enable = "neon")]
664unsafe fn translate_to_neon_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
665    use std::arch::aarch64::*;
666
667    unsafe {
668        let len = src.len();
669        let sp = src.as_ptr();
670        let dp = dst.as_mut_ptr();
671
672        let mut lut: [uint8x16_t; 16] = [vdupq_n_u8(0); 16];
673        for h in 0u8..16 {
674            lut[h as usize] = vld1q_u8(table.as_ptr().add((h as usize) * 16));
675        }
676
677        let lo_mask = vdupq_n_u8(0x0F);
678        let mut i = 0;
679
680        while i + 16 <= len {
681            let input = vld1q_u8(sp.add(i));
682            let lo_nibble = vandq_u8(input, lo_mask);
683            let hi_nibble = vandq_u8(vshrq_n_u8(input, 4), lo_mask);
684
685            let mut result = vdupq_n_u8(0);
686            macro_rules! do_nibble {
687                ($h:expr) => {
688                    let h_val = vdupq_n_u8($h);
689                    let mask = vceqq_u8(hi_nibble, h_val);
690                    let looked_up = vqtbl1q_u8(lut[$h as usize], lo_nibble);
691                    result = vorrq_u8(result, vandq_u8(mask, looked_up));
692                };
693            }
694            do_nibble!(0);
695            do_nibble!(1);
696            do_nibble!(2);
697            do_nibble!(3);
698            do_nibble!(4);
699            do_nibble!(5);
700            do_nibble!(6);
701            do_nibble!(7);
702            do_nibble!(8);
703            do_nibble!(9);
704            do_nibble!(10);
705            do_nibble!(11);
706            do_nibble!(12);
707            do_nibble!(13);
708            do_nibble!(14);
709            do_nibble!(15);
710
711            vst1q_u8(dp.add(i), result);
712            i += 16;
713        }
714
715        while i < len {
716            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
717            i += 1;
718        }
719    }
720}
721
722#[cfg(target_arch = "x86_64")]
723#[target_feature(enable = "avx2")]
724unsafe fn translate_to_avx2_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
725    use std::arch::x86_64::*;
726
727    unsafe {
728        let len = src.len();
729        let sp = src.as_ptr();
730        let dp = dst.as_mut_ptr();
731
732        // Pre-build 16 lookup vectors
733        let mut lut = [_mm256_setzero_si256(); 16];
734        for h in 0u8..16 {
735            let base = (h as usize) * 16;
736            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
737            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
738            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
739        }
740
741        let lo_mask = _mm256_set1_epi8(0x0F);
742
743        let mut i = 0;
744
745        // 2x unrolled: process 64 bytes per iteration for better ILP
746        while i + 64 <= len {
747            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
748            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
749
750            let lo0 = _mm256_and_si256(input0, lo_mask);
751            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
752            let lo1 = _mm256_and_si256(input1, lo_mask);
753            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
754
755            let mut r0 = _mm256_setzero_si256();
756            let mut r1 = _mm256_setzero_si256();
757
758            macro_rules! do_nibble2 {
759                ($h:expr) => {
760                    let h_val = _mm256_set1_epi8($h as i8);
761                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
762                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
763                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
764                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
765                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
766                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
767                };
768            }
769            do_nibble2!(0);
770            do_nibble2!(1);
771            do_nibble2!(2);
772            do_nibble2!(3);
773            do_nibble2!(4);
774            do_nibble2!(5);
775            do_nibble2!(6);
776            do_nibble2!(7);
777            do_nibble2!(8);
778            do_nibble2!(9);
779            do_nibble2!(10);
780            do_nibble2!(11);
781            do_nibble2!(12);
782            do_nibble2!(13);
783            do_nibble2!(14);
784            do_nibble2!(15);
785
786            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
787            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
788            i += 64;
789        }
790
791        // Remaining 32-byte chunk
792        if i + 32 <= len {
793            let input = _mm256_loadu_si256(sp.add(i) as *const _);
794            let lo_nibble = _mm256_and_si256(input, lo_mask);
795            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
796
797            let mut result = _mm256_setzero_si256();
798
799            macro_rules! do_nibble {
800                ($h:expr) => {
801                    let h_val = _mm256_set1_epi8($h as i8);
802                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
803                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
804                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
805                };
806            }
807            do_nibble!(0);
808            do_nibble!(1);
809            do_nibble!(2);
810            do_nibble!(3);
811            do_nibble!(4);
812            do_nibble!(5);
813            do_nibble!(6);
814            do_nibble!(7);
815            do_nibble!(8);
816            do_nibble!(9);
817            do_nibble!(10);
818            do_nibble!(11);
819            do_nibble!(12);
820            do_nibble!(13);
821            do_nibble!(14);
822            do_nibble!(15);
823
824            _mm256_storeu_si256(dp.add(i) as *mut _, result);
825            i += 32;
826        }
827
828        // SSSE3 tail for remaining 16-byte chunk
829        if i + 16 <= len {
830            let lo_mask128 = _mm_set1_epi8(0x0F);
831            let mut lut128 = [_mm_setzero_si128(); 16];
832            for h in 0u8..16 {
833                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
834            }
835
836            let input = _mm_loadu_si128(sp.add(i) as *const _);
837            let lo_nib = _mm_and_si128(input, lo_mask128);
838            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
839
840            let mut res = _mm_setzero_si128();
841            macro_rules! do_nibble128 {
842                ($h:expr) => {
843                    let h_val = _mm_set1_epi8($h as i8);
844                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
845                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
846                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
847                };
848            }
849            do_nibble128!(0);
850            do_nibble128!(1);
851            do_nibble128!(2);
852            do_nibble128!(3);
853            do_nibble128!(4);
854            do_nibble128!(5);
855            do_nibble128!(6);
856            do_nibble128!(7);
857            do_nibble128!(8);
858            do_nibble128!(9);
859            do_nibble128!(10);
860            do_nibble128!(11);
861            do_nibble128!(12);
862            do_nibble128!(13);
863            do_nibble128!(14);
864            do_nibble128!(15);
865
866            _mm_storeu_si128(dp.add(i) as *mut _, res);
867            i += 16;
868        }
869
870        // Scalar tail
871        while i < len {
872            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
873            i += 1;
874        }
875    }
876}
877
878/// Nontemporal variant of translate_to_avx2_table: uses _mm256_stream_si256 for stores.
879/// Avoids RFO cache traffic for the destination buffer in streaming translate operations.
880#[cfg(target_arch = "x86_64")]
881#[target_feature(enable = "avx2")]
882unsafe fn translate_to_avx2_table_nt(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
883    use std::arch::x86_64::*;
884
885    unsafe {
886        let len = src.len();
887        let sp = src.as_ptr();
888        let dp = dst.as_mut_ptr();
889
890        // Pre-build 16 lookup vectors
891        let mut lut = [_mm256_setzero_si256(); 16];
892        for h in 0u8..16 {
893            let base = (h as usize) * 16;
894            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
895            let row128 = _mm_loadu_si128(row.as_ptr() as *const _);
896            lut[h as usize] = _mm256_broadcastsi128_si256(row128);
897        }
898
899        let lo_mask = _mm256_set1_epi8(0x0F);
900        let mut i = 0;
901
902        // 2x unrolled with nontemporal stores
903        while i + 64 <= len {
904            let input0 = _mm256_loadu_si256(sp.add(i) as *const _);
905            let input1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
906
907            let lo0 = _mm256_and_si256(input0, lo_mask);
908            let hi0 = _mm256_and_si256(_mm256_srli_epi16(input0, 4), lo_mask);
909            let lo1 = _mm256_and_si256(input1, lo_mask);
910            let hi1 = _mm256_and_si256(_mm256_srli_epi16(input1, 4), lo_mask);
911
912            let mut r0 = _mm256_setzero_si256();
913            let mut r1 = _mm256_setzero_si256();
914
915            macro_rules! do_nibble2 {
916                ($h:expr) => {
917                    let h_val = _mm256_set1_epi8($h as i8);
918                    let m0 = _mm256_cmpeq_epi8(hi0, h_val);
919                    let l0 = _mm256_shuffle_epi8(lut[$h], lo0);
920                    r0 = _mm256_or_si256(r0, _mm256_and_si256(m0, l0));
921                    let m1 = _mm256_cmpeq_epi8(hi1, h_val);
922                    let l1 = _mm256_shuffle_epi8(lut[$h], lo1);
923                    r1 = _mm256_or_si256(r1, _mm256_and_si256(m1, l1));
924                };
925            }
926            do_nibble2!(0);
927            do_nibble2!(1);
928            do_nibble2!(2);
929            do_nibble2!(3);
930            do_nibble2!(4);
931            do_nibble2!(5);
932            do_nibble2!(6);
933            do_nibble2!(7);
934            do_nibble2!(8);
935            do_nibble2!(9);
936            do_nibble2!(10);
937            do_nibble2!(11);
938            do_nibble2!(12);
939            do_nibble2!(13);
940            do_nibble2!(14);
941            do_nibble2!(15);
942
943            _mm256_stream_si256(dp.add(i) as *mut _, r0);
944            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
945            i += 64;
946        }
947
948        // Remaining 32-byte chunk
949        if i + 32 <= len {
950            let input = _mm256_loadu_si256(sp.add(i) as *const _);
951            let lo_nibble = _mm256_and_si256(input, lo_mask);
952            let hi_nibble = _mm256_and_si256(_mm256_srli_epi16(input, 4), lo_mask);
953
954            let mut result = _mm256_setzero_si256();
955            macro_rules! do_nibble {
956                ($h:expr) => {
957                    let h_val = _mm256_set1_epi8($h as i8);
958                    let mask = _mm256_cmpeq_epi8(hi_nibble, h_val);
959                    let looked_up = _mm256_shuffle_epi8(lut[$h], lo_nibble);
960                    result = _mm256_or_si256(result, _mm256_and_si256(mask, looked_up));
961                };
962            }
963            do_nibble!(0);
964            do_nibble!(1);
965            do_nibble!(2);
966            do_nibble!(3);
967            do_nibble!(4);
968            do_nibble!(5);
969            do_nibble!(6);
970            do_nibble!(7);
971            do_nibble!(8);
972            do_nibble!(9);
973            do_nibble!(10);
974            do_nibble!(11);
975            do_nibble!(12);
976            do_nibble!(13);
977            do_nibble!(14);
978            do_nibble!(15);
979
980            _mm256_stream_si256(dp.add(i) as *mut _, result);
981            i += 32;
982        }
983
984        // SSSE3 tail for remaining 16-byte chunk (regular store)
985        if i + 16 <= len {
986            let lo_mask128 = _mm_set1_epi8(0x0F);
987            let mut lut128 = [_mm_setzero_si128(); 16];
988            for h in 0u8..16 {
989                lut128[h as usize] = _mm256_castsi256_si128(lut[h as usize]);
990            }
991
992            let input = _mm_loadu_si128(sp.add(i) as *const _);
993            let lo_nib = _mm_and_si128(input, lo_mask128);
994            let hi_nib = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask128);
995
996            let mut res = _mm_setzero_si128();
997            macro_rules! do_nibble128 {
998                ($h:expr) => {
999                    let h_val = _mm_set1_epi8($h as i8);
1000                    let mask = _mm_cmpeq_epi8(hi_nib, h_val);
1001                    let looked_up = _mm_shuffle_epi8(lut128[$h], lo_nib);
1002                    res = _mm_or_si128(res, _mm_and_si128(mask, looked_up));
1003                };
1004            }
1005            do_nibble128!(0);
1006            do_nibble128!(1);
1007            do_nibble128!(2);
1008            do_nibble128!(3);
1009            do_nibble128!(4);
1010            do_nibble128!(5);
1011            do_nibble128!(6);
1012            do_nibble128!(7);
1013            do_nibble128!(8);
1014            do_nibble128!(9);
1015            do_nibble128!(10);
1016            do_nibble128!(11);
1017            do_nibble128!(12);
1018            do_nibble128!(13);
1019            do_nibble128!(14);
1020            do_nibble128!(15);
1021
1022            _mm_storeu_si128(dp.add(i) as *mut _, res);
1023            i += 16;
1024        }
1025
1026        // Scalar tail
1027        while i < len {
1028            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1029            i += 1;
1030        }
1031
1032        // Fence: ensure nontemporal stores are visible before write() syscall
1033        _mm_sfence();
1034    }
1035}
1036
1037#[cfg(target_arch = "x86_64")]
1038#[target_feature(enable = "ssse3")]
1039unsafe fn translate_to_ssse3_table(src: &[u8], dst: &mut [u8], table: &[u8; 256]) {
1040    use std::arch::x86_64::*;
1041
1042    unsafe {
1043        let len = src.len();
1044        let sp = src.as_ptr();
1045        let dp = dst.as_mut_ptr();
1046
1047        let mut lut = [_mm_setzero_si128(); 16];
1048        for h in 0u8..16 {
1049            let base = (h as usize) * 16;
1050            let row: [u8; 16] = std::array::from_fn(|i| *table.get_unchecked(base + i));
1051            lut[h as usize] = _mm_loadu_si128(row.as_ptr() as *const _);
1052        }
1053
1054        let lo_mask = _mm_set1_epi8(0x0F);
1055
1056        let mut i = 0;
1057        while i + 16 <= len {
1058            let input = _mm_loadu_si128(sp.add(i) as *const _);
1059            let lo_nibble = _mm_and_si128(input, lo_mask);
1060            let hi_nibble = _mm_and_si128(_mm_srli_epi16(input, 4), lo_mask);
1061
1062            let mut result = _mm_setzero_si128();
1063
1064            macro_rules! do_nibble {
1065                ($h:expr) => {
1066                    let h_val = _mm_set1_epi8($h as i8);
1067                    let mask = _mm_cmpeq_epi8(hi_nibble, h_val);
1068                    let looked_up = _mm_shuffle_epi8(lut[$h], lo_nibble);
1069                    result = _mm_or_si128(result, _mm_and_si128(mask, looked_up));
1070                };
1071            }
1072            do_nibble!(0);
1073            do_nibble!(1);
1074            do_nibble!(2);
1075            do_nibble!(3);
1076            do_nibble!(4);
1077            do_nibble!(5);
1078            do_nibble!(6);
1079            do_nibble!(7);
1080            do_nibble!(8);
1081            do_nibble!(9);
1082            do_nibble!(10);
1083            do_nibble!(11);
1084            do_nibble!(12);
1085            do_nibble!(13);
1086            do_nibble!(14);
1087            do_nibble!(15);
1088
1089            _mm_storeu_si128(dp.add(i) as *mut _, result);
1090            i += 16;
1091        }
1092
1093        // Scalar tail
1094        while i < len {
1095            *dp.add(i) = *table.get_unchecked(*sp.add(i) as usize);
1096            i += 1;
1097        }
1098    }
1099}
1100
1101// ============================================================================
1102// SIMD range translation (x86_64)
1103// ============================================================================
1104
1105/// Detect if the translate table is a single contiguous range with constant offset.
1106/// Returns Some((lo, hi, offset)) if all non-identity entries form [lo..=hi] with
1107/// table[i] = i + offset for all i in [lo, hi].
1108#[inline]
1109fn detect_range_offset(table: &[u8; 256]) -> Option<(u8, u8, i8)> {
1110    let mut lo: Option<u8> = None;
1111    let mut hi = 0u8;
1112    let mut offset = 0i16;
1113
1114    for i in 0..256 {
1115        if table[i] != i as u8 {
1116            let diff = table[i] as i16 - i as i16;
1117            match lo {
1118                None => {
1119                    lo = Some(i as u8);
1120                    hi = i as u8;
1121                    offset = diff;
1122                }
1123                Some(_) => {
1124                    if diff != offset || i as u8 != hi.wrapping_add(1) {
1125                        return None;
1126                    }
1127                    hi = i as u8;
1128                }
1129            }
1130        }
1131    }
1132
1133    lo.map(|l| (l, hi, offset as i8))
1134}
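// Example of the pattern detect_range_offset recognises: a lowercase-to-uppercase
// table is exactly one contiguous range shifted by a constant offset.
#[cfg(test)]
mod range_offset_example {
    use super::{build_translate_table, detect_range_offset};

    #[test]
    fn lowercase_to_uppercase_is_one_range() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let table = build_translate_table(&set1, &set2);
        assert_eq!(detect_range_offset(&table), Some((b'a', b'z', -32)));
    }
}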
1135
1136/// Detect if the translate table maps a contiguous range [lo..=hi] to a single constant byte,
1137/// and all other bytes are identity. This covers cases like `tr '\000-\037' 'X'` where
1138/// a range maps to one replacement character.
1139/// Returns Some((lo, hi, replacement)) if the pattern matches.
1140#[inline]
1141fn detect_range_to_constant(table: &[u8; 256]) -> Option<(u8, u8, u8)> {
1142    let mut lo: Option<u8> = None;
1143    let mut hi = 0u8;
1144    let mut replacement = 0u8;
1145
1146    for i in 0..256 {
1147        if table[i] != i as u8 {
1148            match lo {
1149                None => {
1150                    lo = Some(i as u8);
1151                    hi = i as u8;
1152                    replacement = table[i];
1153                }
1154                Some(_) => {
1155                    if table[i] != replacement || i as u8 != hi.wrapping_add(1) {
1156                        return None;
1157                    }
1158                    hi = i as u8;
1159                }
1160            }
1161        }
1162    }
1163
1164    lo.map(|l| (l, hi, replacement))
1165}
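// Companion example for detect_range_to_constant: collapsing every control
// byte to a single replacement (as in `tr '\000-\037' 'X'`) yields one triple.
#[cfg(test)]
mod range_to_constant_example {
    use super::{build_translate_table, detect_range_to_constant};

    #[test]
    fn control_bytes_to_single_replacement() {
        let set1: Vec<u8> = (0u8..=0x1F).collect();
        let table = build_translate_table(&set1, b"X");
        assert_eq!(detect_range_to_constant(&table), Some((0, 0x1F, b'X')));
    }
}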
1166
1167/// SIMD-accelerated range-to-constant translation.
1168/// For tables where a contiguous range [lo..=hi] maps to a single byte, and all
1169/// other bytes are identity. Uses vectorized range check + blend (5 SIMD ops per
1170/// 32 bytes with AVX2, vs 48 for general nibble decomposition).
1171#[cfg(target_arch = "x86_64")]
1172fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1173    if get_simd_level() >= 3 {
1174        unsafe { translate_range_to_constant_avx2_inplace(data, lo, hi, replacement) };
1175    } else {
1176        unsafe { translate_range_to_constant_sse2_inplace(data, lo, hi, replacement) };
1177    }
1178}
1179
1180#[cfg(target_arch = "x86_64")]
1181#[target_feature(enable = "avx2")]
1182unsafe fn translate_range_to_constant_avx2_inplace(
1183    data: &mut [u8],
1184    lo: u8,
1185    hi: u8,
1186    replacement: u8,
1187) {
1188    use std::arch::x86_64::*;
1189
1190    unsafe {
1191        let range = hi - lo;
1192        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1193        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1194        let repl_v = _mm256_set1_epi8(replacement as i8);
1195        let zero = _mm256_setzero_si256();
1196
1197        let len = data.len();
1198        let ptr = data.as_mut_ptr();
1199        let mut i = 0;
1200
1201        // 2x unrolled: process 64 bytes per iteration for better ILP
1202        while i + 64 <= len {
1203            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1204            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1205            let bi0 = _mm256_add_epi8(in0, bias_v);
1206            let bi1 = _mm256_add_epi8(in1, bias_v);
1207            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1208            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1209            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1210            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1211            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1212            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1213            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1214            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1215            i += 64;
1216        }
1217
1218        // Remaining 32-byte chunk
1219        if i + 32 <= len {
1220            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1221            let biased = _mm256_add_epi8(input, bias_v);
1222            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1223            let in_range = _mm256_cmpeq_epi8(gt, zero);
1224            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1225            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1226            i += 32;
1227        }
1228
1229        if i + 16 <= len {
1230            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1231            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1232            let repl_v128 = _mm_set1_epi8(replacement as i8);
1233            let zero128 = _mm_setzero_si128();
1234
1235            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1236            let biased = _mm_add_epi8(input, bias_v128);
1237            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1238            let in_range = _mm_cmpeq_epi8(gt, zero128);
1239            let result = _mm_blendv_epi8(input, repl_v128, in_range);
1240            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1241            i += 16;
1242        }
1243
1244        while i < len {
1245            let b = *ptr.add(i);
1246            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1247            i += 1;
1248        }
1249    }
1250}
1251
1252#[cfg(target_arch = "x86_64")]
1253#[target_feature(enable = "sse2")]
1254unsafe fn translate_range_to_constant_sse2_inplace(
1255    data: &mut [u8],
1256    lo: u8,
1257    hi: u8,
1258    replacement: u8,
1259) {
1260    use std::arch::x86_64::*;
1261
1262    unsafe {
1263        let range = hi - lo;
1264        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1265        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1266        let repl_v = _mm_set1_epi8(replacement as i8);
1267        let zero = _mm_setzero_si128();
1268
1269        let len = data.len();
1270        let ptr = data.as_mut_ptr();
1271        let mut i = 0;
1272
1273        while i + 16 <= len {
1274            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1275            let biased = _mm_add_epi8(input, bias_v);
1276            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1277            // in_range mask: 0xFF where in range, 0x00 where not
1278            let in_range = _mm_cmpeq_epi8(gt, zero);
1279            // SSE2 blendv: (repl & mask) | (input & ~mask)
1280            let result = _mm_or_si128(
1281                _mm_and_si128(in_range, repl_v),
1282                _mm_andnot_si128(in_range, input),
1283            );
1284            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1285            i += 16;
1286        }
1287
1288        while i < len {
1289            let b = *ptr.add(i);
1290            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1291            i += 1;
1292        }
1293    }
1294}
1295
1296#[cfg(target_arch = "aarch64")]
1297fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1298    unsafe { translate_range_to_constant_neon_inplace(data, lo, hi, replacement) };
1299}
1300
1301#[cfg(target_arch = "aarch64")]
1302#[target_feature(enable = "neon")]
1303unsafe fn translate_range_to_constant_neon_inplace(
1304    data: &mut [u8],
1305    lo: u8,
1306    hi: u8,
1307    replacement: u8,
1308) {
1309    use std::arch::aarch64::*;
1310
1311    unsafe {
1312        let len = data.len();
1313        let ptr = data.as_mut_ptr();
1314        let lo_v = vdupq_n_u8(lo);
1315        let hi_v = vdupq_n_u8(hi);
1316        let repl_v = vdupq_n_u8(replacement);
1317        let mut i = 0;
1318
1319        while i + 32 <= len {
1320            let in0 = vld1q_u8(ptr.add(i));
1321            let in1 = vld1q_u8(ptr.add(i + 16));
1322            let ge0 = vcgeq_u8(in0, lo_v);
1323            let le0 = vcleq_u8(in0, hi_v);
1324            let mask0 = vandq_u8(ge0, le0);
1325            let ge1 = vcgeq_u8(in1, lo_v);
1326            let le1 = vcleq_u8(in1, hi_v);
1327            let mask1 = vandq_u8(ge1, le1);
1328            // bsl: select repl where mask, keep input where not
1329            vst1q_u8(ptr.add(i), vbslq_u8(mask0, repl_v, in0));
1330            vst1q_u8(ptr.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1331            i += 32;
1332        }
1333
1334        if i + 16 <= len {
1335            let input = vld1q_u8(ptr.add(i));
1336            let ge = vcgeq_u8(input, lo_v);
1337            let le = vcleq_u8(input, hi_v);
1338            let mask = vandq_u8(ge, le);
1339            vst1q_u8(ptr.add(i), vbslq_u8(mask, repl_v, input));
1340            i += 16;
1341        }
1342
1343        while i < len {
1344            let b = *ptr.add(i);
1345            *ptr.add(i) = if b >= lo && b <= hi { replacement } else { b };
1346            i += 1;
1347        }
1348    }
1349}
1350
1351#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1352fn translate_range_to_constant_simd_inplace(data: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1353    for b in data.iter_mut() {
1354        if *b >= lo && *b <= hi {
1355            *b = replacement;
1356        }
1357    }
1358}
1359
1360/// SIMD range-to-constant translation from src to dst (no intermediate copy needed).
1361/// Reads from src, writes translated result to dst in a single pass.
1362#[cfg(target_arch = "x86_64")]
1363fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1364    if get_simd_level() >= 3 {
1365        unsafe { translate_range_to_constant_avx2(src, dst, lo, hi, replacement) };
1366    } else {
1367        unsafe { translate_range_to_constant_sse2(src, dst, lo, hi, replacement) };
1368    }
1369}
1370
1371#[cfg(target_arch = "aarch64")]
1372fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1373    unsafe { translate_range_to_constant_neon(src, dst, lo, hi, replacement) };
1374}
1375
1376#[cfg(target_arch = "aarch64")]
1377#[target_feature(enable = "neon")]
1378unsafe fn translate_range_to_constant_neon(
1379    src: &[u8],
1380    dst: &mut [u8],
1381    lo: u8,
1382    hi: u8,
1383    replacement: u8,
1384) {
1385    use std::arch::aarch64::*;
1386
1387    unsafe {
1388        let len = src.len();
1389        let sp = src.as_ptr();
1390        let dp = dst.as_mut_ptr();
1391        let lo_v = vdupq_n_u8(lo);
1392        let hi_v = vdupq_n_u8(hi);
1393        let repl_v = vdupq_n_u8(replacement);
1394        let mut i = 0;
1395
1396        while i + 32 <= len {
1397            let in0 = vld1q_u8(sp.add(i));
1398            let in1 = vld1q_u8(sp.add(i + 16));
1399            let mask0 = vandq_u8(vcgeq_u8(in0, lo_v), vcleq_u8(in0, hi_v));
1400            let mask1 = vandq_u8(vcgeq_u8(in1, lo_v), vcleq_u8(in1, hi_v));
1401            vst1q_u8(dp.add(i), vbslq_u8(mask0, repl_v, in0));
1402            vst1q_u8(dp.add(i + 16), vbslq_u8(mask1, repl_v, in1));
1403            i += 32;
1404        }
1405
1406        if i + 16 <= len {
1407            let input = vld1q_u8(sp.add(i));
1408            let mask = vandq_u8(vcgeq_u8(input, lo_v), vcleq_u8(input, hi_v));
1409            vst1q_u8(dp.add(i), vbslq_u8(mask, repl_v, input));
1410            i += 16;
1411        }
1412
1413        while i < len {
1414            let b = *sp.add(i);
1415            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1416            i += 1;
1417        }
1418    }
1419}
1420
1421#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1422fn translate_range_to_constant_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, replacement: u8) {
1423    for (i, &b) in src.iter().enumerate() {
1424        unsafe {
1425            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi { replacement } else { b };
1426        }
1427    }
1428}
1429
1430#[cfg(target_arch = "x86_64")]
1431#[target_feature(enable = "avx2")]
1432unsafe fn translate_range_to_constant_avx2(
1433    src: &[u8],
1434    dst: &mut [u8],
1435    lo: u8,
1436    hi: u8,
1437    replacement: u8,
1438) {
1439    use std::arch::x86_64::*;
1440    unsafe {
1441        let range = hi - lo;
1442        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1443        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1444        let repl_v = _mm256_set1_epi8(replacement as i8);
1445        let zero = _mm256_setzero_si256();
1446        let len = src.len();
1447        let sp = src.as_ptr();
1448        let dp = dst.as_mut_ptr();
1449        let mut i = 0;
1450        while i + 64 <= len {
1451            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1452            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1453            let bi0 = _mm256_add_epi8(in0, bias_v);
1454            let bi1 = _mm256_add_epi8(in1, bias_v);
1455            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1456            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1457            let ir0 = _mm256_cmpeq_epi8(gt0, zero);
1458            let ir1 = _mm256_cmpeq_epi8(gt1, zero);
1459            let r0 = _mm256_blendv_epi8(in0, repl_v, ir0);
1460            let r1 = _mm256_blendv_epi8(in1, repl_v, ir1);
1461            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1462            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1463            i += 64;
1464        }
1465        if i + 32 <= len {
1466            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1467            let biased = _mm256_add_epi8(input, bias_v);
1468            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1469            let in_range = _mm256_cmpeq_epi8(gt, zero);
1470            let result = _mm256_blendv_epi8(input, repl_v, in_range);
1471            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1472            i += 32;
1473        }
1474        while i < len {
1475            let b = *sp.add(i);
1476            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1477            i += 1;
1478        }
1479    }
1480}
1481
1482#[cfg(target_arch = "x86_64")]
1483#[target_feature(enable = "sse2")]
1484unsafe fn translate_range_to_constant_sse2(
1485    src: &[u8],
1486    dst: &mut [u8],
1487    lo: u8,
1488    hi: u8,
1489    replacement: u8,
1490) {
1491    use std::arch::x86_64::*;
1492    unsafe {
1493        let range = hi - lo;
1494        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1495        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1496        let repl_v = _mm_set1_epi8(replacement as i8);
1497        let zero = _mm_setzero_si128();
1498        let len = src.len();
1499        let sp = src.as_ptr();
1500        let dp = dst.as_mut_ptr();
1501        let mut i = 0;
1502        while i + 16 <= len {
1503            let input = _mm_loadu_si128(sp.add(i) as *const _);
1504            let biased = _mm_add_epi8(input, bias_v);
1505            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1506            let in_range = _mm_cmpeq_epi8(gt, zero);
1507            let result = _mm_or_si128(
1508                _mm_and_si128(in_range, repl_v),
1509                _mm_andnot_si128(in_range, input),
1510            );
1511            _mm_storeu_si128(dp.add(i) as *mut _, result);
1512            i += 16;
1513        }
1514        while i < len {
1515            let b = *sp.add(i);
1516            *dp.add(i) = if b >= lo && b <= hi { replacement } else { b };
1517            i += 1;
1518        }
1519    }
1520}
1521
1522/// SIMD-accelerated range translation for mmap'd data.
1523/// For tables where only a contiguous range [lo..=hi] is translated by a constant offset,
1524/// uses AVX2 (32 bytes/iter) or SSE2 (16 bytes/iter) vectorized arithmetic.
1525/// When dst is 32-byte aligned (typical for large Vec allocations, which the allocator
1526/// serves via mmap), uses nontemporal stores to bypass the cache, avoiding read-for-ownership
1527/// overhead and reducing memory traffic by ~33% for streaming writes.
1528#[cfg(target_arch = "x86_64")]
1529fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1530    if get_simd_level() >= 3 {
1531        // Use nontemporal stores when dst is 32-byte aligned (typical for large allocs)
1532        if dst.as_ptr() as usize & 31 == 0 {
1533            unsafe { translate_range_avx2_nt(src, dst, lo, hi, offset) };
1534        } else {
1535            unsafe { translate_range_avx2(src, dst, lo, hi, offset) };
1536        }
1537    } else {
1538        unsafe { translate_range_sse2(src, dst, lo, hi, offset) };
1539    }
1540}
1541
1542#[cfg(target_arch = "x86_64")]
1543#[target_feature(enable = "avx2")]
1544unsafe fn translate_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1545    use std::arch::x86_64::*;
1546
1547    unsafe {
1548        let range = hi - lo;
1549        // Bias: shift range so lo maps to -128 (signed min).
1550        // For input in [lo, hi]: biased = input + (0x80 - lo) is in [-128, -128+range].
1551        // For input < lo: biased wraps to large positive (signed), > threshold.
1552        // For input > hi: biased > -128+range, > threshold.
1553        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1554        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1555        let offset_v = _mm256_set1_epi8(offset);
1556        let zero = _mm256_setzero_si256();
1557
1558        let len = src.len();
1559        let sp = src.as_ptr();
1560        let dp = dst.as_mut_ptr();
1561        let mut i = 0;
1562
1563        // 2x unrolled: process 64 bytes per iteration for better ILP.
1564        // Load/compute on the second vector while the first is in-flight.
1565        while i + 64 <= len {
1566            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1567            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1568            let bi0 = _mm256_add_epi8(in0, bias_v);
1569            let bi1 = _mm256_add_epi8(in1, bias_v);
1570            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1571            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1572            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1573            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1574            let om0 = _mm256_and_si256(m0, offset_v);
1575            let om1 = _mm256_and_si256(m1, offset_v);
1576            let r0 = _mm256_add_epi8(in0, om0);
1577            let r1 = _mm256_add_epi8(in1, om1);
1578            _mm256_storeu_si256(dp.add(i) as *mut _, r0);
1579            _mm256_storeu_si256(dp.add(i + 32) as *mut _, r1);
1580            i += 64;
1581        }
1582
1583        // Remaining 32-byte chunk
1584        if i + 32 <= len {
1585            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1586            let biased = _mm256_add_epi8(input, bias_v);
1587            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1588            let mask = _mm256_cmpeq_epi8(gt, zero);
1589            let offset_masked = _mm256_and_si256(mask, offset_v);
1590            let result = _mm256_add_epi8(input, offset_masked);
1591            _mm256_storeu_si256(dp.add(i) as *mut _, result);
1592            i += 32;
1593        }
1594
1595        // SSE2 tail for 16-byte remainder
1596        if i + 16 <= len {
1597            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1598            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1599            let offset_v128 = _mm_set1_epi8(offset);
1600            let zero128 = _mm_setzero_si128();
1601
1602            let input = _mm_loadu_si128(sp.add(i) as *const _);
1603            let biased = _mm_add_epi8(input, bias_v128);
1604            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1605            let mask = _mm_cmpeq_epi8(gt, zero128);
1606            let offset_masked = _mm_and_si128(mask, offset_v128);
1607            let result = _mm_add_epi8(input, offset_masked);
1608            _mm_storeu_si128(dp.add(i) as *mut _, result);
1609            i += 16;
1610        }
1611
1612        // Scalar tail
1613        while i < len {
1614            let b = *sp.add(i);
1615            *dp.add(i) = if b >= lo && b <= hi {
1616                b.wrapping_add(offset as u8)
1617            } else {
1618                b
1619            };
1620            i += 1;
1621        }
1622    }
1623}
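
// Illustrative sketch (not from the original source): the bias + signed-threshold
// trick used by the AVX2/SSE2 kernels above, reduced to scalar form so its equivalence
// with a plain `lo <= b && b <= hi` check can be verified exhaustively. The (lo, hi)
// pairs are arbitrary examples chosen for the test.
#[cfg(test)]
mod bias_threshold_demo {
    #[test]
    fn signed_compare_matches_plain_range_check() {
        for &(lo, hi) in &[(0x61u8, 0x7Au8), (0x30, 0x39), (0x00, 0x1F), (0x00, 0xFF)] {
            let range = hi - lo;
            let bias = 0x80u8.wrapping_sub(lo);
            let threshold = 0x80u8.wrapping_add(range) as i8;
            for b in 0u8..=255 {
                // What each cmpgt_epi8 lane computes: out-of-range iff biased > threshold.
                let out_of_range = (b.wrapping_add(bias) as i8) > threshold;
                assert_eq!(!out_of_range, (lo..=hi).contains(&b));
            }
        }
    }
}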
1624
1625/// Nontemporal variant of translate_range_avx2: uses _mm256_stream_si256 for stores.
1626/// This bypasses the cache for writes, avoiding read-for-ownership (RFO) traffic on
1627/// the destination buffer. For streaming translate (src → dst, dst not read again),
1628/// this reduces memory traffic by ~33% (10MB input: 20MB vs 30MB total traffic).
1629/// Requires dst to be 32-byte aligned; translate_range_simd checks alignment before dispatching here.
1630#[cfg(target_arch = "x86_64")]
1631#[target_feature(enable = "avx2")]
1632unsafe fn translate_range_avx2_nt(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1633    use std::arch::x86_64::*;
1634
1635    unsafe {
1636        let range = hi - lo;
1637        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1638        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1639        let offset_v = _mm256_set1_epi8(offset);
1640        let zero = _mm256_setzero_si256();
1641
1642        let len = src.len();
1643        let sp = src.as_ptr();
1644        let dp = dst.as_mut_ptr();
1645        let mut i = 0;
1646
1647        // 2x unrolled with nontemporal stores
1648        while i + 64 <= len {
1649            let in0 = _mm256_loadu_si256(sp.add(i) as *const _);
1650            let in1 = _mm256_loadu_si256(sp.add(i + 32) as *const _);
1651            let bi0 = _mm256_add_epi8(in0, bias_v);
1652            let bi1 = _mm256_add_epi8(in1, bias_v);
1653            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1654            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1655            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1656            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1657            let om0 = _mm256_and_si256(m0, offset_v);
1658            let om1 = _mm256_and_si256(m1, offset_v);
1659            let r0 = _mm256_add_epi8(in0, om0);
1660            let r1 = _mm256_add_epi8(in1, om1);
1661            _mm256_stream_si256(dp.add(i) as *mut _, r0);
1662            _mm256_stream_si256(dp.add(i + 32) as *mut _, r1);
1663            i += 64;
1664        }
1665
1666        // Remaining 32-byte chunk (still nontemporal if aligned)
1667        if i + 32 <= len {
1668            let input = _mm256_loadu_si256(sp.add(i) as *const _);
1669            let biased = _mm256_add_epi8(input, bias_v);
1670            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1671            let mask = _mm256_cmpeq_epi8(gt, zero);
1672            let offset_masked = _mm256_and_si256(mask, offset_v);
1673            let result = _mm256_add_epi8(input, offset_masked);
1674            _mm256_stream_si256(dp.add(i) as *mut _, result);
1675            i += 32;
1676        }
1677
1678        // SSE2 tail for 16-byte remainder (regular store — only 16 bytes)
1679        if i + 16 <= len {
1680            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1681            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1682            let offset_v128 = _mm_set1_epi8(offset);
1683            let zero128 = _mm_setzero_si128();
1684
1685            let input = _mm_loadu_si128(sp.add(i) as *const _);
1686            let biased = _mm_add_epi8(input, bias_v128);
1687            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1688            let mask = _mm_cmpeq_epi8(gt, zero128);
1689            let offset_masked = _mm_and_si128(mask, offset_v128);
1690            let result = _mm_add_epi8(input, offset_masked);
1691            _mm_storeu_si128(dp.add(i) as *mut _, result);
1692            i += 16;
1693        }
1694
1695        // Scalar tail
1696        while i < len {
1697            let b = *sp.add(i);
1698            *dp.add(i) = if b >= lo && b <= hi {
1699                b.wrapping_add(offset as u8)
1700            } else {
1701                b
1702            };
1703            i += 1;
1704        }
1705
1706        // Fence: ensure nontemporal stores are visible before write() syscall
1707        _mm_sfence();
1708    }
1709}
1710
1711#[cfg(target_arch = "x86_64")]
1712#[target_feature(enable = "sse2")]
1713unsafe fn translate_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1714    use std::arch::x86_64::*;
1715
1716    unsafe {
1717        let range = hi - lo;
1718        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1719        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1720        let offset_v = _mm_set1_epi8(offset);
1721        let zero = _mm_setzero_si128();
1722
1723        let len = src.len();
1724        let mut i = 0;
1725
1726        while i + 16 <= len {
1727            let input = _mm_loadu_si128(src.as_ptr().add(i) as *const _);
1728            let biased = _mm_add_epi8(input, bias_v);
1729            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1730            let mask = _mm_cmpeq_epi8(gt, zero);
1731            let offset_masked = _mm_and_si128(mask, offset_v);
1732            let result = _mm_add_epi8(input, offset_masked);
1733            _mm_storeu_si128(dst.as_mut_ptr().add(i) as *mut _, result);
1734            i += 16;
1735        }
1736
1737        while i < len {
1738            let b = *src.get_unchecked(i);
1739            *dst.get_unchecked_mut(i) = if b >= lo && b <= hi {
1740                b.wrapping_add(offset as u8)
1741            } else {
1742                b
1743            };
1744            i += 1;
1745        }
1746    }
1747}
1748
1749/// ARM64 NEON-accelerated range translation.
1750/// Processes 16 bytes per iteration using vectorized range check + conditional add.
1751#[cfg(target_arch = "aarch64")]
1752fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1753    unsafe { translate_range_neon(src, dst, lo, hi, offset) };
1754}
1755
1756#[cfg(target_arch = "aarch64")]
1757#[target_feature(enable = "neon")]
1758unsafe fn translate_range_neon(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1759    use std::arch::aarch64::*;
1760
1761    unsafe {
1762        let len = src.len();
1763        let sp = src.as_ptr();
1764        let dp = dst.as_mut_ptr();
1765        let lo_v = vdupq_n_u8(lo);
1766        let hi_v = vdupq_n_u8(hi);
1767        let offset_v = vdupq_n_s8(offset);
1768        let mut i = 0;
1769
1770        // 2x unrolled: process 32 bytes per iteration
1771        while i + 32 <= len {
1772            let in0 = vld1q_u8(sp.add(i));
1773            let in1 = vld1q_u8(sp.add(i + 16));
1774            // Range check: (b >= lo) & (b <= hi)
1775            let ge0 = vcgeq_u8(in0, lo_v);
1776            let le0 = vcleq_u8(in0, hi_v);
1777            let mask0 = vandq_u8(ge0, le0);
1778            let ge1 = vcgeq_u8(in1, lo_v);
1779            let le1 = vcleq_u8(in1, hi_v);
1780            let mask1 = vandq_u8(ge1, le1);
1781            // Conditional add: in + (offset & mask)
1782            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
1783            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
1784            let r0 = vaddq_u8(in0, off0);
1785            let r1 = vaddq_u8(in1, off1);
1786            vst1q_u8(dp.add(i), r0);
1787            vst1q_u8(dp.add(i + 16), r1);
1788            i += 32;
1789        }
1790
1791        if i + 16 <= len {
1792            let input = vld1q_u8(sp.add(i));
1793            let ge = vcgeq_u8(input, lo_v);
1794            let le = vcleq_u8(input, hi_v);
1795            let mask = vandq_u8(ge, le);
1796            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
1797            vst1q_u8(dp.add(i), vaddq_u8(input, off));
1798            i += 16;
1799        }
1800
1801        while i < len {
1802            let b = *sp.add(i);
1803            *dp.add(i) = if b >= lo && b <= hi {
1804                b.wrapping_add(offset as u8)
1805            } else {
1806                b
1807            };
1808            i += 1;
1809        }
1810    }
1811}
1812
1813/// Scalar range translation fallback for non-x86_64, non-aarch64 platforms.
1814#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
1815fn translate_range_simd(src: &[u8], dst: &mut [u8], lo: u8, hi: u8, offset: i8) {
1816    let offset_u8 = offset as u8;
1817    let range = hi.wrapping_sub(lo);
1818    unsafe {
1819        let sp = src.as_ptr();
1820        let dp = dst.as_mut_ptr();
1821        let len = src.len();
1822        let mut i = 0;
1823        while i + 8 <= len {
1824            macro_rules! do_byte {
1825                ($off:expr) => {{
1826                    let b = *sp.add(i + $off);
1827                    let in_range = b.wrapping_sub(lo) <= range;
1828                    *dp.add(i + $off) = if in_range {
1829                        b.wrapping_add(offset_u8)
1830                    } else {
1831                        b
1832                    };
1833                }};
1834            }
1835            do_byte!(0);
1836            do_byte!(1);
1837            do_byte!(2);
1838            do_byte!(3);
1839            do_byte!(4);
1840            do_byte!(5);
1841            do_byte!(6);
1842            do_byte!(7);
1843            i += 8;
1844        }
1845        while i < len {
1846            let b = *sp.add(i);
1847            let in_range = b.wrapping_sub(lo) <= range;
1848            *dp.add(i) = if in_range {
1849                b.wrapping_add(offset_u8)
1850            } else {
1851                b
1852            };
1853            i += 1;
1854        }
1855    }
1856}
1857
1858// ============================================================================
1859// In-place SIMD range translation (saves one buffer allocation in streaming)
1860// ============================================================================
1861
1862/// In-place SIMD-accelerated range translation.
1863/// Translates bytes in [lo..=hi] by adding `offset`, leaving others unchanged.
1864/// Operates on the buffer in-place, eliminating the need for a separate output buffer.
1865#[cfg(target_arch = "x86_64")]
1866fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1867    if get_simd_level() >= 3 {
1868        unsafe { translate_range_avx2_inplace(data, lo, hi, offset) };
1869    } else {
1870        unsafe { translate_range_sse2_inplace(data, lo, hi, offset) };
1871    }
1872}
1873
1874#[cfg(target_arch = "x86_64")]
1875#[target_feature(enable = "avx2")]
1876unsafe fn translate_range_avx2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1877    use std::arch::x86_64::*;
1878
1879    unsafe {
1880        let range = hi - lo;
1881        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1882        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
1883        let offset_v = _mm256_set1_epi8(offset);
1884        let zero = _mm256_setzero_si256();
1885
1886        let len = data.len();
1887        let ptr = data.as_mut_ptr();
1888        let mut i = 0;
1889
1890        // 2x unrolled: process 64 bytes per iteration for better ILP
1891        while i + 64 <= len {
1892            let in0 = _mm256_loadu_si256(ptr.add(i) as *const _);
1893            let in1 = _mm256_loadu_si256(ptr.add(i + 32) as *const _);
1894            let bi0 = _mm256_add_epi8(in0, bias_v);
1895            let bi1 = _mm256_add_epi8(in1, bias_v);
1896            let gt0 = _mm256_cmpgt_epi8(bi0, threshold_v);
1897            let gt1 = _mm256_cmpgt_epi8(bi1, threshold_v);
1898            let m0 = _mm256_cmpeq_epi8(gt0, zero);
1899            let m1 = _mm256_cmpeq_epi8(gt1, zero);
1900            let om0 = _mm256_and_si256(m0, offset_v);
1901            let om1 = _mm256_and_si256(m1, offset_v);
1902            let r0 = _mm256_add_epi8(in0, om0);
1903            let r1 = _mm256_add_epi8(in1, om1);
1904            _mm256_storeu_si256(ptr.add(i) as *mut _, r0);
1905            _mm256_storeu_si256(ptr.add(i + 32) as *mut _, r1);
1906            i += 64;
1907        }
1908
1909        // Remaining 32-byte chunk
1910        if i + 32 <= len {
1911            let input = _mm256_loadu_si256(ptr.add(i) as *const _);
1912            let biased = _mm256_add_epi8(input, bias_v);
1913            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
1914            let mask = _mm256_cmpeq_epi8(gt, zero);
1915            let offset_masked = _mm256_and_si256(mask, offset_v);
1916            let result = _mm256_add_epi8(input, offset_masked);
1917            _mm256_storeu_si256(ptr.add(i) as *mut _, result);
1918            i += 32;
1919        }
1920
1921        if i + 16 <= len {
1922            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1923            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1924            let offset_v128 = _mm_set1_epi8(offset);
1925            let zero128 = _mm_setzero_si128();
1926
1927            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1928            let biased = _mm_add_epi8(input, bias_v128);
1929            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
1930            let mask = _mm_cmpeq_epi8(gt, zero128);
1931            let offset_masked = _mm_and_si128(mask, offset_v128);
1932            let result = _mm_add_epi8(input, offset_masked);
1933            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1934            i += 16;
1935        }
1936
1937        while i < len {
1938            let b = *ptr.add(i);
1939            *ptr.add(i) = if b >= lo && b <= hi {
1940                b.wrapping_add(offset as u8)
1941            } else {
1942                b
1943            };
1944            i += 1;
1945        }
1946    }
1947}
1948
1949#[cfg(target_arch = "x86_64")]
1950#[target_feature(enable = "sse2")]
1951unsafe fn translate_range_sse2_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1952    use std::arch::x86_64::*;
1953
1954    unsafe {
1955        let range = hi - lo;
1956        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
1957        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
1958        let offset_v = _mm_set1_epi8(offset);
1959        let zero = _mm_setzero_si128();
1960
1961        let len = data.len();
1962        let ptr = data.as_mut_ptr();
1963        let mut i = 0;
1964
1965        while i + 16 <= len {
1966            let input = _mm_loadu_si128(ptr.add(i) as *const _);
1967            let biased = _mm_add_epi8(input, bias_v);
1968            let gt = _mm_cmpgt_epi8(biased, threshold_v);
1969            let mask = _mm_cmpeq_epi8(gt, zero);
1970            let offset_masked = _mm_and_si128(mask, offset_v);
1971            let result = _mm_add_epi8(input, offset_masked);
1972            _mm_storeu_si128(ptr.add(i) as *mut _, result);
1973            i += 16;
1974        }
1975
1976        while i < len {
1977            let b = *ptr.add(i);
1978            *ptr.add(i) = if b >= lo && b <= hi {
1979                b.wrapping_add(offset as u8)
1980            } else {
1981                b
1982            };
1983            i += 1;
1984        }
1985    }
1986}
1987
1988#[cfg(target_arch = "aarch64")]
1989fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1990    unsafe { translate_range_neon_inplace(data, lo, hi, offset) };
1991}
1992
1993#[cfg(target_arch = "aarch64")]
1994#[target_feature(enable = "neon")]
1995unsafe fn translate_range_neon_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
1996    use std::arch::aarch64::*;
1997
1998    unsafe {
1999        let len = data.len();
2000        let ptr = data.as_mut_ptr();
2001        let lo_v = vdupq_n_u8(lo);
2002        let hi_v = vdupq_n_u8(hi);
2003        let offset_v = vdupq_n_s8(offset);
2004        let mut i = 0;
2005
2006        while i + 32 <= len {
2007            let in0 = vld1q_u8(ptr.add(i));
2008            let in1 = vld1q_u8(ptr.add(i + 16));
2009            let ge0 = vcgeq_u8(in0, lo_v);
2010            let le0 = vcleq_u8(in0, hi_v);
2011            let mask0 = vandq_u8(ge0, le0);
2012            let ge1 = vcgeq_u8(in1, lo_v);
2013            let le1 = vcleq_u8(in1, hi_v);
2014            let mask1 = vandq_u8(ge1, le1);
2015            let off0 = vandq_u8(mask0, vreinterpretq_u8_s8(offset_v));
2016            let off1 = vandq_u8(mask1, vreinterpretq_u8_s8(offset_v));
2017            vst1q_u8(ptr.add(i), vaddq_u8(in0, off0));
2018            vst1q_u8(ptr.add(i + 16), vaddq_u8(in1, off1));
2019            i += 32;
2020        }
2021
2022        if i + 16 <= len {
2023            let input = vld1q_u8(ptr.add(i));
2024            let ge = vcgeq_u8(input, lo_v);
2025            let le = vcleq_u8(input, hi_v);
2026            let mask = vandq_u8(ge, le);
2027            let off = vandq_u8(mask, vreinterpretq_u8_s8(offset_v));
2028            vst1q_u8(ptr.add(i), vaddq_u8(input, off));
2029            i += 16;
2030        }
2031
2032        while i < len {
2033            let b = *ptr.add(i);
2034            if b >= lo && b <= hi {
2035                *ptr.add(i) = b.wrapping_add(offset as u8);
2036            }
2037            i += 1;
2038        }
2039    }
2040}
2041
2042#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
2043fn translate_range_simd_inplace(data: &mut [u8], lo: u8, hi: u8, offset: i8) {
2044    let offset_u8 = offset as u8;
2045    let range = hi.wrapping_sub(lo);
2046    for b in data.iter_mut() {
2047        if b.wrapping_sub(lo) <= range {
2048            *b = b.wrapping_add(offset_u8);
2049        }
2050    }
2051}
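
// Illustrative sketch: whichever backend translate_range_simd_inplace dispatches to,
// the result should match a straightforward scalar map over the same range. The a-z
// uppercase mapping (offset -32) is just an example input.
#[cfg(test)]
mod translate_range_inplace_demo {
    use super::translate_range_simd_inplace;

    #[test]
    fn matches_scalar_reference() {
        let mut data: Vec<u8> = (0u8..=255).cycle().take(777).collect();
        let expected: Vec<u8> = data
            .iter()
            .map(|&b| if (b'a'..=b'z').contains(&b) { b - 32 } else { b })
            .collect();
        translate_range_simd_inplace(&mut data, b'a', b'z', -32);
        assert_eq!(data, expected);
    }
}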
2052
2053// ============================================================================
2054// SIMD range deletion (x86_64)
2055// ============================================================================
2056
2057/// Detect if ALL delete characters form a single contiguous byte range [lo..=hi].
2058/// Returns Some((lo, hi)) if so. This is true for common classes:
2059/// - `[:digit:]` = 0x30..=0x39
2060/// - `a-z` = 0x61..=0x7A
2061/// - `A-Z` = 0x41..=0x5A
2062#[inline]
2063fn detect_delete_range(chars: &[u8]) -> Option<(u8, u8)> {
2064    if chars.is_empty() {
2065        return None;
2066    }
2067    let mut lo = chars[0];
2068    let mut hi = chars[0];
2069    for &c in &chars[1..] {
2070        if c < lo {
2071            lo = c;
2072        }
2073        if c > hi {
2074            hi = c;
2075        }
2076    }
2077    // Check that the span size matches the number of chars (no gaps; assumes `chars` has no duplicates)
2078    // Cast to usize before +1 to avoid u8 overflow when hi=255, lo=0 (range=256)
2079    if (hi as usize - lo as usize + 1) == chars.len() {
2080        Some((lo, hi))
2081    } else {
2082        None
2083    }
2084}
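
// Illustrative sketch: detect_delete_range accepts contiguous sets such as the digits
// 0x30..=0x39 and rejects gapped sets. Note the span-vs-length check above is only exact
// when `chars` contains no duplicate bytes.
#[cfg(test)]
mod detect_delete_range_demo {
    use super::detect_delete_range;

    #[test]
    fn contiguous_vs_gapped_sets() {
        let digits: Vec<u8> = (b'0'..=b'9').collect();
        assert_eq!(detect_delete_range(&digits), Some((b'0', b'9')));
        // 'a', 'b', 'd' spans 4 byte values but has only 3 members: not a contiguous range.
        assert_eq!(detect_delete_range(&[b'a', b'b', b'd']), None);
        assert_eq!(detect_delete_range(&[]), None);
    }
}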
2085
2086/// SIMD-accelerated delete for contiguous byte ranges.
2087/// Uses the same bias+threshold trick as range translate to identify bytes in [lo..=hi],
2088/// then compacts output by skipping matched bytes.
2089#[cfg(target_arch = "x86_64")]
2090fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2091    if get_simd_level() >= 3 {
2092        unsafe { delete_range_avx2(src, dst, lo, hi) }
2093    } else {
2094        unsafe { delete_range_sse2(src, dst, lo, hi) }
2095    }
2096}
2097
2098#[cfg(target_arch = "x86_64")]
2099#[target_feature(enable = "avx2")]
2100unsafe fn delete_range_avx2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2101    use std::arch::x86_64::*;
2102
2103    unsafe {
2104        let range = hi - lo;
2105        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2106        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2107        let zero = _mm256_setzero_si256();
2108
2109        let len = src.len();
2110        let sp = src.as_ptr();
2111        let dp = dst.as_mut_ptr();
2112        let mut ri = 0;
2113        let mut wp = 0;
2114
2115        while ri + 32 <= len {
2116            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
2117            let biased = _mm256_add_epi8(input, bias_v);
2118            // gt = 0xFF where biased > threshold (OUT of range = KEEP)
2119            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2120            // in_range = 0xFF where IN range (to DELETE), 0 where to KEEP
2121            let in_range = _mm256_cmpeq_epi8(gt, zero);
2122            // keep_mask bits: 1 = keep (NOT in range)
2123            let keep_mask = !(_mm256_movemask_epi8(in_range) as u32);
2124
2125            if keep_mask == 0xFFFFFFFF {
2126                // All 32 bytes are kept — bulk copy
2127                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
2128                wp += 32;
2129            } else if keep_mask != 0 {
2130                // Partial keep — per-lane processing with all-keep fast paths.
2131                // For 4% delete rate, ~72% of 8-byte lanes are all-keep even
2132                // within partial 32-byte blocks. The per-lane check avoids
2133                // the LUT compact overhead for these clean lanes.
2134                let m0 = keep_mask as u8;
2135                let m1 = (keep_mask >> 8) as u8;
2136                let m2 = (keep_mask >> 16) as u8;
2137                let m3 = (keep_mask >> 24) as u8;
2138
2139                if m0 == 0xFF {
2140                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2141                } else if m0 != 0 {
2142                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2143                }
2144                let c0 = m0.count_ones() as usize;
2145
2146                if m1 == 0xFF {
2147                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2148                } else if m1 != 0 {
2149                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2150                }
2151                let c1 = m1.count_ones() as usize;
2152
2153                if m2 == 0xFF {
2154                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
2155                } else if m2 != 0 {
2156                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
2157                }
2158                let c2 = m2.count_ones() as usize;
2159
2160                if m3 == 0xFF {
2161                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
2162                } else if m3 != 0 {
2163                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
2164                }
2165                let c3 = m3.count_ones() as usize;
2166                wp += c0 + c1 + c2 + c3;
2167            }
2168            // else: keep_mask == 0 means all bytes deleted, skip entirely
2169            ri += 32;
2170        }
2171
2172        // SSE2 tail for 16-byte remainder
2173        if ri + 16 <= len {
2174            let bias_v128 = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2175            let threshold_v128 = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2176            let zero128 = _mm_setzero_si128();
2177
2178            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2179            let biased = _mm_add_epi8(input, bias_v128);
2180            let gt = _mm_cmpgt_epi8(biased, threshold_v128);
2181            let in_range = _mm_cmpeq_epi8(gt, zero128);
2182            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2183
2184            if keep_mask == 0xFFFF {
2185                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2186                wp += 16;
2187            } else if keep_mask != 0 {
2188                let m0 = keep_mask as u8;
2189                let m1 = (keep_mask >> 8) as u8;
2190                if m0 == 0xFF {
2191                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2192                } else if m0 != 0 {
2193                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
2194                }
2195                let c0 = m0.count_ones() as usize;
2196                if m1 == 0xFF {
2197                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2198                } else if m1 != 0 {
2199                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
2200                }
2201                wp += c0 + m1.count_ones() as usize;
2202            }
2203            ri += 16;
2204        }
2205
2206        // Scalar tail — branchless: always store, advance wp only for kept bytes
2207        while ri < len {
2208            let b = *sp.add(ri);
2209            *dp.add(wp) = b;
2210            wp += (b < lo || b > hi) as usize;
2211            ri += 1;
2212        }
2213
2214        wp
2215    }
2216}
2217
2218/// Compact 8 source bytes into contiguous output bytes using a keep mask.
2219/// Each bit in `mask` indicates whether the corresponding byte should be kept.
2220/// Uses a precomputed LUT: for each 8-bit mask, the LUT stores indices of set bits.
2221/// Always performs 8 unconditional stores (extra stores past popcount are harmless
2222/// since the write pointer only advances by popcount, later lanes overwrite them, and callers keep at least 8 bytes of destination headroom).
2223/// This eliminates the serial tzcnt→blsr dependency chain (~28 cycles) in favor of
2224/// independent indexed loads and stores (~15 cycles).
2225#[cfg(target_arch = "x86_64")]
2226#[inline(always)]
2227unsafe fn compact_8bytes(src: *const u8, dst: *mut u8, mask: u8) {
2228    unsafe {
2229        let idx = COMPACT_LUT.get_unchecked(mask as usize);
2230        *dst = *src.add(*idx.get_unchecked(0) as usize);
2231        *dst.add(1) = *src.add(*idx.get_unchecked(1) as usize);
2232        *dst.add(2) = *src.add(*idx.get_unchecked(2) as usize);
2233        *dst.add(3) = *src.add(*idx.get_unchecked(3) as usize);
2234        *dst.add(4) = *src.add(*idx.get_unchecked(4) as usize);
2235        *dst.add(5) = *src.add(*idx.get_unchecked(5) as usize);
2236        *dst.add(6) = *src.add(*idx.get_unchecked(6) as usize);
2237        *dst.add(7) = *src.add(*idx.get_unchecked(7) as usize);
2238    }
2239}
2240
2241/// SSSE3 pshufb-based byte compaction. Loads 8 source bytes into an XMM register,
2242/// shuffles kept bytes to the front using COMPACT_LUT + _mm_shuffle_epi8, stores 8 bytes.
2243/// ~4x faster than scalar compact_8bytes: 1 pshufb vs 8 individual indexed byte copies.
2244/// Requires SSSE3; safe to call from AVX2 functions (which imply SSSE3).
2245#[cfg(target_arch = "x86_64")]
2246#[target_feature(enable = "ssse3")]
2247#[inline]
2248unsafe fn compact_8bytes_simd(src: *const u8, dst: *mut u8, mask: u8) {
2249    use std::arch::x86_64::*;
2250    unsafe {
2251        let src_v = _mm_loadl_epi64(src as *const _);
2252        let shuf = _mm_loadl_epi64(COMPACT_LUT.get_unchecked(mask as usize).as_ptr() as *const _);
2253        let out_v = _mm_shuffle_epi8(src_v, shuf);
2254        _mm_storel_epi64(dst as *mut _, out_v);
2255    }
2256}
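
// Illustrative sketch: for any 8-bit keep mask, the scalar LUT compaction and the
// pshufb-based compaction should produce the same kept-byte prefix. x86_64-only and
// guarded by runtime SSSE3 detection; the source bytes are arbitrary test values.
#[cfg(all(test, target_arch = "x86_64"))]
mod compact_demo {
    use super::{compact_8bytes, compact_8bytes_simd};

    #[test]
    fn scalar_and_pshufb_agree() {
        if !is_x86_feature_detected!("ssse3") {
            return;
        }
        let src = [10u8, 11, 12, 13, 14, 15, 16, 17];
        for mask in 0u16..256 {
            let mask = mask as u8;
            let mut a = [0u8; 8];
            let mut b = [0u8; 8];
            unsafe {
                compact_8bytes(src.as_ptr(), a.as_mut_ptr(), mask);
                compact_8bytes_simd(src.as_ptr(), b.as_mut_ptr(), mask);
            }
            let kept = mask.count_ones() as usize;
            assert_eq!(a[..kept], b[..kept]);
        }
    }
}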
2257
2258#[cfg(target_arch = "x86_64")]
2259#[target_feature(enable = "sse2")]
2260unsafe fn delete_range_sse2(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2261    use std::arch::x86_64::*;
2262
2263    unsafe {
2264        let range = hi - lo;
2265        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2266        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
2267        let zero = _mm_setzero_si128();
2268
2269        let len = src.len();
2270        let sp = src.as_ptr();
2271        let dp = dst.as_mut_ptr();
2272        let mut ri = 0;
2273        let mut wp = 0;
2274
2275        while ri + 16 <= len {
2276            let input = _mm_loadu_si128(sp.add(ri) as *const _);
2277            let biased = _mm_add_epi8(input, bias_v);
2278            let gt = _mm_cmpgt_epi8(biased, threshold_v);
2279            let in_range = _mm_cmpeq_epi8(gt, zero);
2280            let keep_mask = !(_mm_movemask_epi8(in_range) as u32) & 0xFFFF;
2281
2282            if keep_mask == 0xFFFF {
2283                // All 16 bytes kept — bulk copy
2284                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 16);
2285                wp += 16;
2286            } else if keep_mask != 0 {
2287                let m0 = keep_mask as u8;
2288                let m1 = (keep_mask >> 8) as u8;
2289                if m0 == 0xFF {
2290                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
2291                } else if m0 != 0 {
2292                    compact_8bytes(sp.add(ri), dp.add(wp), m0);
2293                }
2294                let c0 = m0.count_ones() as usize;
2295                if m1 == 0xFF {
2296                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
2297                } else if m1 != 0 {
2298                    compact_8bytes(sp.add(ri + 8), dp.add(wp + c0), m1);
2299                }
2300                wp += c0 + m1.count_ones() as usize;
2301            }
2302            ri += 16;
2303        }
2304
2305        // Scalar tail — branchless
2306        while ri < len {
2307            let b = *sp.add(ri);
2308            *dp.add(wp) = b;
2309            wp += (b < lo || b > hi) as usize;
2310            ri += 1;
2311        }
2312
2313        wp
2314    }
2315}
2316
2317/// Branchless range delete fallback for non-x86_64 (ARM64, etc.).
2318/// Unconditional store + conditional pointer advance eliminates branch
2319/// mispredictions. Unrolled 8x for better ILP on out-of-order cores.
2320#[cfg(not(target_arch = "x86_64"))]
2321fn delete_range_chunk(src: &[u8], dst: &mut [u8], lo: u8, hi: u8) -> usize {
2322    let len = src.len();
2323    let sp = src.as_ptr();
2324    let dp = dst.as_mut_ptr();
2325    let mut wp: usize = 0;
2326    let mut i: usize = 0;
2327
2328    // Unrolled branchless loop — 8 bytes per iteration
2329    while i + 8 <= len {
2330        unsafe {
2331            let b0 = *sp.add(i);
2332            *dp.add(wp) = b0;
2333            wp += (b0 < lo || b0 > hi) as usize;
2334            let b1 = *sp.add(i + 1);
2335            *dp.add(wp) = b1;
2336            wp += (b1 < lo || b1 > hi) as usize;
2337            let b2 = *sp.add(i + 2);
2338            *dp.add(wp) = b2;
2339            wp += (b2 < lo || b2 > hi) as usize;
2340            let b3 = *sp.add(i + 3);
2341            *dp.add(wp) = b3;
2342            wp += (b3 < lo || b3 > hi) as usize;
2343            let b4 = *sp.add(i + 4);
2344            *dp.add(wp) = b4;
2345            wp += (b4 < lo || b4 > hi) as usize;
2346            let b5 = *sp.add(i + 5);
2347            *dp.add(wp) = b5;
2348            wp += (b5 < lo || b5 > hi) as usize;
2349            let b6 = *sp.add(i + 6);
2350            *dp.add(wp) = b6;
2351            wp += (b6 < lo || b6 > hi) as usize;
2352            let b7 = *sp.add(i + 7);
2353            *dp.add(wp) = b7;
2354            wp += (b7 < lo || b7 > hi) as usize;
2355        }
2356        i += 8;
2357    }
2358
2359    // Scalar tail
2360    while i < len {
2361        unsafe {
2362            let b = *sp.add(i);
2363            *dp.add(wp) = b;
2364            wp += (b < lo || b > hi) as usize;
2365        }
2366        i += 1;
2367    }
2368
2369    wp
2370}
2371
2372/// Streaming delete for contiguous byte ranges using SIMD range detection.
2373/// Uses a 2MB (STREAM_BUF) buffer to reduce syscalls (delete is compute-light, I/O bound).
2374/// Each chunk is compacted in place by delete_range_inplace, so a single buffer suffices
2375/// and the kept bytes go out in one write_all per chunk.
2376fn delete_range_streaming(
2377    lo: u8,
2378    hi: u8,
2379    reader: &mut impl Read,
2380    writer: &mut impl Write,
2381) -> io::Result<()> {
2382    let mut buf = alloc_uninit_vec(STREAM_BUF);
2383    loop {
2384        let n = read_full(reader, &mut buf)?;
2385        if n == 0 {
2386            break;
2387        }
2388        let wp = delete_range_inplace(&mut buf, n, lo, hi);
2389        if wp > 0 {
2390            writer.write_all(&buf[..wp])?;
2391        }
2392    }
2393    Ok(())
2394}
2395
2396/// In-place range delete: AVX2 scan with pshufb compaction when available, otherwise a branchless scalar fallback.
2397/// Uses a single buffer — reads at position ri, writes at position wp (wp <= ri always).
2398#[inline]
2399fn delete_range_inplace(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2400    #[cfg(target_arch = "x86_64")]
2401    {
2402        let level = get_simd_level();
2403        if level >= 3 {
2404            return unsafe { delete_range_inplace_avx2(buf, n, lo, hi) };
2405        }
2406    }
2407    // Scalar fallback: branchless in-place delete
2408    let ptr = buf.as_mut_ptr();
2409    let mut ri = 0;
2410    let mut wp = 0;
2411    unsafe {
2412        while ri + 8 <= n {
2413            let b0 = *ptr.add(ri);
2414            let b1 = *ptr.add(ri + 1);
2415            let b2 = *ptr.add(ri + 2);
2416            let b3 = *ptr.add(ri + 3);
2417            let b4 = *ptr.add(ri + 4);
2418            let b5 = *ptr.add(ri + 5);
2419            let b6 = *ptr.add(ri + 6);
2420            let b7 = *ptr.add(ri + 7);
2421            *ptr.add(wp) = b0;
2422            wp += (b0 < lo || b0 > hi) as usize;
2423            *ptr.add(wp) = b1;
2424            wp += (b1 < lo || b1 > hi) as usize;
2425            *ptr.add(wp) = b2;
2426            wp += (b2 < lo || b2 > hi) as usize;
2427            *ptr.add(wp) = b3;
2428            wp += (b3 < lo || b3 > hi) as usize;
2429            *ptr.add(wp) = b4;
2430            wp += (b4 < lo || b4 > hi) as usize;
2431            *ptr.add(wp) = b5;
2432            wp += (b5 < lo || b5 > hi) as usize;
2433            *ptr.add(wp) = b6;
2434            wp += (b6 < lo || b6 > hi) as usize;
2435            *ptr.add(wp) = b7;
2436            wp += (b7 < lo || b7 > hi) as usize;
2437            ri += 8;
2438        }
2439        while ri < n {
2440            let b = *ptr.add(ri);
2441            *ptr.add(wp) = b;
2442            wp += (b < lo || b > hi) as usize;
2443            ri += 1;
2444        }
2445    }
2446    wp
2447}
2448
2449/// AVX2 in-place range delete: scan 32 bytes at a time, bulk-copy all-keep blocks,
2450/// and compact mixed blocks with pshufb-based 8-byte sub-chunk shuffles.
2451#[cfg(target_arch = "x86_64")]
2452#[target_feature(enable = "avx2")]
2453unsafe fn delete_range_inplace_avx2(buf: &mut [u8], n: usize, lo: u8, hi: u8) -> usize {
2454    use std::arch::x86_64::*;
2455
2456    unsafe {
2457        let range = hi - lo;
2458        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
2459        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
2460        let zero = _mm256_setzero_si256();
2461
2462        let ptr = buf.as_mut_ptr();
2463        let mut ri = 0;
2464        let mut wp = 0;
2465
2466        while ri + 32 <= n {
2467            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
2468            let biased = _mm256_add_epi8(input, bias_v);
2469            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
2470            let in_range = _mm256_cmpeq_epi8(gt, zero);
2471            let del_mask = _mm256_movemask_epi8(in_range) as u32;
2472
2473            if del_mask == 0 {
2474                // All 32 bytes kept
2475                if wp != ri {
2476                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
2477                }
2478                wp += 32;
2479            } else if del_mask != 0xFFFFFFFF {
2480                // Mixed block: pshufb-based 8-byte compaction.
2481                // Process 4 × 8-byte sub-chunks using COMPACT_LUT + pshufb.
2482                // Each sub-chunk: load 8 bytes into register (safe for overlap),
2483                // shuffle kept bytes to front, store. 4 SIMD ops vs 32 scalar.
2484                let keep_mask = !del_mask;
2485                let m0 = keep_mask as u8;
2486                let m1 = (keep_mask >> 8) as u8;
2487                let m2 = (keep_mask >> 16) as u8;
2488                let m3 = (keep_mask >> 24) as u8;
2489
2490                let c0 = m0.count_ones() as usize;
2491                let c1 = m1.count_ones() as usize;
2492                let c2 = m2.count_ones() as usize;
2493                let c3 = m3.count_ones() as usize;
2494
2495                // Sub-chunk 0: bytes 0-7
2496                if m0 == 0xFF {
2497                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 8);
2498                } else if m0 != 0 {
2499                    let src_v = _mm_loadl_epi64(ptr.add(ri) as *const _);
2500                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m0 as usize].as_ptr() as *const _);
2501                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2502                    _mm_storel_epi64(ptr.add(wp) as *mut _, out_v);
2503                }
2504
2505                // Sub-chunk 1: bytes 8-15
2506                if m1 == 0xFF {
2507                    std::ptr::copy(ptr.add(ri + 8), ptr.add(wp + c0), 8);
2508                } else if m1 != 0 {
2509                    let src_v = _mm_loadl_epi64(ptr.add(ri + 8) as *const _);
2510                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m1 as usize].as_ptr() as *const _);
2511                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2512                    _mm_storel_epi64(ptr.add(wp + c0) as *mut _, out_v);
2513                }
2514
2515                // Sub-chunk 2: bytes 16-23
2516                if m2 == 0xFF {
2517                    std::ptr::copy(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
2518                } else if m2 != 0 {
2519                    let src_v = _mm_loadl_epi64(ptr.add(ri + 16) as *const _);
2520                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m2 as usize].as_ptr() as *const _);
2521                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2522                    _mm_storel_epi64(ptr.add(wp + c0 + c1) as *mut _, out_v);
2523                }
2524
2525                // Sub-chunk 3: bytes 24-31
2526                if m3 == 0xFF {
2527                    std::ptr::copy(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
2528                } else if m3 != 0 {
2529                    let src_v = _mm_loadl_epi64(ptr.add(ri + 24) as *const _);
2530                    let shuf = _mm_loadl_epi64(COMPACT_LUT[m3 as usize].as_ptr() as *const _);
2531                    let out_v = _mm_shuffle_epi8(src_v, shuf);
2532                    _mm_storel_epi64(ptr.add(wp + c0 + c1 + c2) as *mut _, out_v);
2533                }
2534
2535                wp += c0 + c1 + c2 + c3;
2536            }
2537            // del_mask == 0xFFFFFFFF: all deleted, skip entirely
2538            ri += 32;
2539        }
2540
2541        // Scalar tail
2542        while ri < n {
2543            let b = *ptr.add(ri);
2544            *ptr.add(wp) = b;
2545            wp += (b < lo || b > hi) as usize;
2546            ri += 1;
2547        }
2548
2549        wp
2550    }
2551}
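
// Illustrative sketch: delete_range_inplace (whichever backend runs) should agree with a
// plain iterator filter over the same [lo..=hi] range; the digit range is just an example.
#[cfg(test)]
mod delete_range_inplace_demo {
    use super::delete_range_inplace;

    #[test]
    fn matches_iterator_filter() {
        let mut data: Vec<u8> = (0u8..=255).cycle().take(1000).collect();
        let expected: Vec<u8> = data
            .iter()
            .copied()
            .filter(|b| !(b'0'..=b'9').contains(b))
            .collect();
        let n = data.len();
        let wp = delete_range_inplace(&mut data, n, b'0', b'9');
        assert_eq!(&data[..wp], &expected[..]);
    }
}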
2552
2553// ============================================================================
2554// Streaming functions (Read + Write)
2555// ============================================================================
2556
2557pub fn translate(
2558    set1: &[u8],
2559    set2: &[u8],
2560    reader: &mut impl Read,
2561    writer: &mut impl Write,
2562) -> io::Result<()> {
2563    let table = build_translate_table(set1, set2);
2564
2565    // Check for identity table — pure passthrough (no transformation needed)
2566    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
2567    if is_identity {
2568        return passthrough_stream(reader, writer);
2569    }
2570
2571    // Try SIMD fast path for constant-offset range translations (in-place, single buffer)
2572    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
2573        return translate_range_stream(lo, hi, offset, reader, writer);
2574    }
2575
2576    // Try SIMD fast path for range-to-constant translations (e.g., '\000-\037' -> 'X').
2577    // Uses blendv (5 SIMD ops/32 bytes) instead of nibble decomposition (48 ops/32 bytes).
2578    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
2579        return translate_range_to_constant_stream(lo, hi, replacement, reader, writer);
2580    }
2581
2582    // General case: IN-PLACE translation on a SINGLE buffer.
2583    // Fill the buffer fully before processing — SIMD translate is trivial
2584    // compared to syscall overhead, so fewer larger write_all() calls win.
2585    let mut buf = alloc_uninit_vec(STREAM_BUF);
2586    loop {
2587        let n = read_full(reader, &mut buf)?;
2588        if n == 0 {
2589            break;
2590        }
2591        translate_and_write_table(&mut buf, n, &table, writer)?;
2592    }
2593    Ok(())
2594}
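
// Illustrative usage sketch for the public translate() entry point. An a-z -> A-Z mapping
// is a constant-offset range, so it should take the SIMD range path; the literal input and
// expected output are arbitrary test data.
#[cfg(test)]
mod translate_usage_demo {
    use super::translate;
    use std::io::Cursor;

    #[test]
    fn uppercases_ascii_letters() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input = Cursor::new(b"hello, world! 123".to_vec());
        let mut out = Vec::new();
        translate(&set1, &set2, &mut input, &mut out).unwrap();
        assert_eq!(out, b"HELLO, WORLD! 123".to_vec());
    }
}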
2595
2596#[inline]
2597fn translate_and_write_table(
2598    buf: &mut [u8],
2599    total: usize,
2600    table: &[u8; 256],
2601    writer: &mut impl Write,
2602) -> io::Result<()> {
2603    translate_inplace(&mut buf[..total], table);
2604    writer.write_all(&buf[..total])
2605}
2606
2607/// Streaming SIMD range translation — single buffer, in-place transform.
2608/// Fills buffer fully before processing — SIMD translate is trivial compared
2609/// to syscall overhead, so fewer larger write_all() calls win.
2610fn translate_range_stream(
2611    lo: u8,
2612    hi: u8,
2613    offset: i8,
2614    reader: &mut impl Read,
2615    writer: &mut impl Write,
2616) -> io::Result<()> {
2617    let mut buf = alloc_uninit_vec(STREAM_BUF);
2618    loop {
2619        let n = read_full(reader, &mut buf)?;
2620        if n == 0 {
2621            break;
2622        }
2623        translate_and_write_range(&mut buf, n, lo, hi, offset, writer)?;
2624    }
2625    Ok(())
2626}
2627
2628#[inline]
2629fn translate_and_write_range(
2630    buf: &mut [u8],
2631    total: usize,
2632    lo: u8,
2633    hi: u8,
2634    offset: i8,
2635    writer: &mut impl Write,
2636) -> io::Result<()> {
2637    translate_range_simd_inplace(&mut buf[..total], lo, hi, offset);
2638    writer.write_all(&buf[..total])
2639}
2640
2641/// Streaming SIMD range-to-constant translation — single buffer, in-place transform.
2642/// Fills buffer fully before processing — fewer syscalls with larger write_all() calls.
2643/// Uses blendv instead of nibble decomposition for ~10x fewer SIMD ops per vector.
2644fn translate_range_to_constant_stream(
2645    lo: u8,
2646    hi: u8,
2647    replacement: u8,
2648    reader: &mut impl Read,
2649    writer: &mut impl Write,
2650) -> io::Result<()> {
2651    let mut buf = alloc_uninit_vec(STREAM_BUF);
2652    loop {
2653        let n = read_full(reader, &mut buf)?;
2654        if n == 0 {
2655            break;
2656        }
2657        translate_and_write_range_const(&mut buf, n, lo, hi, replacement, writer)?;
2658    }
2659    Ok(())
2660}
2661
2662#[inline]
2663fn translate_and_write_range_const(
2664    buf: &mut [u8],
2665    total: usize,
2666    lo: u8,
2667    hi: u8,
2668    replacement: u8,
2669    writer: &mut impl Write,
2670) -> io::Result<()> {
2671    translate_range_to_constant_simd_inplace(&mut buf[..total], lo, hi, replacement);
2672    writer.write_all(&buf[..total])
2673}
2674
2675/// Pure passthrough: copy stdin to stdout without transformation.
2676/// Uses a single 2MB (STREAM_BUF) uninit buffer with direct read/write, no processing overhead.
2677fn passthrough_stream(reader: &mut impl Read, writer: &mut impl Write) -> io::Result<()> {
2678    let mut buf = alloc_uninit_vec(STREAM_BUF);
2679    loop {
2680        let n = read_full(reader, &mut buf)?;
2681        if n == 0 {
2682            break;
2683        }
2684        writer.write_all(&buf[..n])?;
2685    }
2686    Ok(())
2687}
2688
2689/// Single-read for pipelining: process data immediately after first read()
2690/// instead of blocking to fill the entire buffer. This enables cat|ftr
2691/// pipelining: while ftr processes the first chunk, cat continues writing
2692/// to the pipe. For 10MB piped input with 8MB pipe buffer, this saves
2693/// ~0.5-1ms by overlapping cat's final writes with ftr's processing.
2694#[inline]
2695fn read_once(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2696    loop {
2697        match reader.read(buf) {
2698            Ok(n) => return Ok(n),
2699            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2700            Err(e) => return Err(e),
2701        }
2702    }
2703}
2704
2705/// Fill the buffer completely before processing. Unlike read_once which returns
2706/// after a single read() for pipelining, this loops until the buffer is full or
2707/// EOF is reached. Used by translate paths where the SIMD translation cost is
2708/// trivial compared to syscall overhead — fewer, larger write_all() calls win.
2709#[inline]
2710fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
2711    let mut total = 0;
2712    while total < buf.len() {
2713        match reader.read(&mut buf[total..]) {
2714            Ok(0) => break,
2715            Ok(n) => total += n,
2716            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
2717            Err(e) => return Err(e),
2718        }
2719    }
2720    Ok(total)
2721}
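
// Illustrative sketch: read_full keeps calling read() until the buffer is full or EOF, so a
// reader that delivers its data in two short reads still fills the destination in one call.
#[cfg(test)]
mod read_full_demo {
    use super::read_full;
    use std::io::{Cursor, Read};

    #[test]
    fn fills_across_short_reads() {
        let mut reader = Cursor::new(vec![1u8, 2, 3]).chain(Cursor::new(vec![4u8, 5]));
        let mut buf = [0u8; 8];
        let n = read_full(&mut reader, &mut buf).unwrap();
        assert_eq!(n, 5);
        assert_eq!(&buf[..n], &[1, 2, 3, 4, 5]);
    }
}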
2722
2723pub fn translate_squeeze(
2724    set1: &[u8],
2725    set2: &[u8],
2726    reader: &mut impl Read,
2727    writer: &mut impl Write,
2728) -> io::Result<()> {
2729    let table = build_translate_table(set1, set2);
2730    let squeeze_set = build_member_set(set2);
2731
2732    // When set2 reduces to a single squeeze character (length 1, or all bytes equal), use a
2733    // fused approach: translate via SIMD, then use memmem to find the squeeze points.
2734    if set2.len() == 1 || (set2.len() > 1 && set2.iter().all(|&b| b == set2[0])) {
2735        let squeeze_ch = set2.last().copied().unwrap_or(0);
2736        return translate_squeeze_single_ch(&table, squeeze_ch, &squeeze_set, reader, writer);
2737    }
2738
2739    // Two-pass optimization for range translations:
2740    // Pass 1: SIMD range translate in-place (10x faster than scalar table lookup)
2741    // Pass 2: scalar squeeze (inherently sequential due to state dependency)
2742    let range_info = detect_range_offset(&table);
2743    let range_const_info = if range_info.is_none() {
2744        detect_range_to_constant(&table)
2745    } else {
2746        None
2747    };
2748
2749    let mut buf = alloc_uninit_vec(STREAM_BUF);
2750    let mut last_squeezed: u16 = 256;
2751
2752    loop {
2753        let n = read_once(reader, &mut buf)?;
2754        if n == 0 {
2755            break;
2756        }
2757        let wp = translate_squeeze_process(
2758            &mut buf,
2759            n,
2760            &table,
2761            &squeeze_set,
2762            range_info,
2763            range_const_info,
2764            &mut last_squeezed,
2765        );
2766        if wp > 0 {
2767            writer.write_all(&buf[..wp])?;
2768        }
2769    }
2770    Ok(())
2771}
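
// Illustrative usage sketch for translate_squeeze(): with set1 = a-z and set2 = A-Z, repeated
// output letters are squeezed but characters outside set2 (the double space) are kept. The
// literal input/output bytes are arbitrary test data.
#[cfg(test)]
mod translate_squeeze_usage_demo {
    use super::translate_squeeze;
    use std::io::Cursor;

    #[test]
    fn squeezes_translated_repeats() {
        let set1: Vec<u8> = (b'a'..=b'z').collect();
        let set2: Vec<u8> = (b'A'..=b'Z').collect();
        let mut input = Cursor::new(b"hello  world".to_vec());
        let mut out = Vec::new();
        translate_squeeze(&set1, &set2, &mut input, &mut out).unwrap();
        assert_eq!(out, b"HELO  WORLD".to_vec());
    }
}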
2772
2773#[inline]
2774fn translate_squeeze_process(
2775    buf: &mut [u8],
2776    n: usize,
2777    table: &[u8; 256],
2778    squeeze_set: &[u8; 32],
2779    range_info: Option<(u8, u8, i8)>,
2780    range_const_info: Option<(u8, u8, u8)>,
2781    last_squeezed: &mut u16,
2782) -> usize {
2783    // Pass 1: translate
2784    if let Some((lo, hi, offset)) = range_info {
2785        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2786    } else if let Some((lo, hi, replacement)) = range_const_info {
2787        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2788    } else {
2789        translate_inplace(&mut buf[..n], table);
2790    }
2791    // Pass 2: squeeze in-place
2792    let mut wp = 0;
2793    unsafe {
2794        let ptr = buf.as_mut_ptr();
2795        let mut i = 0;
2796        while i + 8 <= n {
2797            macro_rules! squeeze_byte {
2798                ($off:expr) => {
2799                    let b = *ptr.add(i + $off);
2800                    if is_member(squeeze_set, b) {
2801                        if *last_squeezed != b as u16 {
2802                            *last_squeezed = b as u16;
2803                            *ptr.add(wp) = b;
2804                            wp += 1;
2805                        }
2806                    } else {
2807                        *last_squeezed = 256;
2808                        *ptr.add(wp) = b;
2809                        wp += 1;
2810                    }
2811                };
2812            }
2813            squeeze_byte!(0);
2814            squeeze_byte!(1);
2815            squeeze_byte!(2);
2816            squeeze_byte!(3);
2817            squeeze_byte!(4);
2818            squeeze_byte!(5);
2819            squeeze_byte!(6);
2820            squeeze_byte!(7);
2821            i += 8;
2822        }
2823        while i < n {
2824            let b = *ptr.add(i);
2825            if is_member(squeeze_set, b) {
2826                if *last_squeezed == b as u16 {
2827                    i += 1;
2828                    continue;
2829                }
2830                *last_squeezed = b as u16;
2831            } else {
2832                *last_squeezed = 256;
2833            }
2834            *ptr.add(wp) = b;
2835            wp += 1;
2836            i += 1;
2837        }
2838    }
2839    wp
2840}
2841
2842/// Optimized translate+squeeze for single squeeze character.
2843/// After SIMD translation, uses memmem to find consecutive pairs
2844/// and compacts in-place with a single write_all per chunk.
2845fn translate_squeeze_single_ch(
2846    table: &[u8; 256],
2847    squeeze_ch: u8,
2848    _squeeze_set: &[u8; 32],
2849    reader: &mut impl Read,
2850    writer: &mut impl Write,
2851) -> io::Result<()> {
2852    let range_info = detect_range_offset(table);
2853    let range_const_info = if range_info.is_none() {
2854        detect_range_to_constant(table)
2855    } else {
2856        None
2857    };
2858
2859    let pair = [squeeze_ch, squeeze_ch];
2860    let finder = memchr::memmem::Finder::new(&pair);
2861    let mut buf = alloc_uninit_vec(STREAM_BUF);
2862    let mut was_squeeze_char = false;
2863
2864    loop {
2865        let n = read_once(reader, &mut buf)?;
2866        if n == 0 {
2867            break;
2868        }
2869        let wp = translate_squeeze_single_process(
2870            &mut buf,
2871            n,
2872            table,
2873            squeeze_ch,
2874            &finder,
2875            range_info,
2876            range_const_info,
2877            &mut was_squeeze_char,
2878        );
2879        if wp > 0 {
2880            writer.write_all(&buf[..wp])?;
2881        }
2882    }
2883    Ok(())
2884}
2885
2886#[inline]
2887fn translate_squeeze_single_process(
2888    buf: &mut [u8],
2889    n: usize,
2890    table: &[u8; 256],
2891    squeeze_ch: u8,
2892    finder: &memchr::memmem::Finder<'_>,
2893    range_info: Option<(u8, u8, i8)>,
2894    range_const_info: Option<(u8, u8, u8)>,
2895    was_squeeze_char: &mut bool,
2896) -> usize {
2897    // Pass 1: translate in-place
2898    if let Some((lo, hi, offset)) = range_info {
2899        translate_range_simd_inplace(&mut buf[..n], lo, hi, offset);
2900    } else if let Some((lo, hi, replacement)) = range_const_info {
2901        translate_range_to_constant_simd_inplace(&mut buf[..n], lo, hi, replacement);
2902    } else {
2903        translate_inplace(&mut buf[..n], table);
2904    }
2905
2906    // Pass 2: squeeze compaction
2907    let mut i = 0;
2908    if *was_squeeze_char {
2909        while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2910            i += 1;
2911        }
2912        *was_squeeze_char = false;
2913        if i >= n {
2914            *was_squeeze_char = true;
2915            return 0;
2916        }
2917    }
2918
2919    let ptr = buf.as_mut_ptr();
2920    let mut wp = 0usize;
2921
2922    loop {
2923        match finder.find(&buf[i..n]) {
2924            Some(offset) => {
2925                let seg_end = i + offset + 1;
2926                let gap = seg_end - i;
2927                if gap > 0 {
2928                    if wp != i {
2929                        unsafe {
2930                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
2931                        }
2932                    }
2933                    wp += gap;
2934                }
2935                i = seg_end;
2936                while i < n && unsafe { *buf.as_ptr().add(i) } == squeeze_ch {
2937                    i += 1;
2938                }
2939                if i >= n {
2940                    *was_squeeze_char = true;
2941                    break;
2942                }
2943            }
2944            None => {
2945                let rem = n - i;
2946                if rem > 0 {
2947                    if wp != i {
2948                        unsafe {
2949                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
2950                        }
2951                    }
2952                    wp += rem;
2953                }
2954                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == squeeze_ch;
2955                break;
2956            }
2957        }
2958    }
2959    wp
2960}
2961
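/// Delete all occurrences of `delete_chars` from `reader`, writing the result to `writer`.
///
/// Illustrative sketch of the streaming delete (assumes this function is in scope;
/// not compiled as a doc-test):
///
/// ```ignore
/// let mut input: &[u8] = b"hello world";
/// let mut out: Vec<u8> = Vec::new();
/// delete(b"lo", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"he wrd");
/// ```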
2962pub fn delete(
2963    delete_chars: &[u8],
2964    reader: &mut impl Read,
2965    writer: &mut impl Write,
2966) -> io::Result<()> {
2967    if delete_chars.len() == 1 {
2968        return delete_single_streaming(delete_chars[0], reader, writer);
2969    }
2970    if delete_chars.len() <= 3 {
2971        return delete_multi_streaming(delete_chars, reader, writer);
2972    }
2973
2974    // SIMD fast path: if all delete chars form a contiguous range [lo..=hi],
2975    // use vectorized range comparison instead of scalar bitset lookup.
2976    // This covers [:digit:] (0x30-0x39), a-z, A-Z, etc.
2977    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
2978        return delete_range_streaming(lo, hi, reader, writer);
2979    }
2980
2981    let member = build_member_set(delete_chars);
2982    let mut buf = alloc_uninit_vec(STREAM_BUF);
2983    // Separate output buffer for SIMD compaction — keeps source data intact
2984    // while compact_8bytes_simd writes to a different location.
2985    let mut outbuf = alloc_uninit_vec(STREAM_BUF);
2986
2987    loop {
2988        let n = read_full(reader, &mut buf)?;
2989        if n == 0 {
2990            break;
2991        }
2992        let wp = delete_bitset_dispatch(&buf[..n], &mut outbuf, &member);
2993        if wp > 0 {
2994            writer.write_all(&outbuf[..wp])?;
2995        }
2996    }
2997    Ok(())
2998}
2999
3000#[inline]
3001fn delete_bitset_dispatch(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3002    #[cfg(target_arch = "x86_64")]
3003    {
3004        if get_simd_level() >= 3 {
3005            return unsafe { delete_bitset_avx2_stream(src, dst, member) };
3006        }
3007    }
3008    delete_bitset_scalar(src, dst, member)
3009}
3010
3011/// Scalar bitset delete: write kept bytes to output buffer.
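/// Branchless: every byte is stored unconditionally at dst[wp], and wp advances only
/// when the byte is not in the delete set, so a dropped byte is simply overwritten
/// by the next store.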
3012#[inline]
3013fn delete_bitset_scalar(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3014    let n = src.len();
3015    let mut wp = 0;
3016    unsafe {
3017        let sp = src.as_ptr();
3018        let dp = dst.as_mut_ptr();
3019        let mut i = 0;
3020        while i + 8 <= n {
3021            let b0 = *sp.add(i);
3022            let b1 = *sp.add(i + 1);
3023            let b2 = *sp.add(i + 2);
3024            let b3 = *sp.add(i + 3);
3025            let b4 = *sp.add(i + 4);
3026            let b5 = *sp.add(i + 5);
3027            let b6 = *sp.add(i + 6);
3028            let b7 = *sp.add(i + 7);
3029            *dp.add(wp) = b0;
3030            wp += !is_member(member, b0) as usize;
3031            *dp.add(wp) = b1;
3032            wp += !is_member(member, b1) as usize;
3033            *dp.add(wp) = b2;
3034            wp += !is_member(member, b2) as usize;
3035            *dp.add(wp) = b3;
3036            wp += !is_member(member, b3) as usize;
3037            *dp.add(wp) = b4;
3038            wp += !is_member(member, b4) as usize;
3039            *dp.add(wp) = b5;
3040            wp += !is_member(member, b5) as usize;
3041            *dp.add(wp) = b6;
3042            wp += !is_member(member, b6) as usize;
3043            *dp.add(wp) = b7;
3044            wp += !is_member(member, b7) as usize;
3045            i += 8;
3046        }
3047        while i < n {
3048            let b = *sp.add(i);
3049            *dp.add(wp) = b;
3050            wp += !is_member(member, b) as usize;
3051            i += 1;
3052        }
3053    }
3054    wp
3055}
3056
3057/// AVX2 bitset delete for streaming: uses SIMD to check 32 bytes against the
3058/// membership bitset at once, then compact_8bytes_simd to pack kept bytes.
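///
/// Per byte, the vector sequence below is equivalent to this scalar membership
/// test (illustrative sketch; `in_delete_set` is a hypothetical helper name):
///
/// ```ignore
/// fn in_delete_set(member: &[u8; 32], b: u8) -> bool {
///     // member[b >> 3] holds the bits for the 8 characters sharing that byte
///     member[(b >> 3) as usize] & (1 << (b & 7)) != 0
/// }
/// ```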
3059#[cfg(target_arch = "x86_64")]
3060#[target_feature(enable = "avx2")]
3061unsafe fn delete_bitset_avx2_stream(src: &[u8], dst: &mut [u8], member: &[u8; 32]) -> usize {
3062    use std::arch::x86_64::*;
3063
3064    unsafe {
3065        let n = src.len();
3066        let sp = src.as_ptr();
3067        let dp = dst.as_mut_ptr();
3068        let mut ri = 0;
3069        let mut wp = 0;
3070
3071        // Load the 256-bit membership bitset into an AVX2 register.
3072        // Byte i of member_v has bits set for characters in [i*8..i*8+7].
3073        let member_v = _mm256_loadu_si256(member.as_ptr() as *const _);
3074
3075        // For each input byte B, we check: member[B >> 3] & (1 << (B & 7))
3076        // Using SIMD: extract byte index (B >> 3) and bit position (B & 7).
3077        let mask7 = _mm256_set1_epi8(7);
3078        let mask_0x1f = _mm256_set1_epi8(0x1F_u8 as i8);
3079
3080        // Lookup table for (1 << (x & 7)) — pshufb gives per-byte shift
3081        // that _mm256_sllv_epi32 can't do (it works on 32-bit lanes).
3082        let bit_table = _mm256_setr_epi8(
3083            1, 2, 4, 8, 16, 32, 64, -128i8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 8, 16, 32, 64, -128i8,
3084            0, 0, 0, 0, 0, 0, 0, 0,
3085        );
3086
3087        while ri + 32 <= n {
3088            let input = _mm256_loadu_si256(sp.add(ri) as *const _);
3089
3090            // byte_idx = input >> 3 (which byte of the 32-byte member set)
3091            let byte_idx = _mm256_and_si256(_mm256_srli_epi16(input, 3), mask_0x1f);
3092            // bit_pos = input & 7 (which bit within that byte)
3093            let bit_pos = _mm256_and_si256(input, mask7);
3094            // bit_mask = 1 << bit_pos (per-byte via shuffle lookup)
3095            let bit_mask = _mm256_shuffle_epi8(bit_table, bit_pos);
3096
3097            // member_byte = shuffle member_v by byte_idx (pshufb)
3098            // But pshufb only works within 128-bit lanes. We need cross-lane.
3099            // Since member is 32 bytes and byte_idx can be 0-31, we need
3100            // a different approach. Use two pshufb + blend:
3101            // lo_half = pshufb(member[0..15], byte_idx)
3102            // hi_half = pshufb(member[16..31], byte_idx - 16)
3103            // select = byte_idx >= 16
3104            let member_lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(member_v));
3105            let member_hi = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(member_v, 1));
3106            let lo_mask = _mm256_set1_epi8(0x0F);
3107            let idx_lo = _mm256_and_si256(byte_idx, lo_mask);
3108            let shuffled_lo = _mm256_shuffle_epi8(member_lo, idx_lo);
3109            let shuffled_hi = _mm256_shuffle_epi8(member_hi, idx_lo);
3110            // select hi when byte_idx >= 16 (bit 4 set)
3111            let use_hi = _mm256_slli_epi16(byte_idx, 3); // shift bit 4 to bit 7
3112            let member_byte = _mm256_blendv_epi8(shuffled_lo, shuffled_hi, use_hi);
3113
3114            // Check: (member_byte & bit_mask) != 0 → byte is in delete set
3115            let test = _mm256_and_si256(member_byte, bit_mask);
3116            let is_zero = _mm256_cmpeq_epi8(test, _mm256_setzero_si256());
3117            // keep_mask: bit set = byte should be KEPT (not in delete set)
3118            let keep_mask = _mm256_movemask_epi8(is_zero) as u32;
3119
3120            if keep_mask == 0xFFFFFFFF {
3121                // All 32 bytes kept — bulk copy
3122                std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 32);
3123                wp += 32;
3124            } else if keep_mask != 0 {
3125                // Partial keep — compact 8 bytes at a time
3126                let m0 = keep_mask as u8;
3127                let m1 = (keep_mask >> 8) as u8;
3128                let m2 = (keep_mask >> 16) as u8;
3129                let m3 = (keep_mask >> 24) as u8;
3130
3131                if m0 == 0xFF {
3132                    std::ptr::copy_nonoverlapping(sp.add(ri), dp.add(wp), 8);
3133                } else if m0 != 0 {
3134                    compact_8bytes_simd(sp.add(ri), dp.add(wp), m0);
3135                }
3136                let c0 = m0.count_ones() as usize;
3137
3138                if m1 == 0xFF {
3139                    std::ptr::copy_nonoverlapping(sp.add(ri + 8), dp.add(wp + c0), 8);
3140                } else if m1 != 0 {
3141                    compact_8bytes_simd(sp.add(ri + 8), dp.add(wp + c0), m1);
3142                }
3143                let c1 = m1.count_ones() as usize;
3144
3145                if m2 == 0xFF {
3146                    std::ptr::copy_nonoverlapping(sp.add(ri + 16), dp.add(wp + c0 + c1), 8);
3147                } else if m2 != 0 {
3148                    compact_8bytes_simd(sp.add(ri + 16), dp.add(wp + c0 + c1), m2);
3149                }
3150                let c2 = m2.count_ones() as usize;
3151
3152                if m3 == 0xFF {
3153                    std::ptr::copy_nonoverlapping(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), 8);
3154                } else if m3 != 0 {
3155                    compact_8bytes_simd(sp.add(ri + 24), dp.add(wp + c0 + c1 + c2), m3);
3156                }
3157                let c3 = m3.count_ones() as usize;
3158                wp += c0 + c1 + c2 + c3;
3159            }
3160            // else: all 32 bytes deleted, wp unchanged
3161            ri += 32;
3162        }
3163
3164        // Scalar tail
3165        while ri < n {
3166            let b = *sp.add(ri);
3167            *dp.add(wp) = b;
3168            wp += !is_member(member, b) as usize;
3169            ri += 1;
3170        }
3171
3172        wp
3173    }
3174}
3175
3176fn delete_single_streaming(
3177    ch: u8,
3178    reader: &mut impl Read,
3179    writer: &mut impl Write,
3180) -> io::Result<()> {
3181    let mut buf = alloc_uninit_vec(STREAM_BUF);
3182    loop {
3183        let n = read_full(reader, &mut buf)?;
3184        if n == 0 {
3185            break;
3186        }
3187        let wp = delete_single_inplace(&mut buf, n, ch);
3188        if wp > 0 {
3189            writer.write_all(&buf[..wp])?;
3190        }
3191    }
3192    Ok(())
3193}
3194
3195/// In-place single-char delete using memchr gap-copy.
3196#[inline]
3197fn delete_single_inplace(buf: &mut [u8], n: usize, ch: u8) -> usize {
3198    let mut wp = 0;
3199    let mut i = 0;
3200    while i < n {
3201        match memchr::memchr(ch, &buf[i..n]) {
3202            Some(offset) => {
3203                if offset > 0 {
3204                    if wp != i {
3205                        unsafe {
3206                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3207                        }
3208                    }
3209                    wp += offset;
3210                }
3211                i += offset + 1;
3212            }
3213            None => {
3214                let run_len = n - i;
3215                if run_len > 0 {
3216                    if wp != i {
3217                        unsafe {
3218                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3219                        }
3220                    }
3221                    wp += run_len;
3222                }
3223                break;
3224            }
3225        }
3226    }
3227    wp
3228}
3229
3230fn delete_multi_streaming(
3231    chars: &[u8],
3232    reader: &mut impl Read,
3233    writer: &mut impl Write,
3234) -> io::Result<()> {
3235    let mut buf = alloc_uninit_vec(STREAM_BUF);
3236    loop {
3237        let n = read_full(reader, &mut buf)?;
3238        if n == 0 {
3239            break;
3240        }
3241        let wp = delete_multi_inplace(&mut buf, n, chars);
3242        if wp > 0 {
3243            writer.write_all(&buf[..wp])?;
3244        }
3245    }
3246    Ok(())
3247}
3248
3249/// In-place multi-char delete using memchr2/memchr3 gap-copy.
3250#[inline]
3251fn delete_multi_inplace(buf: &mut [u8], n: usize, chars: &[u8]) -> usize {
3252    let mut wp = 0;
3253    let mut i = 0;
3254    while i < n {
3255        let found = if chars.len() == 2 {
3256            memchr::memchr2(chars[0], chars[1], &buf[i..n])
3257        } else {
3258            memchr::memchr3(chars[0], chars[1], chars[2], &buf[i..n])
3259        };
3260        match found {
3261            Some(offset) => {
3262                if offset > 0 {
3263                    if wp != i {
3264                        unsafe {
3265                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), offset);
3266                        }
3267                    }
3268                    wp += offset;
3269                }
3270                i += offset + 1;
3271            }
3272            None => {
3273                let run_len = n - i;
3274                if run_len > 0 {
3275                    if wp != i {
3276                        unsafe {
3277                            std::ptr::copy(buf.as_ptr().add(i), buf.as_mut_ptr().add(wp), run_len);
3278                        }
3279                    }
3280                    wp += run_len;
3281                }
3282                break;
3283            }
3284        }
3285    }
3286    wp
3287}
3288
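/// Delete `delete_chars` and squeeze repeats of `squeeze_chars` in one streaming pass.
///
/// Illustrative sketch of `tr -ds '0-9' ' '` behavior (assumes expanded byte sets;
/// not compiled as a doc-test):
///
/// ```ignore
/// let mut input: &[u8] = b"a1  2b   3c";
/// let mut out: Vec<u8> = Vec::new();
/// delete_squeeze(b"0123456789", b" ", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"a b c");
/// ```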
3289pub fn delete_squeeze(
3290    delete_chars: &[u8],
3291    squeeze_chars: &[u8],
3292    reader: &mut impl Read,
3293    writer: &mut impl Write,
3294) -> io::Result<()> {
3295    let delete_set = build_member_set(delete_chars);
3296    let squeeze_set = build_member_set(squeeze_chars);
3297    let mut buf = alloc_uninit_vec(STREAM_BUF);
3298    let mut last_squeezed: u16 = 256;
3299
3300    loop {
3301        let n = read_once(reader, &mut buf)?;
3302        if n == 0 {
3303            break;
3304        }
3305        let wp = delete_squeeze_inplace(&mut buf, n, &delete_set, &squeeze_set, &mut last_squeezed);
3306        if wp > 0 {
3307            writer.write_all(&buf[..wp])?;
3308        }
3309    }
3310    Ok(())
3311}
3312
3313#[inline]
3314fn delete_squeeze_inplace(
3315    buf: &mut [u8],
3316    n: usize,
3317    delete_set: &[u8; 32],
3318    squeeze_set: &[u8; 32],
3319    last_squeezed: &mut u16,
3320) -> usize {
3321    let mut wp = 0;
3322    unsafe {
3323        let ptr = buf.as_mut_ptr();
3324        let mut i = 0;
3325        while i + 8 <= n {
3326            macro_rules! process_byte {
3327                ($off:expr) => {
3328                    let b = *ptr.add(i + $off);
3329                    if !is_member(delete_set, b) {
3330                        if is_member(squeeze_set, b) {
3331                            if *last_squeezed != b as u16 {
3332                                *last_squeezed = b as u16;
3333                                *ptr.add(wp) = b;
3334                                wp += 1;
3335                            }
3336                        } else {
3337                            *last_squeezed = 256;
3338                            *ptr.add(wp) = b;
3339                            wp += 1;
3340                        }
3341                    }
3342                };
3343            }
3344            process_byte!(0);
3345            process_byte!(1);
3346            process_byte!(2);
3347            process_byte!(3);
3348            process_byte!(4);
3349            process_byte!(5);
3350            process_byte!(6);
3351            process_byte!(7);
3352            i += 8;
3353        }
3354        while i < n {
3355            let b = *ptr.add(i);
3356            if !is_member(delete_set, b) {
3357                if is_member(squeeze_set, b) {
3358                    if *last_squeezed != b as u16 {
3359                        *last_squeezed = b as u16;
3360                        *ptr.add(wp) = b;
3361                        wp += 1;
3362                    }
3363                } else {
3364                    *last_squeezed = 256;
3365                    *ptr.add(wp) = b;
3366                    wp += 1;
3367                }
3368            }
3369            i += 1;
3370        }
3371    }
3372    wp
3373}
3374
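/// Squeeze consecutive repeats of any byte in `squeeze_chars` down to a single copy.
///
/// Illustrative sketch of `tr -s ' '` behavior (not compiled as a doc-test):
///
/// ```ignore
/// let mut input: &[u8] = b"aa  bb   cc";
/// let mut out: Vec<u8> = Vec::new();
/// squeeze(b" ", &mut input, &mut out).unwrap();
/// assert_eq!(out, b"aa bb cc");
/// ```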
3375pub fn squeeze(
3376    squeeze_chars: &[u8],
3377    reader: &mut impl Read,
3378    writer: &mut impl Write,
3379) -> io::Result<()> {
3380    if squeeze_chars.len() == 1 {
3381        return squeeze_single_stream(squeeze_chars[0], reader, writer);
3382    }
3383
3384    // For 2-3 squeeze chars, use memchr2/memchr3-based gap-copy
3385    // which gives SIMD-accelerated scanning instead of byte-at-a-time.
3386    if squeeze_chars.len() <= 3 {
3387        return squeeze_multi_stream(squeeze_chars, reader, writer);
3388    }
3389
3390    let member = build_member_set(squeeze_chars);
3391    let mut buf = alloc_uninit_vec(STREAM_BUF);
3392    let mut last_squeezed: u16 = 256;
3393
3394    loop {
3395        let n = read_once(reader, &mut buf)?;
3396        if n == 0 {
3397            break;
3398        }
3399        let wp = squeeze_inplace_bitset(&mut buf, n, &member, &mut last_squeezed);
3400        if wp > 0 {
3401            writer.write_all(&buf[..wp])?;
3402        }
3403    }
3404    Ok(())
3405}
3406
3407#[inline]
3408fn squeeze_inplace_bitset(
3409    buf: &mut [u8],
3410    n: usize,
3411    member: &[u8; 32],
3412    last_squeezed: &mut u16,
3413) -> usize {
3414    let mut wp = 0;
3415    unsafe {
3416        let ptr = buf.as_mut_ptr();
3417        for i in 0..n {
3418            let b = *ptr.add(i);
3419            if is_member(member, b) {
3420                if *last_squeezed == b as u16 {
3421                    continue;
3422                }
3423                *last_squeezed = b as u16;
3424            } else {
3425                *last_squeezed = 256;
3426            }
3427            *ptr.add(wp) = b;
3428            wp += 1;
3429        }
3430    }
3431    wp
3432}
3433
3434/// Streaming squeeze for 2-3 chars using memchr2/memchr3 SIMD scanning.
3435/// Compacts each read chunk in-place via gap-copy, skipping duplicate runs
3436/// of squeezable characters, then issues a single write_all per chunk.
3437fn squeeze_multi_stream(
3438    chars: &[u8],
3439    reader: &mut impl Read,
3440    writer: &mut impl Write,
3441) -> io::Result<()> {
3442    let c0 = chars[0];
3443    let c1 = chars[1];
3444    let c2 = if chars.len() >= 3 {
3445        Some(chars[2])
3446    } else {
3447        None
3448    };
3449
3450    let mut buf = alloc_uninit_vec(STREAM_BUF);
3451    let mut last_squeezed: u16 = 256;
3452
3453    loop {
3454        let n = read_once(reader, &mut buf)?;
3455        if n == 0 {
3456            break;
3457        }
3458        let wp = squeeze_multi_compact(&mut buf, n, c0, c1, c2, &mut last_squeezed);
3459        if wp > 0 {
3460            writer.write_all(&buf[..wp])?;
3461        }
3462    }
3463    Ok(())
3464}
3465
3466/// In-place multi-char squeeze using memchr2/memchr3 gap-copy.
3467#[inline]
3468fn squeeze_multi_compact(
3469    buf: &mut [u8],
3470    n: usize,
3471    c0: u8,
3472    c1: u8,
3473    c2: Option<u8>,
3474    last_squeezed: &mut u16,
3475) -> usize {
3476    let ptr = buf.as_mut_ptr();
3477    let mut wp = 0usize;
3478    let mut cursor = 0usize;
3479
3480    while cursor < n {
3481        let found = if let Some(c) = c2 {
3482            memchr::memchr3(c0, c1, c, &buf[cursor..n])
3483        } else {
3484            memchr::memchr2(c0, c1, &buf[cursor..n])
3485        };
3486        match found {
3487            Some(offset) => {
3488                let pos = cursor + offset;
3489                let b = unsafe { *ptr.add(pos) };
3490
3491                let gap = pos - cursor;
3492                if gap > 0 {
3493                    if wp != cursor {
3494                        unsafe {
3495                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), gap);
3496                        }
3497                    }
3498                    wp += gap;
3499                    *last_squeezed = 256;
3500                }
3501
3502                if *last_squeezed != b as u16 {
3503                    unsafe { *ptr.add(wp) = b };
3504                    wp += 1;
3505                    *last_squeezed = b as u16;
3506                }
3507
3508                cursor = pos + 1;
3509                while cursor < n && unsafe { *ptr.add(cursor) } == b {
3510                    cursor += 1;
3511                }
3512            }
3513            None => {
3514                let rem = n - cursor;
3515                if rem > 0 {
3516                    if wp != cursor {
3517                        unsafe {
3518                            std::ptr::copy(ptr.add(cursor), ptr.add(wp), rem);
3519                        }
3520                    }
3521                    wp += rem;
3522                    *last_squeezed = 256;
3523                }
3524                break;
3525            }
3526        }
3527    }
3528    wp
3529}
3530
3531fn squeeze_single_stream(
3532    ch: u8,
3533    reader: &mut impl Read,
3534    writer: &mut impl Write,
3535) -> io::Result<()> {
3536    let mut buf = alloc_uninit_vec(STREAM_BUF);
3537    let mut was_squeeze_char = false;
3538
3539    // AVX2 path: process 32 bytes at a time with SIMD compaction
3540    #[cfg(target_arch = "x86_64")]
3541    if get_simd_level() >= 3 {
3542        loop {
3543            let n = read_once(reader, &mut buf)?;
3544            if n == 0 {
3545                break;
3546            }
3547            let wp = unsafe { squeeze_single_avx2_inplace(&mut buf, n, ch, &mut was_squeeze_char) };
3548            if wp > 0 {
3549                writer.write_all(&buf[..wp])?;
3550            }
3551        }
3552        return Ok(());
3553    }
3554
3555    // Fallback: memmem-based approach
3556    let pair = [ch, ch];
3557    let finder = memchr::memmem::Finder::new(&pair);
3558    loop {
3559        let n = read_once(reader, &mut buf)?;
3560        if n == 0 {
3561            break;
3562        }
3563        let wp = squeeze_single_compact(&mut buf, n, ch, &finder, &mut was_squeeze_char);
3564        if wp > 0 {
3565            writer.write_all(&buf[..wp])?;
3566        }
3567    }
3568    Ok(())
3569}
3570
3571/// In-place squeeze compaction for single-char using memmem.
3572#[inline]
3573fn squeeze_single_compact(
3574    buf: &mut [u8],
3575    n: usize,
3576    ch: u8,
3577    finder: &memchr::memmem::Finder<'_>,
3578    was_squeeze_char: &mut bool,
3579) -> usize {
3580    let mut i = 0;
3581
3582    // Handle carry-over from previous flush
3583    if *was_squeeze_char {
3584        while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3585            i += 1;
3586        }
3587        *was_squeeze_char = false;
3588        if i >= n {
3589            *was_squeeze_char = true;
3590            return 0;
3591        }
3592    }
3593
3594    let ptr = buf.as_mut_ptr();
3595    let mut wp = 0usize;
3596
3597    loop {
3598        match finder.find(&buf[i..n]) {
3599            Some(offset) => {
3600                let seg_end = i + offset + 1;
3601                let gap = seg_end - i;
3602                if gap > 0 {
3603                    if wp != i {
3604                        unsafe {
3605                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), gap);
3606                        }
3607                    }
3608                    wp += gap;
3609                }
3610                i = seg_end;
3611                while i < n && unsafe { *buf.as_ptr().add(i) } == ch {
3612                    i += 1;
3613                }
3614                if i >= n {
3615                    *was_squeeze_char = true;
3616                    break;
3617                }
3618            }
3619            None => {
3620                let rem = n - i;
3621                if rem > 0 {
3622                    if wp != i {
3623                        unsafe {
3624                            std::ptr::copy(ptr.add(i) as *const u8, ptr.add(wp), rem);
3625                        }
3626                    }
3627                    wp += rem;
3628                }
3629                *was_squeeze_char = n > 0 && unsafe { *buf.as_ptr().add(n - 1) } == ch;
3630                break;
3631            }
3632        }
3633    }
3634    wp
3635}
3636
3637/// AVX2-accelerated in-place squeeze compaction for a single character.
3638///
3639/// Processes 32 bytes at a time: compare all bytes against `ch`, build a mask
3640/// of consecutive duplicates (byte == ch AND previous byte == ch), then compact
3641/// using the same COMPACT_LUT + pshufb pattern as `delete_range_avx2`.
3642///
3643/// The `carry` bit tracks whether the last byte of the previous 32-byte block
3644/// was the squeeze char, enabling correct cross-block duplicate detection.
3645///
3646/// Safety: `buf[..n]` must be valid. wp <= ri always holds since we only remove bytes.
3647#[cfg(target_arch = "x86_64")]
3648#[target_feature(enable = "avx2")]
3649unsafe fn squeeze_single_avx2_inplace(
3650    buf: &mut [u8],
3651    n: usize,
3652    ch: u8,
3653    was_squeeze_char: &mut bool,
3654) -> usize {
3655    use std::arch::x86_64::*;
3656
3657    unsafe {
3658        let ch_v = _mm256_set1_epi8(ch as i8);
3659        let ptr = buf.as_mut_ptr();
3660        let mut ri = 0;
3661        let mut wp = 0;
3662        let mut carry: u32 = if *was_squeeze_char { 1 } else { 0 };
3663
3664        while ri + 32 <= n {
3665            let input = _mm256_loadu_si256(ptr.add(ri) as *const _);
3666            let cmp = _mm256_cmpeq_epi8(input, ch_v);
3667            let sq_mask = _mm256_movemask_epi8(cmp) as u32;
3668
3669            // prev_sq_mask: bit i set if byte i-1 was the squeeze char
3670            // (shift sq_mask left by 1, fill bit 0 from carry)
3671            let prev_sq_mask = (sq_mask << 1) | carry;
3672
3673            // remove_mask: bit set where byte IS ch AND previous byte WAS ch
3674            let remove_mask = sq_mask & prev_sq_mask;
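            // Worked example (assuming ch = b'a' and a block starting "aaab", carry = 0):
            //   sq_mask      = 0b0111 (bytes 0-2 equal 'a')
            //   prev_sq_mask = 0b1110 (bytes 1-3 follow an 'a')
            //   remove_mask  = 0b0110 -> bytes 1 and 2 are dropped; the first 'a' is kept.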
3675
3676            // carry for next block: was the last byte (bit 31) the squeeze char?
3677            carry = (sq_mask >> 31) & 1;
3678
3679            if remove_mask == 0 {
3680                // No duplicates to remove — bulk copy
3681                if wp != ri {
3682                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 32);
3683                }
3684                wp += 32;
3685            } else if remove_mask != 0xFFFFFFFF {
3686                // Partial removal — per-lane compaction (same pattern as delete_range_avx2)
3687                let keep_mask = !remove_mask;
3688                let m0 = keep_mask as u8;
3689                let m1 = (keep_mask >> 8) as u8;
3690                let m2 = (keep_mask >> 16) as u8;
3691                let m3 = (keep_mask >> 24) as u8;
3692
3693                if m0 == 0xFF {
3694                    std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3695                } else if m0 != 0 {
3696                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3697                }
3698                let c0 = m0.count_ones() as usize;
3699
3700                if m1 == 0xFF {
3701                    std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3702                } else if m1 != 0 {
3703                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3704                }
3705                let c1 = m1.count_ones() as usize;
3706
3707                if m2 == 0xFF {
3708                    std::ptr::copy_nonoverlapping(ptr.add(ri + 16), ptr.add(wp + c0 + c1), 8);
3709                } else if m2 != 0 {
3710                    compact_8bytes_simd(ptr.add(ri + 16), ptr.add(wp + c0 + c1), m2);
3711                }
3712                let c2 = m2.count_ones() as usize;
3713
3714                if m3 == 0xFF {
3715                    std::ptr::copy_nonoverlapping(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), 8);
3716                } else if m3 != 0 {
3717                    compact_8bytes_simd(ptr.add(ri + 24), ptr.add(wp + c0 + c1 + c2), m3);
3718                }
3719                let c3 = m3.count_ones() as usize;
3720                wp += c0 + c1 + c2 + c3;
3721            }
3722            // else: remove_mask == 0xFFFFFFFF means all 32 bytes are duplicate squeeze chars, skip
3723
3724            ri += 32;
3725        }
3726
3727        // SSE2 tail for 16-byte remainder
3728        if ri + 16 <= n {
3729            let ch_v128 = _mm_set1_epi8(ch as i8);
3730            let input = _mm_loadu_si128(ptr.add(ri) as *const _);
3731            let cmp = _mm_cmpeq_epi8(input, ch_v128);
3732            let sq_mask = _mm_movemask_epi8(cmp) as u32 & 0xFFFF;
3733            let prev_sq_mask = (sq_mask << 1) | carry;
3734            let remove_mask = sq_mask & prev_sq_mask;
3735            carry = (sq_mask >> 15) & 1;
3736
3737            if remove_mask == 0 {
3738                if wp != ri {
3739                    std::ptr::copy(ptr.add(ri), ptr.add(wp), 16);
3740                }
3741                wp += 16;
3742            } else if remove_mask != 0xFFFF {
3743                let keep_mask = !remove_mask;
3744                let m0 = keep_mask as u8;
3745                let m1 = (keep_mask >> 8) as u8;
3746                if m0 == 0xFF {
3747                    std::ptr::copy_nonoverlapping(ptr.add(ri), ptr.add(wp), 8);
3748                } else if m0 != 0 {
3749                    compact_8bytes_simd(ptr.add(ri), ptr.add(wp), m0);
3750                }
3751                let c0 = m0.count_ones() as usize;
3752                if m1 == 0xFF {
3753                    std::ptr::copy_nonoverlapping(ptr.add(ri + 8), ptr.add(wp + c0), 8);
3754                } else if m1 != 0 {
3755                    compact_8bytes_simd(ptr.add(ri + 8), ptr.add(wp + c0), m1);
3756                }
3757                wp += c0 + m1.count_ones() as usize;
3758            }
3759            ri += 16;
3760        }
3761
3762        // Scalar tail for remaining bytes
3763        while ri < n {
3764            let b = *ptr.add(ri);
3765            if b == ch && carry != 0 {
3766                // Duplicate squeeze char — skip it
3767            } else {
3768                *ptr.add(wp) = b;
3769                wp += 1;
3770            }
3771            carry = if b == ch { 1 } else { 0 };
3772            ri += 1;
3773        }
3774
3775        *was_squeeze_char = carry != 0;
3776        wp
3777    }
3778}
3779
3780// ============================================================================
3781// Batch in-place functions (owned data from piped stdin)
3782// ============================================================================
3783
3784/// Translate bytes in-place on an owned buffer, then write.
3785/// For piped stdin where we own the data, this avoids the separate output buffer
3786/// allocation needed by translate_mmap. Uses single-threaded in-place SIMD throughout.
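///
/// Illustrative sketch (assumes expanded byte sets; not compiled as a doc-test):
///
/// ```ignore
/// let mut data = b"aabbcc".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// translate_owned(b"abc", b"xyz", &mut data, &mut out).unwrap();
/// assert_eq!(out, b"xxyyzz"); // data is also rewritten in place
/// ```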
3787pub fn translate_owned(
3788    set1: &[u8],
3789    set2: &[u8],
3790    data: &mut [u8],
3791    writer: &mut impl Write,
3792) -> io::Result<()> {
3793    let table = build_translate_table(set1, set2);
3794
3795    // Identity table — pure passthrough
3796    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3797    if is_identity {
3798        return writer.write_all(data);
3799    }
3800
3801    // SIMD range fast path (in-place) — single-threaded is memory-bandwidth
3802    // optimal; rayon thread-dispatch overhead exceeds any multi-core benefit.
3803    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3804        translate_range_simd_inplace(data, lo, hi, offset);
3805        return writer.write_all(data);
3806    }
3807
3808    // SIMD range-to-constant fast path (in-place)
3809    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3810        translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3811        return writer.write_all(data);
3812    }
3813
3814    // General table lookup (in-place)
3815    translate_inplace(data, &table);
3816    writer.write_all(data)
3817}
3818
3819// ============================================================================
3820// Mmap-based functions (zero-copy input from byte slice)
3821// ============================================================================
3822
3823/// Translate bytes from an mmap'd byte slice.
3824/// Detects single-range translations (e.g., a-z to A-Z) and uses SIMD vectorized
3825/// arithmetic (AVX2: 32 bytes/iter, SSE2: 16 bytes/iter) for those cases.
3826/// Falls back to scalar 256-byte table lookup for general translations.
3827///
3828/// Uses 8MB chunked output to avoid large munmap overhead while keeping
3829/// write() syscalls infrequent. Single-threaded SIMD is memory-bandwidth
3830/// optimal — rayon dispatch overhead exceeds any benefit.
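///
/// Illustrative sketch of the `a-z` -> `A-Z` range fast path (assumes expanded byte
/// sets; not compiled as a doc-test):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// translate_mmap(b"abcdefghijklmnopqrstuvwxyz", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
///                b"hello, tr!", &mut out).unwrap();
/// assert_eq!(out, b"HELLO, TR!");
/// ```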
3831pub fn translate_mmap(
3832    set1: &[u8],
3833    set2: &[u8],
3834    data: &[u8],
3835    writer: &mut impl Write,
3836) -> io::Result<()> {
3837    let table = build_translate_table(set1, set2);
3838
3839    // Check if table is identity — pure passthrough
3840    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3841    if is_identity {
3842        return writer.write_all(data);
3843    }
3844
3845    // Try SIMD fast path for single-range constant-offset translations
3846    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3847        return translate_mmap_range(data, writer, lo, hi, offset);
3848    }
3849
3850    // Try SIMD fast path for range-to-constant translations
3851    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3852        return translate_mmap_range_to_constant(data, writer, lo, hi, replacement);
3853    }
3854
3855    // General case: table lookup (single-threaded, 8MB chunked output)
3856    translate_mmap_table(data, writer, &table)
3857}
3858
3859/// SIMD range translate for mmap data — single-threaded chunked approach.
3860/// 8MB chunks avoid large munmap overhead while keeping syscalls infrequent.
3861fn translate_mmap_range(
3862    data: &[u8],
3863    writer: &mut impl Write,
3864    lo: u8,
3865    hi: u8,
3866    offset: i8,
3867) -> io::Result<()> {
3868    const CHUNK: usize = 8 * 1024 * 1024;
3869    let buf_size = data.len().min(CHUNK);
3870    let mut buf = alloc_uninit_vec(buf_size);
3871    for chunk in data.chunks(CHUNK) {
3872        translate_range_simd(chunk, &mut buf[..chunk.len()], lo, hi, offset);
3873        writer.write_all(&buf[..chunk.len()])?;
3874    }
3875    Ok(())
3876}
3877
3878/// SIMD range-to-constant translate for mmap data — single-threaded chunked.
3879/// Uses blendv (5 SIMD ops/32 bytes) for range-to-constant patterns.
3880/// 8MB chunks avoid large munmap overhead while keeping syscalls infrequent.
3881fn translate_mmap_range_to_constant(
3882    data: &[u8],
3883    writer: &mut impl Write,
3884    lo: u8,
3885    hi: u8,
3886    replacement: u8,
3887) -> io::Result<()> {
3888    const CHUNK: usize = 8 * 1024 * 1024;
3889    let buf_size = data.len().min(CHUNK);
3890    let mut buf = alloc_uninit_vec(buf_size);
3891    for chunk in data.chunks(CHUNK) {
3892        buf[..chunk.len()].copy_from_slice(chunk);
3893        translate_range_to_constant_simd_inplace(&mut buf[..chunk.len()], lo, hi, replacement);
3894        writer.write_all(&buf[..chunk.len()])?;
3895    }
3896    Ok(())
3897}
3898
3899/// General table-lookup translate for mmap data — single-threaded chunked.
3900fn translate_mmap_table(data: &[u8], writer: &mut impl Write, table: &[u8; 256]) -> io::Result<()> {
3901    const CHUNK: usize = 8 * 1024 * 1024;
3902    let buf_size = data.len().min(CHUNK);
3903    let mut buf = alloc_uninit_vec(buf_size);
3904    for chunk in data.chunks(CHUNK) {
3905        translate_to(chunk, &mut buf[..chunk.len()], table);
3906        writer.write_all(&buf[..chunk.len()])?;
3907    }
3908    Ok(())
3909}
3910
3911/// Translate bytes in-place on a mutable buffer (e.g., MAP_PRIVATE mmap).
3912/// Eliminates the output buffer allocation entirely — the kernel's COW
3913/// semantics mean only modified pages are physically copied.
3914/// Single-threaded SIMD is memory-bandwidth optimal.
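///
/// Illustrative sketch (not compiled as a doc-test); note that `data` itself is rewritten:
///
/// ```ignore
/// let mut data = b"abc123".to_vec();
/// let mut out: Vec<u8> = Vec::new();
/// translate_mmap_inplace(b"abc", b"ABC", &mut data, &mut out).unwrap();
/// assert_eq!(data, b"ABC123");
/// assert_eq!(out, b"ABC123");
/// ```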
3915pub fn translate_mmap_inplace(
3916    set1: &[u8],
3917    set2: &[u8],
3918    data: &mut [u8],
3919    writer: &mut impl Write,
3920) -> io::Result<()> {
3921    let table = build_translate_table(set1, set2);
3922
3923    // Check if table is identity — pure passthrough
3924    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3925    if is_identity {
3926        return writer.write_all(data);
3927    }
3928
3929    // Try SIMD fast path for single-range constant-offset translations (e.g., a-z -> A-Z)
3930    if let Some((lo, hi, offset)) = detect_range_offset(&table) {
3931        translate_range_simd_inplace(data, lo, hi, offset);
3932        return writer.write_all(data);
3933    }
3934
3935    // Try SIMD fast path for range-to-constant translations
3936    if let Some((lo, hi, replacement)) = detect_range_to_constant(&table) {
3937        translate_range_to_constant_simd_inplace(data, lo, hi, replacement);
3938        return writer.write_all(data);
3939    }
3940
3941    // General case: in-place table lookup
3942    translate_inplace(data, &table);
3943    writer.write_all(data)
3944}
3945
3946/// Translate from read-only source to a separate output buffer, avoiding COW faults.
3947/// Uses the appropriate SIMD path (range offset, range-to-constant, or general nibble).
3948/// 8MB chunked approach avoids large munmap overhead.
3949fn translate_to_separate_buf(
3950    data: &[u8],
3951    table: &[u8; 256],
3952    writer: &mut impl Write,
3953) -> io::Result<()> {
3954    let range_info = detect_range_offset(table);
3955    let const_info = if range_info.is_none() {
3956        detect_range_to_constant(table)
3957    } else {
3958        None
3959    };
3960
3961    const CHUNK: usize = 8 * 1024 * 1024;
3962    let buf_size = data.len().min(CHUNK);
3963    let mut out_buf = alloc_uninit_vec(buf_size);
3964    for chunk in data.chunks(CHUNK) {
3965        if let Some((lo, hi, offset)) = range_info {
3966            translate_range_simd(chunk, &mut out_buf[..chunk.len()], lo, hi, offset);
3967        } else if let Some((lo, hi, replacement)) = const_info {
3968            translate_range_to_constant_simd(
3969                chunk,
3970                &mut out_buf[..chunk.len()],
3971                lo,
3972                hi,
3973                replacement,
3974            );
3975        } else {
3976            translate_to(chunk, &mut out_buf[..chunk.len()], table);
3977        }
3978        writer.write_all(&out_buf[..chunk.len()])?;
3979    }
3980    Ok(())
3981}
3982
3983/// Translate from a read-only mmap (or any byte slice) to a separate output buffer.
3984/// Avoids MAP_PRIVATE COW page faults by reading from the original data and
3985/// writing to a freshly allocated heap buffer.
3986pub fn translate_mmap_readonly(
3987    set1: &[u8],
3988    set2: &[u8],
3989    data: &[u8],
3990    writer: &mut impl Write,
3991) -> io::Result<()> {
3992    let table = build_translate_table(set1, set2);
3993
3994    // Check if table is identity — pure passthrough
3995    let is_identity = table.iter().enumerate().all(|(i, &v)| v == i as u8);
3996    if is_identity {
3997        return writer.write_all(data);
3998    }
3999
4000    translate_to_separate_buf(data, &table, writer)
4001}
4002
4003/// Translate + squeeze from mmap'd byte slice.
4004/// Single-threaded translate + in-place squeeze for memory-bandwidth optimality.
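///
/// Illustrative sketch of `tr -s a-z A-Z` behavior (assumes expanded byte sets;
/// not compiled as a doc-test):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// translate_squeeze_mmap(b"abcdefghijklmnopqrstuvwxyz", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
///                        b"hello", &mut out).unwrap();
/// assert_eq!(out, b"HELO"); // "LL" squeezed after translation
/// ```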
4005pub fn translate_squeeze_mmap(
4006    set1: &[u8],
4007    set2: &[u8],
4008    data: &[u8],
4009    writer: &mut impl Write,
4010) -> io::Result<()> {
4011    let table = build_translate_table(set1, set2);
4012    let squeeze_set = build_member_set(set2);
4013
4014    // Two-phase for data that fits in memory:
4015    // Phase 1: translate into buffer
4016    // Phase 2: squeeze in-place (squeeze only removes, never grows)
4017    if data.len() <= SINGLE_ALLOC_LIMIT {
4018        let mut translated = alloc_uninit_vec(data.len());
4019        let range_info = detect_range_offset(&table);
4020
4021        if let Some((lo, hi, offset)) = range_info {
4022            translate_range_simd(data, &mut translated, lo, hi, offset);
4023        } else if let Some((lo, hi, repl)) = detect_range_to_constant(&table) {
4024            translate_range_to_constant_simd(data, &mut translated, lo, hi, repl);
4025        } else {
4026            translate_to(data, &mut translated, &table);
4027        }
4028
4029        // Phase 2: squeeze in-place on the translated buffer.
4030        let mut last_squeezed: u16 = 256;
4031        let len = translated.len();
4032        let mut wp = 0;
4033        unsafe {
4034            let ptr = translated.as_mut_ptr();
4035            let mut i = 0;
4036            while i < len {
4037                let b = *ptr.add(i);
4038                if is_member(&squeeze_set, b) {
4039                    if last_squeezed == b as u16 {
4040                        i += 1;
4041                        continue;
4042                    }
4043                    last_squeezed = b as u16;
4044                } else {
4045                    last_squeezed = 256;
4046                }
4047                *ptr.add(wp) = b;
4048                wp += 1;
4049                i += 1;
4050            }
4051        }
4052        return writer.write_all(&translated[..wp]);
4053    }
4054
4055    // OOM-safe chunked translate+squeeze for files > SINGLE_ALLOC_LIMIT.
4056    // 8MB matches other mmap paths; bounded heap even for multi-GB files.
4057    const CHUNK: usize = 8 * 1024 * 1024;
4058    let mut last_squeezed: u16 = 256;
4059    let mut buf = alloc_uninit_vec(CHUNK);
4060    for chunk in data.chunks(CHUNK) {
4061        translate_to(chunk, &mut buf[..chunk.len()], &table);
4062        let mut wp = 0;
4063        for i in 0..chunk.len() {
4064            let b = buf[i];
4065            if is_member(&squeeze_set, b) {
4066                if last_squeezed == b as u16 {
4067                    continue;
4068                }
4069                last_squeezed = b as u16;
4070            } else {
4071                last_squeezed = 256;
4072            }
4073            buf[wp] = b;
4074            wp += 1;
4075        }
4076        writer.write_all(&buf[..wp])?;
4077    }
4078    Ok(())
4079}
4080
4081/// Delete from mmap'd byte slice.
4082/// Uses a density heuristic to choose between zero-copy writev and chunked compaction.
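///
/// Illustrative sketch of `tr -d 0-9` over an mmap'd slice (assumes an expanded
/// byte set; not compiled as a doc-test):
///
/// ```ignore
/// let mut out: Vec<u8> = Vec::new();
/// delete_mmap(b"0123456789", b"abc123def456", &mut out).unwrap();
/// assert_eq!(out, b"abcdef");
/// ```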
4083pub fn delete_mmap(delete_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4084    if delete_chars.len() == 1 {
4085        return delete_single_char_mmap(delete_chars[0], data, writer);
4086    }
4087    if delete_chars.len() <= 3 {
4088        return delete_multi_memchr_mmap(delete_chars, data, writer);
4089    }
4090
4091    // SIMD fast path for contiguous ranges (digits, a-z, A-Z, etc.)
4092    if let Some((lo, hi)) = detect_delete_range(delete_chars) {
4093        return delete_range_mmap(data, writer, lo, hi);
4094    }
4095
4096    let member = build_member_set(delete_chars);
4097
4098    // Heuristic: estimate total delete positions. Zero-copy writev is only efficient
4099    // when all gaps fit in a single writev call (< MAX_IOV/2 entries). With uniform
4100    // distribution, each delete creates an IoSlice entry. For many deletes (> 512),
4101    // multiple writev calls are needed, and the compact approach is faster.
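    // Example: a 10 MiB input whose 1024-byte sample contains 2 deletable bytes gives
    // estimated_deletes = 10 MiB * 2 / 1024 = 20480 > MAX_IOV/2, so the compact path is
    // taken; a sample with zero hits estimates zero deletes and picks zero-copy writev.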
4102    let sample_size = data.len().min(1024);
4103    let sample_deletes = data[..sample_size]
4104        .iter()
4105        .filter(|&&b| is_member(&member, b))
4106        .count();
4107    let estimated_deletes = if sample_size > 0 {
4108        data.len() * sample_deletes / sample_size
4109    } else {
4110        data.len()
4111    };
4112
4113    if estimated_deletes < MAX_IOV / 2 {
4114        return delete_bitset_zerocopy(data, &member, writer);
4115    }
4116
4117    // Streaming compact: 8MB buffer for fewer write() syscalls.
4118    const COMPACT_BUF: usize = 8 * 1024 * 1024;
4119    let mut outbuf = alloc_uninit_vec(COMPACT_BUF);
4120    for chunk in data.chunks(COMPACT_BUF) {
4121        let out_pos = delete_chunk_bitset_into(chunk, &member, &mut outbuf);
4122        if out_pos > 0 {
4123            writer.write_all(&outbuf[..out_pos])?;
4124        }
4125    }
4126    Ok(())
4127}
4128
4129/// SIMD range delete for mmap data.
4130/// Uses a density heuristic: when the estimated number of delete positions is
4131/// small (< MAX_IOV/2), uses zero-copy writev directly from mmap data (no output
4132/// buffer allocation). For denser deletes, uses SIMD compact into a pre-allocated buffer.
4133fn delete_range_mmap(data: &[u8], writer: &mut impl Write, lo: u8, hi: u8) -> io::Result<()> {
4134    // Sample first 1024 bytes to estimate delete density
4135    let sample_size = data.len().min(1024);
4136    let sample_deletes = data[..sample_size]
4137        .iter()
4138        .filter(|&&b| b >= lo && b <= hi)
4139        .count();
4140    // Estimate expected number of delete positions (IoSlice entries for zero-copy).
4141    // Each delete creates an IoSlice entry. With MAX_IOV=1024 per writev,
4142    // if estimated_deletes > MAX_IOV/2, the writev overhead from multiple syscalls
4143    // exceeds the compact approach cost. Only use zero-copy when all gaps fit in
4144    // a single writev call.
4145    let estimated_deletes = if sample_size > 0 {
4146        data.len() * sample_deletes / sample_size
4147    } else {
4148        data.len()
4149    };
4150    if estimated_deletes < MAX_IOV / 2 {
4151        return delete_range_mmap_zerocopy(data, writer, lo, hi);
4152    }
4153
4154    // Streaming compact: 8MB buffer for fewer write() syscalls.
4155    const CHUNK: usize = 8 * 1024 * 1024;
4156    let mut outbuf = alloc_uninit_vec(CHUNK);
4157    for chunk in data.chunks(CHUNK) {
4158        let kept = delete_range_chunk(chunk, &mut outbuf[..chunk.len()], lo, hi);
4159        writer.write_all(&outbuf[..kept])?;
4160    }
4161    Ok(())
4162}
4163
4164/// Zero-copy range delete for mmap data: SIMD-scans for bytes in [lo..=hi],
4165/// builds IoSlice entries pointing to the gaps between deleted ranges in the
4166/// original mmap data, and writes using writev. No output buffer allocation.
4167/// For 10MB text with 4% digits: ~1.5ms vs ~4ms for the compact approach.
4168fn delete_range_mmap_zerocopy(
4169    data: &[u8],
4170    writer: &mut impl Write,
4171    lo: u8,
4172    hi: u8,
4173) -> io::Result<()> {
4174    #[cfg(target_arch = "x86_64")]
4175    {
4176        if get_simd_level() >= 3 {
4177            return unsafe { delete_range_zerocopy_avx2(data, writer, lo, hi) };
4178        }
4179        if get_simd_level() >= 2 {
4180            return unsafe { delete_range_zerocopy_sse2(data, writer, lo, hi) };
4181        }
4182    }
4183
4184    #[cfg(target_arch = "aarch64")]
4185    {
4186        return unsafe { delete_range_zerocopy_neon(data, writer, lo, hi) };
4187    }
4188
4189    // Scalar fallback: byte-by-byte scan with IoSlice batching
4190    #[allow(unreachable_code)]
4191    delete_range_zerocopy_scalar(data, writer, lo, hi)
4192}
4193
4194/// Scalar zero-copy range delete: byte-by-byte scan with IoSlice batching.
4195/// Used as a fallback when SIMD is unavailable.
4196fn delete_range_zerocopy_scalar(
4197    data: &[u8],
4198    writer: &mut impl Write,
4199    lo: u8,
4200    hi: u8,
4201) -> io::Result<()> {
4202    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4203    let len = data.len();
4204    let mut run_start: usize = 0;
4205    let mut i: usize = 0;
4206
4207    while i < len {
4208        let b = unsafe { *data.get_unchecked(i) };
4209        if b >= lo && b <= hi {
4210            if i > run_start {
4211                iov.push(std::io::IoSlice::new(&data[run_start..i]));
4212                if iov.len() >= MAX_IOV {
4213                    write_ioslices(writer, &iov)?;
4214                    iov.clear();
4215                }
4216            }
4217            run_start = i + 1;
4218        }
4219        i += 1;
4220    }
4221    if run_start < len {
4222        iov.push(std::io::IoSlice::new(&data[run_start..]));
4223    }
4224    if !iov.is_empty() {
4225        write_ioslices(writer, &iov)?;
4226    }
4227    Ok(())
4228}
4229
4230/// AVX2 zero-copy range delete: scans 32 bytes at a time using SIMD range
4231/// comparison, then iterates only the delete positions from the bitmask.
4232/// Blocks with no deletes (common for sparse data) skip with zero per-byte work.
4233#[cfg(target_arch = "x86_64")]
4234#[target_feature(enable = "avx2")]
4235unsafe fn delete_range_zerocopy_avx2(
4236    data: &[u8],
4237    writer: &mut impl Write,
4238    lo: u8,
4239    hi: u8,
4240) -> io::Result<()> {
4241    use std::arch::x86_64::*;
4242
4243    unsafe {
4244        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4245        let len = data.len();
4246        let mut run_start: usize = 0;
4247        let mut ri: usize = 0;
4248
4249        let range = hi - lo;
4250        let bias_v = _mm256_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4251        let threshold_v = _mm256_set1_epi8(0x80u8.wrapping_add(range) as i8);
4252        let zero = _mm256_setzero_si256();
4253
4254        while ri + 32 <= len {
4255            let input = _mm256_loadu_si256(data.as_ptr().add(ri) as *const _);
4256            let biased = _mm256_add_epi8(input, bias_v);
4257            let gt = _mm256_cmpgt_epi8(biased, threshold_v);
4258            let in_range = _mm256_cmpeq_epi8(gt, zero);
4259            let del_mask = _mm256_movemask_epi8(in_range) as u32;
4260
4261            if del_mask == 0 {
4262                // No bytes to delete — run continues
4263                ri += 32;
4264                continue;
4265            }
4266
4267            // Process each deleted byte position from the bitmask
4268            let mut m = del_mask;
4269            while m != 0 {
4270                let bit = m.trailing_zeros() as usize;
4271                let abs_pos = ri + bit;
4272                if abs_pos > run_start {
4273                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4274                    if iov.len() >= MAX_IOV {
4275                        write_ioslices(writer, &iov)?;
4276                        iov.clear();
4277                    }
4278                }
4279                run_start = abs_pos + 1;
4280                m &= m - 1; // clear lowest set bit (blsr)
4281            }
4282
4283            ri += 32;
4284        }
4285
4286        // Scalar tail
4287        while ri < len {
4288            let b = *data.get_unchecked(ri);
4289            if b >= lo && b <= hi {
4290                if ri > run_start {
4291                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4292                    if iov.len() >= MAX_IOV {
4293                        write_ioslices(writer, &iov)?;
4294                        iov.clear();
4295                    }
4296                }
4297                run_start = ri + 1;
4298            }
4299            ri += 1;
4300        }
4301
4302        if run_start < len {
4303            iov.push(std::io::IoSlice::new(&data[run_start..]));
4304        }
4305        if !iov.is_empty() {
4306            write_ioslices(writer, &iov)?;
4307        }
4308        Ok(())
4309    }
4310}
4311
4312/// SSE2 zero-copy range delete: same approach as AVX2 but with 16-byte blocks.
4313#[cfg(target_arch = "x86_64")]
4314#[target_feature(enable = "sse2")]
4315unsafe fn delete_range_zerocopy_sse2(
4316    data: &[u8],
4317    writer: &mut impl Write,
4318    lo: u8,
4319    hi: u8,
4320) -> io::Result<()> {
4321    use std::arch::x86_64::*;
4322
4323    unsafe {
4324        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4325        let len = data.len();
4326        let mut run_start: usize = 0;
4327        let mut ri: usize = 0;
4328
4329        let range = hi - lo;
4330        let bias_v = _mm_set1_epi8(0x80u8.wrapping_sub(lo) as i8);
4331        let threshold_v = _mm_set1_epi8(0x80u8.wrapping_add(range) as i8);
4332        let zero = _mm_setzero_si128();
4333
4334        while ri + 16 <= len {
4335            let input = _mm_loadu_si128(data.as_ptr().add(ri) as *const _);
4336            let biased = _mm_add_epi8(input, bias_v);
4337            let gt = _mm_cmpgt_epi8(biased, threshold_v);
4338            let in_range = _mm_cmpeq_epi8(gt, zero);
4339            let del_mask = _mm_movemask_epi8(in_range) as u32 & 0xFFFF;
4340
4341            if del_mask == 0 {
4342                ri += 16;
4343                continue;
4344            }
4345
4346            let mut m = del_mask;
4347            while m != 0 {
4348                let bit = m.trailing_zeros() as usize;
4349                let abs_pos = ri + bit;
4350                if abs_pos > run_start {
4351                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4352                    if iov.len() >= MAX_IOV {
4353                        write_ioslices(writer, &iov)?;
4354                        iov.clear();
4355                    }
4356                }
4357                run_start = abs_pos + 1;
4358                m &= m - 1;
4359            }
4360
4361            ri += 16;
4362        }
4363
4364        while ri < len {
4365            let b = *data.get_unchecked(ri);
4366            if b >= lo && b <= hi {
4367                if ri > run_start {
4368                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4369                    if iov.len() >= MAX_IOV {
4370                        write_ioslices(writer, &iov)?;
4371                        iov.clear();
4372                    }
4373                }
4374                run_start = ri + 1;
4375            }
4376            ri += 1;
4377        }
4378
4379        if run_start < len {
4380            iov.push(std::io::IoSlice::new(&data[run_start..]));
4381        }
4382        if !iov.is_empty() {
4383            write_ioslices(writer, &iov)?;
4384        }
4385        Ok(())
4386    }
4387}
4388
4389/// NEON zero-copy range delete for aarch64: scans 16 bytes at a time using
4390/// NEON unsigned comparison, creates bitmask via pairwise narrowing, then
4391/// iterates delete positions from the bitmask.
4392#[cfg(target_arch = "aarch64")]
4393#[target_feature(enable = "neon")]
4394unsafe fn delete_range_zerocopy_neon(
4395    data: &[u8],
4396    writer: &mut impl Write,
4397    lo: u8,
4398    hi: u8,
4399) -> io::Result<()> {
4400    use std::arch::aarch64::*;
4401
4402    unsafe {
4403        let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4404        let len = data.len();
4405        let mut run_start: usize = 0;
4406        let mut ri: usize = 0;
4407
4408        let lo_v = vdupq_n_u8(lo);
4409        let hi_v = vdupq_n_u8(hi);
4410        // Bit position mask for extracting bitmask from comparison results
4411        let bit_mask: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
4412        let bit_mask_v = vld1q_u8(bit_mask.as_ptr());
4413
4414        while ri + 16 <= len {
4415            let input = vld1q_u8(data.as_ptr().add(ri));
4416            // in_range = 0xFF where lo <= byte <= hi
4417            let ge_lo = vcgeq_u8(input, lo_v);
4418            let le_hi = vcleq_u8(input, hi_v);
4419            let in_range = vandq_u8(ge_lo, le_hi);
4420
4421            // Create 16-bit bitmask: reduce 16 bytes to 2 bytes
4422            let bits = vandq_u8(in_range, bit_mask_v);
4423            let pair = vpaddlq_u8(bits); // u8→u16 pairwise add
4424            let quad = vpaddlq_u16(pair); // u16→u32
4425            let octet = vpaddlq_u32(quad); // u32→u64
4426            let mask_lo = vgetq_lane_u64::<0>(octet) as u8;
4427            let mask_hi = vgetq_lane_u64::<1>(octet) as u8;
4428            let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
4429
4430            if del_mask == 0 {
4431                // No bytes to delete — run continues
4432                ri += 16;
4433                continue;
4434            }
4435
4436            // Process each deleted byte position
4437            let mut m = del_mask;
4438            while m != 0 {
4439                let bit = m.trailing_zeros() as usize;
4440                let abs_pos = ri + bit;
4441                if abs_pos > run_start {
4442                    iov.push(std::io::IoSlice::new(&data[run_start..abs_pos]));
4443                    if iov.len() >= MAX_IOV {
4444                        write_ioslices(writer, &iov)?;
4445                        iov.clear();
4446                    }
4447                }
4448                run_start = abs_pos + 1;
4449                m &= m - 1;
4450            }
4451
4452            ri += 16;
4453        }
4454
4455        // Scalar tail
4456        while ri < len {
4457            let b = *data.get_unchecked(ri);
4458            if b >= lo && b <= hi {
4459                if ri > run_start {
4460                    iov.push(std::io::IoSlice::new(&data[run_start..ri]));
4461                    if iov.len() >= MAX_IOV {
4462                        write_ioslices(writer, &iov)?;
4463                        iov.clear();
4464                    }
4465                }
4466                run_start = ri + 1;
4467            }
4468            ri += 1;
4469        }
4470
4471        if run_start < len {
4472            iov.push(std::io::IoSlice::new(&data[run_start..]));
4473        }
4474        if !iov.is_empty() {
4475            write_ioslices(writer, &iov)?;
4476        }
4477        Ok(())
4478    }
4479}
4480
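// NEON has no direct movemask, so the block above ANDs the 0x00/0xFF comparison lanes
// with the [1,2,4,...,128] weight vector and folds each 8-byte half with widening
// pairwise adds (vpaddlq_u8/u16/u32); the two u64 lanes then hold the low and high
// 8-bit masks. A minimal scalar sketch of that reduction (module name and sample
// lanes are made up):
#[cfg(test)]
mod neon_movemask_sketch {
    #[test]
    fn weighted_half_sums_equal_the_bitmask() {
        // 0xFF marks lanes that compared "in range" (to be deleted), 0x00 the rest.
        let in_range: [u8; 16] = [
            0xFF, 0, 0, 0xFF, 0, 0, 0, 0, 0, 0xFF, 0, 0, 0, 0, 0, 0xFF,
        ];
        let weights: [u8; 16] = [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
        // Sum of (lane & weight) over one 8-byte half: what the vpaddl chain computes.
        let half_sum = |half: &[u8]| -> u8 {
            half.iter().zip(&weights[..8]).map(|(&l, &w)| l & w).sum::<u8>()
        };
        let mask_lo = half_sum(&in_range[..8]);
        let mask_hi = half_sum(&in_range[8..]);
        let del_mask = (mask_hi as u16) << 8 | mask_lo as u16;
        assert_eq!(del_mask, (1u16 << 0) | (1 << 3) | (1 << 9) | (1 << 15));
    }
}
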
4481/// Delete bytes from chunk using bitset, writing into pre-allocated buffer.
4482/// Returns number of bytes written.
4483#[inline]
4484fn delete_chunk_bitset_into(chunk: &[u8], member: &[u8; 32], outbuf: &mut [u8]) -> usize {
4485    let len = chunk.len();
4486    let mut out_pos = 0;
4487    let mut i = 0;
4488
4489    while i + 8 <= len {
4490        unsafe {
4491            let b0 = *chunk.get_unchecked(i);
4492            let b1 = *chunk.get_unchecked(i + 1);
4493            let b2 = *chunk.get_unchecked(i + 2);
4494            let b3 = *chunk.get_unchecked(i + 3);
4495            let b4 = *chunk.get_unchecked(i + 4);
4496            let b5 = *chunk.get_unchecked(i + 5);
4497            let b6 = *chunk.get_unchecked(i + 6);
4498            let b7 = *chunk.get_unchecked(i + 7);
4499
4500            *outbuf.get_unchecked_mut(out_pos) = b0;
4501            out_pos += !is_member(member, b0) as usize;
4502            *outbuf.get_unchecked_mut(out_pos) = b1;
4503            out_pos += !is_member(member, b1) as usize;
4504            *outbuf.get_unchecked_mut(out_pos) = b2;
4505            out_pos += !is_member(member, b2) as usize;
4506            *outbuf.get_unchecked_mut(out_pos) = b3;
4507            out_pos += !is_member(member, b3) as usize;
4508            *outbuf.get_unchecked_mut(out_pos) = b4;
4509            out_pos += !is_member(member, b4) as usize;
4510            *outbuf.get_unchecked_mut(out_pos) = b5;
4511            out_pos += !is_member(member, b5) as usize;
4512            *outbuf.get_unchecked_mut(out_pos) = b6;
4513            out_pos += !is_member(member, b6) as usize;
4514            *outbuf.get_unchecked_mut(out_pos) = b7;
4515            out_pos += !is_member(member, b7) as usize;
4516        }
4517        i += 8;
4518    }
4519
4520    while i < len {
4521        unsafe {
4522            let b = *chunk.get_unchecked(i);
4523            *outbuf.get_unchecked_mut(out_pos) = b;
4524            out_pos += !is_member(member, b) as usize;
4525        }
4526        i += 1;
4527    }
4528
4529    out_pos
4530}
4531
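// Illustrative usage sketch for the branchless compaction above: every byte is stored
// unconditionally and the cursor only advances past non-members, so the output buffer
// must be at least as long as the input chunk. Module name and data are made up.
#[cfg(test)]
mod delete_chunk_bitset_into_sketch {
    use super::*;

    #[test]
    fn members_are_dropped_and_kept_length_reported() {
        let member = build_member_set(b"aeiou");
        let chunk = b"hello world";
        let mut out = vec![0u8; chunk.len()];
        let n = delete_chunk_bitset_into(chunk, &member, &mut out);
        assert_eq!(&out[..n], &b"hll wrld"[..]);
    }
}
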
4532/// Zero-copy delete for general bitset: scan for runs of kept bytes,
4533/// build IoSlice entries pointing directly into the source data.
4534/// No allocation for output data — just ~16 bytes per IoSlice entry.
4535/// Flushes in MAX_IOV-sized batches for efficient writev.
4536fn delete_bitset_zerocopy(
4537    data: &[u8],
4538    member: &[u8; 32],
4539    writer: &mut impl Write,
4540) -> io::Result<()> {
4541    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4542    let len = data.len();
4543    let mut i = 0;
4544    let mut run_start: Option<usize> = None;
4545
4546    while i < len {
4547        let b = unsafe { *data.get_unchecked(i) };
4548        if is_member(member, b) {
4549            // This byte should be deleted
4550            if let Some(rs) = run_start {
4551                iov.push(std::io::IoSlice::new(&data[rs..i]));
4552                run_start = None;
4553                if iov.len() >= MAX_IOV {
4554                    write_ioslices(writer, &iov)?;
4555                    iov.clear();
4556                }
4557            }
4558        } else {
4559            // This byte should be kept
4560            if run_start.is_none() {
4561                run_start = Some(i);
4562            }
4563        }
4564        i += 1;
4565    }
4566    // Flush final run
4567    if let Some(rs) = run_start {
4568        iov.push(std::io::IoSlice::new(&data[rs..]));
4569    }
4570    if !iov.is_empty() {
4571        write_ioslices(writer, &iov)?;
4572    }
4573    Ok(())
4574}
4575
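// Illustrative usage sketch for the zero-copy bitset delete above; a Vec<u8> stands in
// for the writer since it implements Write. Module name and data are made up.
#[cfg(test)]
mod delete_bitset_zerocopy_sketch {
    use super::*;

    #[test]
    fn member_bytes_are_removed_from_kept_runs() {
        let member = build_member_set(b"0123456789");
        let mut out = Vec::new();
        delete_bitset_zerocopy(b"a1b22c333", &member, &mut out).unwrap();
        assert_eq!(out, b"abc");
    }
}
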
4576fn delete_single_char_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4577    // Streaming zero-copy delete using writev: build IoSlice batches of MAX_IOV
4578    // pointing to gaps between deleted characters, write each batch immediately.
4579    // Avoids allocating the full Vec<IoSlice> for all positions.
4580    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4581    let mut last = 0;
4582    for pos in memchr::memchr_iter(ch, data) {
4583        if pos > last {
4584            iov.push(std::io::IoSlice::new(&data[last..pos]));
4585            if iov.len() >= MAX_IOV {
4586                write_ioslices(writer, &iov)?;
4587                iov.clear();
4588            }
4589        }
4590        last = pos + 1;
4591    }
4592    if last < data.len() {
4593        iov.push(std::io::IoSlice::new(&data[last..]));
4594    }
4595    if !iov.is_empty() {
4596        write_ioslices(writer, &iov)?;
4597    }
4598    Ok(())
4599}
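
// Illustrative usage sketch for the single-character delete path above
// (module name and data are made up):
#[cfg(test)]
mod delete_single_char_mmap_sketch {
    use super::*;

    #[test]
    fn only_the_target_byte_is_removed() {
        let mut out = Vec::new();
        delete_single_char_mmap(b'-', b"-a--b-c-", &mut out).unwrap();
        assert_eq!(out, b"abc");
    }
}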
4600
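/// Zero-copy streaming delete for small delete sets, driven by memchr2/memchr3.
/// Only the first three characters of `chars` are consulted, so this path is only
/// correct for delete sets of two or three characters.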
4601fn delete_multi_memchr_mmap(chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4602    let c0 = chars[0];
4603    let c1 = if chars.len() >= 2 { chars[1] } else { 0 };
4604    let c2 = if chars.len() >= 3 { chars[2] } else { 0 };
4605    let is_three = chars.len() >= 3;
4606
4607    // Streaming zero-copy delete: batch IoSlice entries and write in groups of MAX_IOV.
4608    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(MAX_IOV);
4609    let mut last = 0;
4610
4611    macro_rules! process_pos {
4612        ($pos:expr) => {
4613            if $pos > last {
4614                iov.push(std::io::IoSlice::new(&data[last..$pos]));
4615                if iov.len() >= MAX_IOV {
4616                    write_ioslices(writer, &iov)?;
4617                    iov.clear();
4618                }
4619            }
4620            last = $pos + 1;
4621        };
4622    }
4623
4624    if is_three {
4625        for pos in memchr::memchr3_iter(c0, c1, c2, data) {
4626            process_pos!(pos);
4627        }
4628    } else {
4629        for pos in memchr::memchr2_iter(c0, c1, data) {
4630            process_pos!(pos);
4631        }
4632    }
4633    if last < data.len() {
4634        iov.push(std::io::IoSlice::new(&data[last..]));
4635    }
4636    if !iov.is_empty() {
4637        write_ioslices(writer, &iov)?;
4638    }
4639    Ok(())
4640}
4641
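// Illustrative usage sketch for the memchr2/memchr3 delete path above
// (module name and data are made up):
#[cfg(test)]
mod delete_multi_memchr_mmap_sketch {
    use super::*;

    #[test]
    fn three_character_set_is_deleted() {
        let mut out = Vec::new();
        delete_multi_memchr_mmap(b"xyz", b"xaybzc", &mut out).unwrap();
        assert_eq!(out, b"abc");
    }
}
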
4642/// Delete + squeeze from mmap'd byte slice.
4643///
4644/// For data <= SINGLE_ALLOC_LIMIT: delete+squeeze into one full-size buffer, one write_all.
4645/// For larger data: chunked approach (8MB chunks) to bound memory.
4646pub fn delete_squeeze_mmap(
4647    delete_chars: &[u8],
4648    squeeze_chars: &[u8],
4649    data: &[u8],
4650    writer: &mut impl Write,
4651) -> io::Result<()> {
4652    let delete_set = build_member_set(delete_chars);
4653    let squeeze_set = build_member_set(squeeze_chars);
4654
4655    if data.len() <= SINGLE_ALLOC_LIMIT {
4656        // Single-allocation delete+squeeze: full-size buffer, single write_all.
4657        let mut outbuf = alloc_uninit_vec(data.len());
4658        let mut last_squeezed: u16 = 256;
4659        let mut out_pos = 0;
4660
4661        for &b in data.iter() {
4662            if is_member(&delete_set, b) {
4663                continue;
4664            }
4665            if is_member(&squeeze_set, b) {
4666                if last_squeezed == b as u16 {
4667                    continue;
4668                }
4669                last_squeezed = b as u16;
4670            } else {
4671                last_squeezed = 256;
4672            }
4673            unsafe {
4674                *outbuf.get_unchecked_mut(out_pos) = b;
4675            }
4676            out_pos += 1;
4677        }
4678        return writer.write_all(&outbuf[..out_pos]);
4679    }
4680
4681    // OOM-safe chunked delete+squeeze for files > SINGLE_ALLOC_LIMIT.
4682    const CHUNK: usize = 8 * 1024 * 1024;
4683    let mut outbuf = alloc_uninit_vec(CHUNK);
4684    let mut last_squeezed: u16 = 256;
4685    for chunk in data.chunks(CHUNK) {
4686        let mut out_pos = 0;
4687        for &b in chunk.iter() {
4688            if is_member(&delete_set, b) {
4689                continue;
4690            }
4691            if is_member(&squeeze_set, b) {
4692                if last_squeezed == b as u16 {
4693                    continue;
4694                }
4695                last_squeezed = b as u16;
4696            } else {
4697                last_squeezed = 256;
4698            }
4699            outbuf[out_pos] = b;
4700            out_pos += 1;
4701        }
4702        writer.write_all(&outbuf[..out_pos])?;
4703    }
4704    Ok(())
4705}
4706
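// Illustrative sketch of the combined semantics above: deleted bytes are skipped before
// the squeeze state is consulted, so a deleted digit between two spaces does not break
// the squeezed run (delete-then-squeeze order). Module name and data are made up.
#[cfg(test)]
mod delete_squeeze_mmap_sketch {
    use super::*;

    #[test]
    fn digits_deleted_then_spaces_squeezed() {
        let mut out = Vec::new();
        delete_squeeze_mmap(b"0123456789", b" ", b"a1  b2 3 c", &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }
}
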
4707/// Squeeze from mmap'd byte slice.
4708/// Single-threaded — squeeze is inherently sequential (boundary state).
4709pub fn squeeze_mmap(squeeze_chars: &[u8], data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4710    if squeeze_chars.len() == 1 {
4711        return squeeze_single_mmap(squeeze_chars[0], data, writer);
4712    }
4713    if squeeze_chars.len() == 2 {
4714        return squeeze_multi_mmap::<2>(squeeze_chars, data, writer);
4715    }
4716    if squeeze_chars.len() == 3 {
4717        return squeeze_multi_mmap::<3>(squeeze_chars, data, writer);
4718    }
4719
4720    let member = build_member_set(squeeze_chars);
4721
4722    if data.len() <= SINGLE_ALLOC_LIMIT {
4723        // Single-allocation squeeze: full-size buffer, single write_all.
4724        let mut outbuf = alloc_uninit_vec(data.len());
4725        let len = data.len();
4726        let mut wp = 0;
4727        let mut i = 0;
4728        let mut last_squeezed: u16 = 256;
4729
4730        unsafe {
4731            let inp = data.as_ptr();
4732            let outp = outbuf.as_mut_ptr();
4733
4734            while i < len {
4735                let b = *inp.add(i);
4736                if is_member(&member, b) {
4737                    if last_squeezed != b as u16 {
4738                        *outp.add(wp) = b;
4739                        wp += 1;
4740                        last_squeezed = b as u16;
4741                    }
4742                    i += 1;
4743                    while i < len && *inp.add(i) == b {
4744                        i += 1;
4745                    }
4746                } else {
4747                    last_squeezed = 256;
4748                    *outp.add(wp) = b;
4749                    wp += 1;
4750                    i += 1;
4751                }
4752            }
4753        }
4754        return writer.write_all(&outbuf[..wp]);
4755    }
4756
4757    // OOM-safe chunked squeeze for files > SINGLE_ALLOC_LIMIT.
4758    const CHUNK: usize = 8 * 1024 * 1024;
4759    let mut outbuf = alloc_uninit_vec(CHUNK);
4760    let mut last_squeezed: u16 = 256;
4761    for chunk in data.chunks(CHUNK) {
4762        let mut wp = 0;
4763        for &b in chunk.iter() {
4764            if is_member(&member, b) {
4765                if last_squeezed != b as u16 {
4766                    outbuf[wp] = b;
4767                    wp += 1;
4768                    last_squeezed = b as u16;
4769                }
4770            } else {
4771                last_squeezed = 256;
4772                outbuf[wp] = b;
4773                wp += 1;
4774            }
4775        }
4776        writer.write_all(&outbuf[..wp])?;
4777    }
4778    Ok(())
4779}
4780
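// Illustrative usage sketch for the general bitset squeeze path above; four squeeze
// characters are used so the memchr-specialised 1-3 character paths are bypassed.
// Module name and data are made up.
#[cfg(test)]
mod squeeze_mmap_bitset_sketch {
    use super::*;

    #[test]
    fn runs_of_any_member_collapse_to_one_byte() {
        let mut out = Vec::new();
        squeeze_mmap(b"abcd", b"aabbxccdd", &mut out).unwrap();
        assert_eq!(out, b"abxcd");
    }
}
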
4781fn squeeze_multi_mmap<const N: usize>(
4782    chars: &[u8],
4783    data: &[u8],
4784    writer: &mut impl Write,
4785) -> io::Result<()> {
4786    // Zero-copy writev: build IoSlice entries pointing directly into
4787    // the original mmap'd data, keeping one byte per run of squeezable chars.
4788    // Each gap between runs gets one IoSlice, and the surviving first byte of
4789    // each run gets its own one-byte IoSlice; no data is copied.
4792    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(1024);
4793    let mut cursor = 0;
4794    let mut last_squeezed: u16 = 256;
4795
4796    macro_rules! find_next {
4797        ($data:expr) => {
4798            if N == 2 {
4799                memchr::memchr2(chars[0], chars[1], $data)
4800            } else {
4801                memchr::memchr3(chars[0], chars[1], chars[2], $data)
4802            }
4803        };
4804    }
4805
4806    while cursor < data.len() {
4807        match find_next!(&data[cursor..]) {
4808            Some(offset) => {
4809                let pos = cursor + offset;
4810                let b = data[pos];
4811                // Emit gap before squeeze point
4812                if pos > cursor {
4813                    iov.push(std::io::IoSlice::new(&data[cursor..pos]));
4814                    last_squeezed = 256;
4815                }
4816                // Emit single byte if not duplicate
4817                if last_squeezed != b as u16 {
4818                    // Point at the byte in the original data (zero-copy)
4819                    iov.push(std::io::IoSlice::new(&data[pos..pos + 1]));
4820                    last_squeezed = b as u16;
4821                }
4822                // Skip the run of same byte
4823                let mut skip = pos + 1;
4824                while skip < data.len() && data[skip] == b {
4825                    skip += 1;
4826                }
4827                cursor = skip;
4828                // Flush when approaching MAX_IOV
4829                if iov.len() >= MAX_IOV {
4830                    write_ioslices(writer, &iov)?;
4831                    iov.clear();
4832                }
4833            }
4834            None => {
4835                if cursor < data.len() {
4836                    iov.push(std::io::IoSlice::new(&data[cursor..]));
4837                }
4838                break;
4839            }
4840        }
4841    }
4842    if !iov.is_empty() {
4843        write_ioslices(writer, &iov)?;
4844    }
4845    Ok(())
4846}
4847
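// Illustrative usage sketch for the two-character zero-copy squeeze above: each run of
// a squeezable byte survives as a one-byte IoSlice into the source data.
// Module name and data are made up.
#[cfg(test)]
mod squeeze_multi_mmap_sketch {
    use super::*;

    #[test]
    fn two_character_runs_are_squeezed() {
        let mut out = Vec::new();
        squeeze_multi_mmap::<2>(b"ab", b"xaaybbz", &mut out).unwrap();
        assert_eq!(out, b"xaybz");
    }
}
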
4848fn squeeze_single_mmap(ch: u8, data: &[u8], writer: &mut impl Write) -> io::Result<()> {
4849    if data.is_empty() {
4850        return Ok(());
4851    }
4852
4853    // Quick check: no consecutive pairs means no squeezing needed
4854    let pair = [ch, ch];
4855    if memchr::memmem::find(data, &pair).is_none() {
4856        return writer.write_all(data);
4857    }
4858
4859    // Zero-copy writev approach: build IoSlice entries pointing directly into
4860    // the original mmap'd data, skipping duplicate bytes in runs.
4861    // For `tr -s ' '` on 10MB with ~5K squeeze points:
4862    //   - ~5K IoSlice entries (each gap and its kept byte share one slice)
4863    //   - ~5 writev syscalls (at 1024 entries per batch)
4864    //   - Zero data copy — kernel reads directly from mmap pages
4865    let finder = memchr::memmem::Finder::new(&pair);
4866    let mut iov: Vec<std::io::IoSlice> = Vec::with_capacity(2048);
4867    let mut cursor = 0;
4868
4869    while cursor < data.len() {
4870        match finder.find(&data[cursor..]) {
4871            Some(offset) => {
4872                let pair_pos = cursor + offset;
4873                // Include everything up to and including the first byte of the pair
4874                let seg_end = pair_pos + 1;
4875                if seg_end > cursor {
4876                    iov.push(std::io::IoSlice::new(&data[cursor..seg_end]));
4877                }
4878                // Skip all remaining consecutive ch bytes (the run)
4879                let mut skip = seg_end;
4880                while skip < data.len() && data[skip] == ch {
4881                    skip += 1;
4882                }
4883                cursor = skip;
4884                // Flush when approaching MAX_IOV
4885                if iov.len() >= MAX_IOV {
4886                    write_ioslices(writer, &iov)?;
4887                    iov.clear();
4888                }
4889            }
4890            None => {
4891                // No more pairs — emit remainder
4892                if cursor < data.len() {
4893                    iov.push(std::io::IoSlice::new(&data[cursor..]));
4894                }
4895                break;
4896            }
4897        }
4898    }
4899
4900    if !iov.is_empty() {
4901        write_ioslices(writer, &iov)?;
4902    }
4903    Ok(())
4904}
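
// Illustrative usage sketch for the single-character squeeze above: only runs of two or
// more of the target byte are collapsed, and data without any adjacent pair is written
// through untouched. Module name and data are made up.
#[cfg(test)]
mod squeeze_single_mmap_sketch {
    use super::*;

    #[test]
    fn runs_of_spaces_collapse_to_one() {
        let mut out = Vec::new();
        squeeze_single_mmap(b' ', b"a  b   c", &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }

    #[test]
    fn data_without_pairs_is_unchanged() {
        let mut out = Vec::new();
        squeeze_single_mmap(b' ', b"a b c", &mut out).unwrap();
        assert_eq!(out, b"a b c");
    }
}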